Add round_to_tf32() lowering support.

arnamoy10 · igcbot · commit 8440909bc75f · 2023-08-24T12:48:33.000+02:00
This patch adds round_to_tf32() lowering support in IGC.
diff --git a/IGC/AdaptorOCL/SPIRV/SPIRVInternal.h b/IGC/AdaptorOCL/SPIRV/SPIRVInternal.h
@@ -443,6 +443,8 @@ _SPIRV_OP(OpSatConvertUToS)
 _SPIRV_OP(OpSatConvertSToU)
 _SPIRV_OP(OpConvertFToBF16INTEL)
 _SPIRV_OP(OpConvertBF16ToFINTEL)
+// Rounding builtins
+_SPIRV_OP(OpRoundFToTF32INTEL)
 // SPV_INTEL_arithmetic_fence
 _SPIRV_OP(OpArithmeticFenceINTEL)
 // Arithmetic Instructions
diff --git a/IGC/AdaptorOCL/SPIRV/libSPIRV/SPIRVInstruction.h b/IGC/AdaptorOCL/SPIRV/libSPIRV/SPIRVInstruction.h
@@ -1057,6 +1057,7 @@ _SPIRV_OP(Any)
 _SPIRV_OP(All)
 _SPIRV_OP(ConvertFToBF16INTEL)
 _SPIRV_OP(ConvertBF16ToFINTEL)
+_SPIRV_OP(RoundFToTF32INTEL)
 _SPIRV_OP(ArithmeticFenceINTEL)
 _SPIRV_OP(BitReverse)
 #undef _SPIRV_OP
diff --git a/IGC/AdaptorOCL/SPIRV/libSPIRV/SPIRVOpCodeEnum.h b/IGC/AdaptorOCL/SPIRV/libSPIRV/SPIRVOpCodeEnum.h
@@ -515,6 +515,7 @@ _SPIRV_OP(TypeTokenINTEL, 6113)
 //_SPIRV_OP(DebugInfoModuleINTEL, 6114)
 _SPIRV_OP(ConvertFToBF16INTEL, 6116)
 _SPIRV_OP(ConvertBF16ToFINTEL, 6117)
+_SPIRV_OP(RoundFToTF32INTEL, 6426)
 // SPV_INTEL_matrix
 //_SPIRV_OP(TypeJointMatrixINTEL_OLD, 6119) Replaced by 6184
 _SPIRV_OP(TypeJointMatrixINTEL, 6184)
diff --git a/IGC/BiFModule/Headers/spirv.h b/IGC/BiFModule/Headers/spirv.h
@@ -3697,6 +3697,13 @@ float4 SPIRV_OVERLOADABLE  SPIRV_BUILTIN(ConvertBF16ToFINTEL, _v4i16, )(short4 x
 float8 SPIRV_OVERLOADABLE  SPIRV_BUILTIN(ConvertBF16ToFINTEL, _v8i16, )(short8 x);
 float16 SPIRV_OVERLOADABLE SPIRV_BUILTIN(ConvertBF16ToFINTEL, _v16i16, )(short16 x);
 
+int SPIRV_OVERLOADABLE   SPIRV_BUILTIN(RoundFToTF32INTEL, _f32, )(float x);
+int2 SPIRV_OVERLOADABLE  SPIRV_BUILTIN(RoundFToTF32INTEL, _v2f32, )(float2 x);
+int3 SPIRV_OVERLOADABLE  SPIRV_BUILTIN(RoundFToTF32INTEL, _v3f32, )(float3 x);
+int4 SPIRV_OVERLOADABLE  SPIRV_BUILTIN(RoundFToTF32INTEL, _v4f32, )(float4 x);
+int8 SPIRV_OVERLOADABLE  SPIRV_BUILTIN(RoundFToTF32INTEL, _v8f32, )(float8 x);
+int16 SPIRV_OVERLOADABLE SPIRV_BUILTIN(RoundFToTF32INTEL, _v16f32, )(float16 x);
+
 #if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
 private void* SPIRV_OVERLOADABLE SPIRV_BUILTIN(GenericCastToPtrExplicit, _p0i8_p4i8_i32, _ToPrivate)(generic char *Pointer, int Storage);
 local   void* SPIRV_OVERLOADABLE SPIRV_BUILTIN(GenericCastToPtrExplicit, _p3i8_p4i8_i32, _ToLocal)(generic char *Pointer, int Storage);
diff --git a/IGC/BiFModule/Implementation/conversions.cl b/IGC/BiFModule/Implementation/conversions.cl
@@ -925,6 +925,36 @@ float16  SPIRV_OVERLOADABLE SPIRV_BUILTIN(ConvertBF16ToFINTEL, _v16i16, )(short1
   return __builtin_IB_bftof_16(Value);
 }
 
+// Scalar RoundFToTF32INTEL overload; defers to __builtin_IB_ftotf32_1.
+int SPIRV_OVERLOADABLE SPIRV_BUILTIN(RoundFToTF32INTEL, _f32, )(float Value) {
+  return __builtin_IB_ftotf32_1(Value);
+}
+
+// 2-element RoundFToTF32INTEL overload; defers to __builtin_IB_ftotf32_2.
+int2 SPIRV_OVERLOADABLE SPIRV_BUILTIN(RoundFToTF32INTEL, _v2f32, )(float2 Value) {
+  return __builtin_IB_ftotf32_2(Value);
+}
+
+// 3-element RoundFToTF32INTEL overload; defers to __builtin_IB_ftotf32_3.
+int3 SPIRV_OVERLOADABLE SPIRV_BUILTIN(RoundFToTF32INTEL, _v3f32, )(float3 Value) {
+  return __builtin_IB_ftotf32_3(Value);
+}
+
+// 4-element RoundFToTF32INTEL overload; defers to __builtin_IB_ftotf32_4.
+int4 SPIRV_OVERLOADABLE SPIRV_BUILTIN(RoundFToTF32INTEL, _v4f32, )(float4 Value) {
+  return __builtin_IB_ftotf32_4(Value);
+}
+
+// 8-element RoundFToTF32INTEL overload; defers to __builtin_IB_ftotf32_8.
+int8 SPIRV_OVERLOADABLE SPIRV_BUILTIN(RoundFToTF32INTEL, _v8f32, )(float8 Value) {
+  return __builtin_IB_ftotf32_8(Value);
+}
+
+// 16-element RoundFToTF32INTEL overload; defers to __builtin_IB_ftotf32_16.
+int16 SPIRV_OVERLOADABLE SPIRV_BUILTIN(RoundFToTF32INTEL, _v16f32, )(float16 Value) {
+  return __builtin_IB_ftotf32_16(Value);
+}
+
 /*
 // Next is all Scalar types with Rounding modes [RTE,RTZ,RTN,RTP] and Sat
 //
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
@@ -20392,7 +20392,18 @@ void EmitPass::emitfcvt(llvm::GenIntrinsicInst* GII)
         }
         else if (id == GenISAIntrinsic::GenISA_ftotf32) {
             tDst = m_currShader->GetNewAlias(dst, ISA_TYPE_UD, 0, 0);
-            tSrc = src;
+            // fcvt does not accept an immediate source of type float, so an
+            // immediate is first copied into a temporary "general" variable,
+            // and that temporary is then used as the fcvt operand in place of
+            // the immediate.
+            if (src->IsImmediate()) {
+                CVariable *tfSrc = m_currShader->GetNewVariable(
+                    1, ISA_TYPE_F, EALIGN_GRF, "tmp_cvt");
+                m_encoder->Copy(tfSrc, src);
+                tSrc = tfSrc;
+            } else {
+                tSrc = src;
+            }
         }
         else {
             IGC_ASSERT_EXIT_MESSAGE(0, "Something wrong in cvt!");
diff --git a/IGC/Compiler/Optimizer/BuiltInFuncImport.cpp b/IGC/Compiler/Optimizer/BuiltInFuncImport.cpp
@@ -535,6 +535,67 @@ void BIImport::fixSPIRFunctionsReturnType(Module& M)
         F->eraseFromParent();
 }
 
+// The built-in definition returns i32 (or a vector of i32), but the call that
+// was added for round_to_tf32() returns float (as the original matrix type was
+// float).  For each such float-returning declaration we: 1) create an
+// integer-returning declaration under the original name so that it matches
+// the builtin definition; 2) redirect every direct call to it and bitcast the
+// result back to the old type so that previous users of the value are happy;
+// 3) retarget any remaining uses and erase the old declaration.
+void fixRoundToTF32ReturnType(Module &M) {
+    // Collect first, so that the declarations created below are never visited.
+    SmallVector<Function *, 8> Worklist;
+    for (auto &F : M) {
+        StringRef FuncName = F.getName();
+        if (F.isDeclaration() && FuncName.contains("OpRoundFToTF32INTEL") &&
+            !FuncName.contains("_old") &&
+            F.getReturnType()->getScalarType()->isFloatTy())
+            Worklist.push_back(&F);
+    }
+
+    for (Function *F : Worklist) {
+        // Keep the shape of the old return type, using i32 instead of float.
+        Type *NewRetTy = Type::getInt32Ty(M.getContext());
+        if (auto *VecTy = dyn_cast<VectorType>(F->getReturnType()))
+            NewRetTy = VectorType::getInteger(VecTy);
+        FunctionType *NewFT = FunctionType::get(
+            NewRetTy, F->getFunctionType()->params(), false);
+
+        // Free the original name for the new declaration; keep the calling conv.
+        const std::string OriginalName = F->getName().str();
+        F->setName(OriginalName + "_old");
+        Function *NewF =
+            Function::Create(NewFT, F->getLinkage(), OriginalName, M);
+        NewF->setCallingConv(F->getCallingConv());
+
+        // Rewrite only calls whose callee is F (not calls that merely pass F as
+        // an argument); the set de-duplicates calls that use F more than once.
+        SmallPtrSet<CallInst *, 16> Calls;
+        for (User *U : F->users())
+            if (auto *CI = dyn_cast<CallInst>(U))
+                if (CI->getCalledFunction() == F)
+                    Calls.insert(CI);
+
+        for (CallInst *CI : Calls) {
+            IRBuilder<> Builder(CI);
+            SmallVector<Value *, 4> Args(CI->arg_begin(), CI->arg_end());
+            CallInst *NewCall = Builder.CreateCall(NewF, Args);
+            NewCall->setCallingConv(CI->getCallingConv());
+            NewCall->setAttributes(CI->getAttributes());
+            // Convert the value back to the type the old call produced.
+            Value *Converted = Builder.CreateBitCast(NewCall, CI->getType());
+            CI->replaceAllUsesWith(Converted);
+            CI->eraseFromParent();
+        }
+
+        // Erasing a value that still has uses is invalid, so point whatever
+        // is left (e.g. an address-taken use) at the new declaration.
+        if (!F->use_empty())
+            F->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F->getType()));
+        F->eraseFromParent();
+    }
+}
+
 // Older Clang versions generate invalid bitcast instructions for explicit
 // C-style casts with specified address space. For example:
 //   %0 = bitcast i8 addrspace(1)* %mem to i32 addrspace(4)*
@@ -594,6 +655,7 @@ bool BIImport::runOnModule(Module& M)
     }
 
     fixSPIRFunctionsReturnType(M);
+    fixRoundToTF32ReturnType(M);
 
     for (auto& F : M)
     {
diff --git a/IGC/Compiler/Optimizer/OpenCLPasses/JointMatrixFuncsResolutionPass.cpp b/IGC/Compiler/Optimizer/OpenCLPasses/JointMatrixFuncsResolutionPass.cpp
@@ -1094,6 +1094,15 @@ Value *JointMatrixFuncsResolutionPass::ResolveFill(CallInst *CI) {
         fillValue = builder.CreateLoad(vectorElementType, fillValue);
     }
 
+    // A TF32 slice has type i32 but the fill value is float, so bitcast it.
+    // NOTE(review): the check below matches any 32-bit float matrix -- confirm.
+    bool isTF32 = (desc.isFloating) && (desc.bitWidth == 32);
+    if (isTF32) {
+        fillValue = builder.CreateBitCast(
+            fillValue, Type::getIntNTy(builder.getContext(),
+                                       getResolvedVectorElemSize(matTy)));
+    }
+
     Value *slice = fillValue;
 
     if (IGCLLVM::FixedVectorType *ty =