[ARM] Introduce intrinsics for MVE fma under strict-fp. (#169771)

davemgreen · web-flow · commit b22825631293 · 2025-11-30T11:12:53.000Z
Similar to #169156, this adds an @arm.mve.fma intrinsic for strict-fp. A Builder class is added to act as the common subclass of IRBuilder and IRInt.
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
@@ -167,7 +167,9 @@ multiclass FMA<bit add> {
   // second multiply input.
   defvar m2_cg = !if(add, (id $m2), (fneg $m2));
 
-  defvar unpred_cg = (IRIntBase<"fma", [Vector]> $m1, m2_cg, $addend);
+  defvar fma = strictFPAlt<IRIntBase<"fma", [Vector]>,
+                           IRInt<"fma", [Vector]>>;
+  defvar unpred_cg = (fma $m1, m2_cg, $addend);
   defvar pred_cg   = (IRInt<"fma_predicated", [Vector, Predicate]>
                           $m1, m2_cg, $addend, $pred);
 
@@ -723,7 +725,7 @@ multiclass compare_with_pred<string condname, dag arguments,
        NameOverride<"vcmp" # condname # "q_m" # suffix>;
 }
 
-multiclass compare<string condname, IRBuilder cmpop> {
+multiclass compare<string condname, Builder cmpop> {
   // Make all four variants of a comparison: the vector/vector and
   // vector/scalar forms, each using compare_with_pred to make a
   // predicated and unpredicated version.
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
@@ -34,7 +34,8 @@ class IRBuilderAddrParam<int index_> : IRBuilderParam<index_>;
 class IRBuilderIntParam<int index_, string type_> : IRBuilderParam<index_> {
   string type = type_;
 }
-class IRBuilderBase {
+class Builder {}
+class IRBuilderBase : Builder {
   // The prefix of the function call, including an open parenthesis.
   string prefix;
 
@@ -166,7 +167,7 @@ def address;
 // Another node class you can use in the codegen dag. This one corresponds to
 // an IR intrinsic function, which has to be specialized to a particular list
 // of types.
-class IRIntBase<string name_, list<Type> params_ = [], bit appendKind_ = 0> {
+class IRIntBase<string name_, list<Type> params_ = [], bit appendKind_ = 0> : Builder {
   string intname = name_;       // base name of the intrinsic
   list<Type> params = params_;  // list of parameter types
 
@@ -214,8 +215,8 @@ def bitsize;
 
 // strictFPAlt allows a node to have different code generation under strict-fp.
 // TODO: The standard node can be IRBuilderBase or IRIntBase.
-class strictFPAlt<IRBuilderBase standard_, IRIntBase strictfp_> {
-  IRBuilderBase standard = standard_;
+class strictFPAlt<Builder standard_, IRIntBase strictfp_> : Builder {
+  Builder standard = standard_;
   IRIntBase strictfp = strictfp_;
 }
 
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/ternary.c b/clang/test/CodeGen/arm-mve-intrinsics/ternary.c
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
@@ -1260,7 +1260,9 @@ Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
     for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
       Args.push_back(getCodeForDagArg(D, i, Scope, Param));
 
-    auto GenIRBuilderBase = [&](const Record *Op) {
+    auto GenIRBuilderBase = [&](const Record *Op) -> Result::Ptr {
+      assert(Op->isSubClassOf("IRBuilderBase") &&
+             "Expected IRBuilderBase in GenIRBuilderBase\n");
       std::set<unsigned> AddressArgs;
       std::map<unsigned, std::string> IntegerArgs;
       for (const Record *sp : Op->getValueAsListOfDefs("special_params")) {
@@ -1274,7 +1276,9 @@ Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
       return std::make_shared<IRBuilderResult>(Op->getValueAsString("prefix"),
                                                Args, AddressArgs, IntegerArgs);
     };
-    auto GenIRIntBase = [&](const Record *Op) {
+    auto GenIRIntBase = [&](const Record *Op) -> Result::Ptr {
+      assert(Op->isSubClassOf("IRIntBase") &&
+             "Expected IRIntBase in GenIRIntBase\n");
       std::vector<const Type *> ParamTypes;
       for (const Record *RParam : Op->getValueAsListOfDefs("params"))
         ParamTypes.push_back(getType(RParam, Param));
@@ -1289,8 +1293,11 @@ Result::Ptr EmitterBase::getCodeForDag(const DagInit *D,
     } else if (Op->isSubClassOf("IRIntBase")) {
       return GenIRIntBase(Op);
     } else if (Op->isSubClassOf("strictFPAlt")) {
-      auto Standard = GenIRBuilderBase(Op->getValueAsDef("standard"));
-      auto StrictFp = GenIRIntBase(Op->getValueAsDef("strictfp"));
+      auto StardardBuilder = Op->getValueAsDef("standard");
+      Result::Ptr Standard = StardardBuilder->isSubClassOf("IRBuilder")
+                                 ? GenIRBuilderBase(StardardBuilder)
+                                 : GenIRIntBase(StardardBuilder);
+      Result::Ptr StrictFp = GenIRIntBase(Op->getValueAsDef("strictfp"));
       return std::make_shared<StrictFpAltResult>(Standard, StrictFp);
     } else {
       PrintFatalError("Unsupported dag node " + Op->getName());
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1362,6 +1362,9 @@ def int_arm_mve_vqmovn_predicated: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
     llvm_i32_ty /* unsigned output */, llvm_i32_ty /* unsigned input */,
     llvm_i32_ty /* top half */, llvm_anyvector_ty /* pred */], [IntrNoMem]>;
 
+def int_arm_mve_fma: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */,
+    LLVMMatchType<0> /* addend */], [IntrNoMem]>;
 // fma_predicated returns the add operand for disabled lanes.
 def int_arm_mve_fma_predicated: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
    [LLVMMatchType<0> /* mult op #1 */, LLVMMatchType<0> /* mult op #2 */,
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -3723,6 +3723,10 @@ multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> {
     if fms then {
       def : Pat<(VTI.Vec (fma (fneg m1), m2, add)),
                 (Inst $add, $m1, $m2)>;
+      def : Pat<(VTI.Vec (int_arm_mve_fma (fneg m1), m2, add)),
+                (Inst $add, $m1, $m2)>;
+      def : Pat<(VTI.Vec (int_arm_mve_fma m1, (fneg m2), add)),
+                (Inst $add, $m1, $m2)>;
       def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
                                   (VTI.Vec (fma (fneg m1), m2, add)),
                                   add)),
@@ -3734,6 +3738,8 @@ multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> {
     } else {
       def : Pat<(VTI.Vec (fma m1, m2, add)),
                 (Inst $add, $m1, $m2)>;
+      def : Pat<(VTI.Vec (int_arm_mve_fma m1, m2, add)),
+                (Inst $add, $m1, $m2)>;
       def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
                                   (VTI.Vec (fma m1, m2, add)),
                                   add)),
@@ -5672,6 +5678,8 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
     if scalar_addend then {
       def : Pat<(VTI.Vec (fma v1, v2, vs)),
                 (VTI.Vec (Inst v1, v2, is))>;
+      def : Pat<(VTI.Vec (int_arm_mve_fma v1, v2, vs)),
+                (VTI.Vec (Inst v1, v2, is))>;
       def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
                                   (VTI.Vec (fma v1, v2, vs)),
                                   v1)),
@@ -5681,6 +5689,10 @@ multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
                 (VTI.Vec (Inst v2, v1, is))>;
       def : Pat<(VTI.Vec (fma vs, v1, v2)),
                 (VTI.Vec (Inst v2, v1, is))>;
+      def : Pat<(VTI.Vec (int_arm_mve_fma v1, vs, v2)),
+                (VTI.Vec (Inst v2, v1, is))>;
+      def : Pat<(VTI.Vec (int_arm_mve_fma vs, v1, v2)),
+                (VTI.Vec (Inst v2, v1, is))>;
       def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
                                   (VTI.Vec (fma vs, v2, v1)),
                                   v1)),
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/strict-intrinsics.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/strict-intrinsics.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -o - %s | FileCheck %s
 
-define arm_aapcs_vfpcc <8 x half> @test_vaddq_f16(<8 x half> %a, <8 x half> %b) {
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_f16(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-LABEL: test_vaddq_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vadd.f16 q0, q0, q1
@@ -11,7 +11,7 @@ entry:
   ret <8 x half> %0
 }
 
-define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) {
+define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: test_vaddq_f32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vadd.f32 q0, q0, q1
@@ -21,7 +21,7 @@ entry:
   ret <4 x float> %0
 }
 
-define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16(<8 x half> %a, <8 x half> %b) {
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-LABEL: test_vsubq_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vsub.f16 q0, q0, q1
@@ -31,7 +31,7 @@ entry:
   ret <8 x half> %0
 }
 
-define arm_aapcs_vfpcc <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) {
+define arm_aapcs_vfpcc <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: test_vsubq_f32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vsub.f32 q0, q0, q1
@@ -41,7 +41,7 @@ entry:
   ret <4 x float> %0
 }
 
-define arm_aapcs_vfpcc <8 x half> @test_vmulq_f16(<8 x half> %a, <8 x half> %b) {
+define arm_aapcs_vfpcc <8 x half> @test_vmulq_f16(<8 x half> %a, <8 x half> %b) #0 {
 ; CHECK-LABEL: test_vmulq_f16:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.f16 q0, q0, q1
@@ -51,7 +51,7 @@ entry:
   ret <8 x half> %0
 }
 
-define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) {
+define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-LABEL: test_vmulq_f32:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmul.f32 q0, q0, q1
@@ -64,7 +64,7 @@ entry:
 
 
 
-define arm_aapcs_vfpcc <8 x half> @test_vaddq_f16_splat(<8 x half> %a, half %b) {
+define arm_aapcs_vfpcc <8 x half> @test_vaddq_f16_splat(<8 x half> %a, half %b) #0 {
 ; CHECK-LABEL: test_vaddq_f16_splat:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f16 r0, s4
@@ -77,7 +77,7 @@ entry:
   ret <8 x half> %0
 }
 
-define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32_splat(<4 x float> %a, float %b) {
+define arm_aapcs_vfpcc <4 x float> @test_vaddq_f32_splat(<4 x float> %a, float %b) #0 {
 ; CHECK-LABEL: test_vaddq_f32_splat:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s4
@@ -90,7 +90,7 @@ entry:
   ret <4 x float> %0
 }
 
-define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16_splat(<8 x half> %a, half %b) {
+define arm_aapcs_vfpcc <8 x half> @test_vsubq_f16_splat(<8 x half> %a, half %b) #0 {
 ; CHECK-LABEL: test_vsubq_f16_splat:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f16 r0, s4
@@ -103,7 +103,7 @@ entry:
   ret <8 x half> %0
 }
 
-define arm_aapcs_vfpcc <4 x float> @test_vsubq_f32_splat(<4 x float> %a, float %b) {
+define arm_aapcs_vfpcc <4 x float> @test_vsubq_f32_splat(<4 x float> %a, float %b) #0 {
 ; CHECK-LABEL: test_vsubq_f32_splat:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s4
@@ -116,7 +116,7 @@ entry:
   ret <4 x float> %0
 }
 
-define arm_aapcs_vfpcc <8 x half> @test_vmulq_f16_splat(<8 x half> %a, half %b) {
+define arm_aapcs_vfpcc <8 x half> @test_vmulq_f16_splat(<8 x half> %a, half %b) #0 {
 ; CHECK-LABEL: test_vmulq_f16_splat:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov.f16 r0, s4
@@ -129,7 +129,7 @@ entry:
   ret <8 x half> %0
 }
 
-define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32_splat(<4 x float> %a, float %b) {
+define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32_splat(<4 x float> %a, float %b) #0 {
 ; CHECK-LABEL: test_vmulq_f32_splat:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vmov r0, s4
@@ -141,3 +141,103 @@ entry:
   %0 = tail call <4 x float> @llvm.arm.mve.vmul.v4f32(<4 x float> %a, <4 x float> %s)
   ret <4 x float> %0
 }
+
+define arm_aapcs_vfpcc <4 x float> @fma_v4f32(<4 x float> %dst, <4 x float> %s1, <4 x float> %s2) #0 {
+; CHECK-LABEL: fma_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vfma.f32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <4 x float> @llvm.arm.mve.fma.v4f32(<4 x float> %s1, <4 x float> %s2, <4 x float> %dst)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @fma_v8f16(<8 x half> %dst, <8 x half> %s1, <8 x half> %s2) #0 {
+; CHECK-LABEL: fma_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vfma.f16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call <8 x half> @llvm.arm.mve.fma.v8f16(<8 x half> %s1, <8 x half> %s2, <8 x half> %dst)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @fma_n_v8f16(<4 x float> %s1, <4 x float> %s2, float %s3) #0 {
+; CHECK-LABEL: fma_n_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vfma.f32 q0, q1, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %i = insertelement <4 x float> poison, float %s3, i32 0
+  %sp = shufflevector <4 x float> %i, <4 x float> poison, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.arm.mve.fma.v4f32(<4 x float> %s2, <4 x float> %sp, <4 x float> %s1)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @fma_n_v4f32(<8 x half> %s1, <8 x half> %s2, half %s3) #0 {
+; CHECK-LABEL: fma_n_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f16 r0, s8
+; CHECK-NEXT:    vfma.f16 q0, q1, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %i = insertelement <8 x half> poison, half %s3, i32 0
+  %sp = shufflevector <8 x half> %i, <8 x half> poison, <8 x i32> zeroinitializer
+  %0 = tail call <8 x half> @llvm.arm.mve.fma.v8f16(<8 x half> %s2, <8 x half> %sp, <8 x half> %s1)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @fms_v4f32(<4 x float> %dst, <4 x float> %s1, <4 x float> %s2) #0 {
+; CHECK-LABEL: fms_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vfms.f32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %c = fneg <4 x float> %s1
+  %0 = tail call <4 x float> @llvm.arm.mve.fma.v4f32(<4 x float> %c, <4 x float> %s2, <4 x float> %dst)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @fms_v8f16(<8 x half> %dst, <8 x half> %s1, <8 x half> %s2) #0 {
+; CHECK-LABEL: fms_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vfms.f16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %c = fneg <8 x half> %s1
+  %0 = tail call <8 x half> @llvm.arm.mve.fma.v8f16(<8 x half> %c, <8 x half> %s2, <8 x half> %dst)
+  ret <8 x half> %0
+}
+
+define arm_aapcs_vfpcc <4 x float> @fms_n_v8f16(<4 x float> %s1, <4 x float> %s2, float %s3) #0 {
+; CHECK-LABEL: fms_n_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vdup.32 q2, r0
+; CHECK-NEXT:    vfms.f32 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %c = fneg <4 x float> %s2
+  %i = insertelement <4 x float> poison, float %s3, i32 0
+  %sp = shufflevector <4 x float> %i, <4 x float> poison, <4 x i32> zeroinitializer
+  %0 = tail call <4 x float> @llvm.arm.mve.fma.v4f32(<4 x float> %c, <4 x float> %sp, <4 x float> %s1)
+  ret <4 x float> %0
+}
+
+define arm_aapcs_vfpcc <8 x half> @fms_n_v4f32(<8 x half> %s1, <8 x half> %s2, half %s3) #0 {
+; CHECK-LABEL: fms_n_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f16 r0, s8
+; CHECK-NEXT:    vdup.16 q2, r0
+; CHECK-NEXT:    vfms.f16 q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %c = fneg <8 x half> %s2
+  %i = insertelement <8 x half> poison, half %s3, i32 0
+  %sp = shufflevector <8 x half> %i, <8 x half> poison, <8 x i32> zeroinitializer
+  %0 = tail call <8 x half> @llvm.arm.mve.fma.v8f16(<8 x half> %c, <8 x half> %sp, <8 x half> %s1)
+  ret <8 x half> %0
+}
+
+attributes #0 = { strictfp }