Skip to content

Commit f04df24

Browse files
authored
[AMD][NFC] Refactor AccelerateAMDMatmul operand legalization (intel#4136)
- Choose the proper configuration of operands according to the number of conversions, if possible; - Get rid of complicated logic to find the operand config; - Remove helper `supportWMMA()` to get rid of implicit logical dependencies with AccelerateAMDMatmul Signed-off-by: Ilya Veselov <iveselov.nn@gmail.com>
1 parent a06add0 commit f04df24

File tree

2 files changed

+118
-99
lines changed

2 files changed

+118
-99
lines changed

lib/Analysis/Utility.cpp

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -482,58 +482,6 @@ bool supportMFMA(triton::DotOp op) {
482482
return true;
483483
}
484484

485-
static bool supportWMMAGranularity(int m, int n, int k) {
486-
return m % 16 == 0 && n % 16 == 0 && k % 16 == 0;
487-
}
488-
489-
static bool supportWMMATypes(Type a, Type b, Type c, Type d) {
490-
if (a != b || c != d)
491-
return false;
492-
auto aWidth = a.getIntOrFloatBitWidth();
493-
auto cWidth = c.getIntOrFloatBitWidth();
494-
if (a.isIntOrIndex()) {
495-
if (!c.isIntOrIndex())
496-
return false;
497-
bool aValid = aWidth <= 8;
498-
bool cValid = cWidth <= 32;
499-
return aValid && cValid;
500-
} else if (isa<FloatType>(a) && isa<FloatType>(c)) {
501-
if (a.isBF16())
502-
return c.isBF16() || c.isF32();
503-
if (a.isF16())
504-
return c.isF16() || c.isF32();
505-
return aWidth <= cWidth && aWidth <= 16;
506-
}
507-
return false;
508-
}
509-
510-
bool supportWMMA(triton::DotOp op) {
511-
auto aTy = cast<RankedTensorType>(op.getA().getType());
512-
auto bTy = cast<RankedTensorType>(op.getB().getType());
513-
auto cTy = cast<RankedTensorType>(op.getC().getType());
514-
auto dTy = cast<RankedTensorType>(op.getResult().getType());
515-
516-
auto aElemTy = aTy.getElementType();
517-
auto bElemTy = bTy.getElementType();
518-
auto cElemTy = cTy.getElementType();
519-
auto dElemTy = dTy.getElementType();
520-
521-
if (!supportWMMATypes(aElemTy, bElemTy, cElemTy, dElemTy))
522-
return false;
523-
524-
auto aShape = aTy.getShape();
525-
auto bShape = bTy.getShape();
526-
527-
auto rank = aShape.size();
528-
assert(bShape.size() == rank);
529-
assert(aShape[rank - 1] == bShape[rank - 2]);
530-
if (!supportWMMAGranularity(aShape[rank - 2], bShape[rank - 1],
531-
aShape[rank - 1]))
532-
return false;
533-
534-
return true;
535-
}
536-
537485
bool supportMMA(triton::DotOp op, int version) {
538486
// Refer to mma section for the data type supported by Volta and Hopper
539487
// Tensor Core in

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 118 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,87 @@ warpsPerTileWMMA(tt::DotOp dotOp, const ArrayRef<int64_t> shape, int numWarps) {
112112
AMDWmmaEncodingAttr::getMNKDimPerWMMAInstr()[1]});
113113
}
114114

115+
using OperandTypesVector = SmallVector<Type, 4>;
116+
OperandTypesVector
117+
selectMatrixCoreOperandTypes(tt::DotOp dot,
118+
ArrayRef<OperandTypesVector> applicableTypes) {
119+
SmallVector<Value> dotOperands = {dot.getA(), dot.getB(), dot.getC(),
120+
dot.getD()};
121+
OperandTypesVector initElemTypes;
122+
llvm::transform(dotOperands, std::back_inserter(initElemTypes), [](Value v) {
123+
return cast<RankedTensorType>(v.getType()).getElementType();
124+
});
125+
126+
// Use simple costmodel to define optimal set of the dot operands.
127+
// Most expensive - accuracy loss conversions:
128+
// - any larger type -> any smaller type;
129+
// - float -> int;
130+
// - int -> float (not supported for now);
131+
// - signed int -> unsigned int;
132+
// - unsigned int -> signed int with same or less size.
133+
// They are never performed, better to use FMA.
134+
// Supported conversion for now costs `1`, no conversion costs `0`.
135+
// The model could be improved in the future. For example taken into account
136+
// chain dot could be detected and result conversion score is decreased.
137+
int maxConvertCost =
138+
std::numeric_limits<int32_t>::max() / applicableTypes.front().size();
139+
auto calcConvertCost = [&](Type fromTy, Type toTy) -> int32_t {
140+
if (fromTy == toTy)
141+
return 0;
142+
143+
// Skip conversion between int and float. Int16/int32 cases are lowered to
144+
// FMA.
145+
if (fromTy.isIntOrIndex() != toTy.isIntOrIndex())
146+
return maxConvertCost;
147+
148+
if (fromTy.isIntOrIndex() && toTy.isIntOrIndex() &&
149+
fromTy.isUnsignedInteger() != toTy.isUnsignedInteger())
150+
return fromTy.isUnsignedInteger() && fromTy.getIntOrFloatBitWidth() <
151+
toTy.getIntOrFloatBitWidth()
152+
? 1
153+
: maxConvertCost;
154+
155+
return fromTy.getIntOrFloatBitWidth() <= toTy.getIntOrFloatBitWidth()
156+
? 1
157+
: maxConvertCost;
158+
};
159+
auto minCost = maxConvertCost;
160+
auto optTypes = OperandTypesVector();
161+
for (auto types : applicableTypes) {
162+
assert(types.size() == initElemTypes.size());
163+
int accumulatedConvertCost = 0;
164+
for (int i = 0; i < initElemTypes.size(); ++i) {
165+
accumulatedConvertCost += calcConvertCost(initElemTypes[i], types[i]);
166+
}
167+
if (accumulatedConvertCost < minCost) {
168+
minCost = accumulatedConvertCost;
169+
optTypes = types;
170+
}
171+
}
172+
return optTypes;
173+
}
174+
175+
OperandTypesVector getOperandTypesForWmmaOp(mlir::PatternRewriter &rewriter,
176+
tt::DotOp dot) {
177+
Type f16 = rewriter.getF16Type();
178+
Type f32 = rewriter.getF32Type();
179+
Type bf16 = rewriter.getBF16Type();
180+
Type i8 = rewriter.getIntegerType(8);
181+
Type i32 = rewriter.getIntegerType(32);
182+
SmallVector<OperandTypesVector> applicableTypes = {
183+
// clang-format off
184+
{f16, f16, f32, f32},
185+
{f16, f16, f16, f16},
186+
{bf16, bf16, f32, f32},
187+
{bf16, bf16, bf16, bf16},
188+
{i8, i8, i32, i32},
189+
// i4, i4, i32, i32 - is supported configuration
190+
// by WMMA instruction, but not supported by triton
191+
// clang-format on
192+
};
193+
return selectMatrixCoreOperandTypes(dot, applicableTypes);
194+
}
195+
115196
/**
116197
* @brief Convert layout and cast element type of a given tensor
117198
*
@@ -520,81 +601,71 @@ class BlockedToWMMA : public mlir::RewritePattern {
520601
mlir::LogicalResult
521602
matchAndRewrite(mlir::Operation *op,
522603
mlir::PatternRewriter &rewriter) const override {
604+
auto ctx = op->getContext();
523605
auto dotOp = cast<tt::DotOp>(op);
524606

607+
Value a = dotOp.getA();
608+
Value b = dotOp.getB();
609+
525610
auto oldRetType = cast<RankedTensorType>(dotOp.getResult().getType());
526-
if (!oldRetType.getEncoding() ||
527-
!isa<ttg::BlockedEncodingAttr>(oldRetType.getEncoding()))
611+
auto oldRetEncoding = oldRetType.getEncoding();
612+
if (!oldRetEncoding || !isa<ttg::BlockedEncodingAttr>(oldRetEncoding))
613+
return failure();
614+
615+
auto oldAType = cast<RankedTensorType>(a.getType());
616+
auto oldBType = cast<RankedTensorType>(b.getType());
617+
auto retShape = oldRetType.getShape();
618+
auto aShape = oldAType.getShape();
619+
auto bShape = oldBType.getShape();
620+
621+
// check shape
622+
auto mnkDim = AMDWmmaEncodingAttr::getMNKDimPerWMMAInstr();
623+
auto rank = aShape.size();
624+
if (aShape[rank - 2] % mnkDim[0] != 0 || // m
625+
bShape[rank - 1] % mnkDim[1] != 0 || // n
626+
aShape[rank - 1] % mnkDim[2] != 0) // k
528627
return failure();
529628

530-
if (!supportWMMA(dotOp))
629+
// get operand types
630+
auto operandTypes = getOperandTypesForWmmaOp(rewriter, dotOp);
631+
if (operandTypes.empty())
531632
return failure();
532633

533634
// get WMMA encoding for the given number of warps
534-
auto retShape = oldRetType.getShape();
535635
auto mod = op->getParentOfType<mlir::ModuleOp>();
536636
int numWarps = ttg::TritonGPUDialect::getNumWarps(mod);
537637

538-
// operands
539-
Value a = dotOp.getA();
540-
Value b = dotOp.getB();
541-
auto oldAType = cast<RankedTensorType>(a.getType());
542-
auto oldBType = cast<RankedTensorType>(b.getType());
543-
auto ctx = oldAType.getContext();
544-
545638
AMDWmmaEncodingAttr wmmaEnc;
546639

547-
auto mnkDim = AMDWmmaEncodingAttr::getMNKDimPerWMMAInstr();
548640
auto warpsPerTile = warpsPerTileWMMA(dotOp, retShape, numWarps);
549-
// Not supported yet
550-
// if (retShape[0] < warpsPerTile[0] * mnkDim[0] || retShape[1] <
551-
// warpsPerTile[1] * mnkDim[1])
552-
// return failure();
553-
auto CTALayout = ttg::getCTALayout(oldRetType.getEncoding());
554-
wmmaEnc = AMDWmmaEncodingAttr::get(oldRetType.getContext(), warpsPerTile,
555-
CTALayout);
556-
557-
Type wmmaAccType;
558-
auto oldRetElemType = oldRetType.getElementType();
559-
auto aElemType = oldAType.getElementType();
560-
auto bElemType = oldBType.getElementType();
561-
if (oldRetElemType.isIntOrIndex()) {
562-
wmmaAccType = rewriter.getIntegerType(32);
563-
} else if (isa<mlir::Float16Type, mlir::BFloat16Type>(oldRetElemType) &&
564-
aElemType == oldRetElemType) {
565-
wmmaAccType = oldRetElemType;
566-
} else if (isa<mlir::FloatType>(oldRetElemType) &&
567-
aElemType.getIntOrFloatBitWidth() < 16) {
568-
aElemType = rewriter.getF16Type();
569-
bElemType = rewriter.getF16Type();
570-
wmmaAccType = rewriter.getF16Type();
571-
} else {
572-
wmmaAccType = rewriter.getF32Type();
573-
}
574641

575-
auto newRetType = RankedTensorType::get(retShape, wmmaAccType, wmmaEnc);
642+
auto CTALayout = ttg::getCTALayout(oldRetEncoding);
643+
wmmaEnc = AMDWmmaEncodingAttr::get(ctx, warpsPerTile, CTALayout);
644+
645+
auto newRetType = RankedTensorType::get(retShape, operandTypes[3], wmmaEnc);
576646

577647
// convert accumulator
578648
auto oldAcc = dotOp.getOperand(2);
579-
auto newAcc = convertAndCastTensor(rewriter, oldAcc, wmmaEnc, wmmaAccType);
649+
auto newAcc =
650+
convertAndCastTensor(rewriter, oldAcc, wmmaEnc, operandTypes[2]);
580651

581652
auto newAType = RankedTensorType::get(
582-
oldAType.getShape(), aElemType,
653+
aShape, operandTypes[0],
583654
ttg::DotOperandEncodingAttr::get(ctx, 0, wmmaEnc, mnkDim[2]));
584655
auto newBType = RankedTensorType::get(
585-
oldBType.getShape(), bElemType,
656+
bShape, operandTypes[1],
586657
ttg::DotOperandEncodingAttr::get(ctx, 1, wmmaEnc, mnkDim[2]));
587658

588-
Value castedA =
589-
convertAndCastTensor(rewriter, a, newAType.getEncoding(), aElemType);
590-
Value castedB =
591-
convertAndCastTensor(rewriter, b, newBType.getEncoding(), bElemType);
659+
Value castedA = convertAndCastTensor(rewriter, a, newAType.getEncoding(),
660+
operandTypes[0]);
661+
Value castedB = convertAndCastTensor(rewriter, b, newBType.getEncoding(),
662+
operandTypes[1]);
592663
auto newDot = rewriter.create<tt::DotOp>(
593664
dotOp.getLoc(), newRetType, castedA, castedB, newAcc,
594665
dotOp.getInputPrecision(), dotOp.getMaxNumImpreciseAcc());
595666

596-
Value dotOutput = convertAndCastTensor(
597-
rewriter, newDot, oldRetType.getEncoding(), oldRetElemType);
667+
Value dotOutput = convertAndCastTensor(rewriter, newDot, oldRetEncoding,
668+
oldRetType.getElementType());
598669
rewriter.replaceOp(op, dotOutput);
599670
return success();
600671
}

0 commit comments

Comments
 (0)