[intel] improve pitch and width constexpr folding

januszjah · januszjah · commit 3102d0797ab9 · 2025-11-17T08:31:05.000Z
diff --git a/python/test/unit/language/test_block_pointer.py b/python/test/unit/language/test_block_pointer.py
@@ -69,8 +69,8 @@ def test_block_copy(dtypes_str, n, padding_option, boundary_check, device):
 def matmul_no_scf_with_advance_kernel(  #
         a_ptr, b_ptr, c_ptr,  #
         M, N, K,  #
-        stride_am, stride_ak,  #
-        stride_bk, stride_bn,  #
+        stride_am: tl.constexpr, stride_ak: tl.constexpr,  #
+        stride_bk: tl.constexpr, stride_bn: tl.constexpr,  #
         stride_cm, stride_cn,  #
         BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr  #
 ):
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -16,6 +16,7 @@
 #include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
 #include "intel/include/Utils/Utility.h"
 #include "triton/Tools/LinearLayout.h"
+
 #include <optional>
 #include <triton/Tools/Sys/GetEnv.hpp>
 
@@ -39,6 +40,43 @@ static int __builtin_ctz(unsigned x) {
 
 namespace {
 
+// Looks through a single LLVM trunc/sext/zext/bitcast; otherwise yields `v`.
+static Value skipCasts(Value v) {
+  if (Operation *src = v.getDefiningOp())
+    if (isa<LLVM::TruncOp, LLVM::SExtOp, LLVM::ZExtOp, LLVM::BitcastOp>(src))
+      return src->getOperand(0);
+  return v;
+}
+
+// Folds the op defining `v`: yields the folded value when the fold produces
+// exactly one result that casts to a Value, otherwise hands `v` back as is.
+// NOTE(review): a non-Value fold result is dropped here - confirm intended.
+static Value tryFoldOp(Value v) {
+  Operation *producer = v.getDefiningOp();
+  SmallVector<OpFoldResult> folded;
+  if (!producer || failed(producer->fold(folded)) || folded.size() != 1)
+    return v;
+  Value replacement = dyn_cast_or_null<Value>(folded[0]);
+  return replacement ? replacement : v;
+}
+
+// Tries to resolve `v` to a constant integer. Each round asks
+// getConstantIntValue first; failing that, it looks through one cast
+// (skipCasts), folds the defining op (tryFoldOp) and retries on the outcome.
+// Yields std::nullopt once `depth` rounds are spent or a round leaves the
+// value unchanged.
+static std::optional<int64_t> tryConstEval(Value v, int depth = 16) {
+  for (int remaining = depth; remaining > 0; --remaining) {
+    if (std::optional<int64_t> known = getConstantIntValue(v))
+      return known;
+    Value next = tryFoldOp(skipCasts(v));
+    if (next == v)
+      return std::nullopt;
+    v = next;
+  }
+  return std::nullopt;
+}
+
 Value maybeAnd(RewriterBase &rewriter, Location loc, Value a, Value b) {
   auto tb = TritonLLVMOpBuilder(loc, rewriter);
   if (a && b) {
@@ -1031,6 +1069,7 @@ struct LoadOpToBlockIOConversion
   LogicalResult
   rewriteTensorPointerLoad(triton::LoadOp op, OpAdaptor adaptor,
                            ConversionPatternRewriter &rewriter) const {
+
     // FIXME: Remove once IGC can split large 2D block loads.
     std::optional<bool> oneMatrixPerLoadForBT =
         mlir::triton::tools::isEnvValueBool(mlir::triton::tools::getStrEnv(
@@ -1589,24 +1628,21 @@ struct LoadOpToBlockIOConversion
       pitch = b.trunc(i32_ty, colStride);
       std::swap(baseWidth, baseHeight);
     }
+
     // HW requires the pitch to be at least 64 bytes.
-    std::function<Value(Value)> skipTrunc = [&](Value v) {
-      if (dyn_cast_or_null<LLVM::TruncOp>(v.getDefiningOp()))
-        return skipTrunc(v.getDefiningOp()->getOperand(0));
-      return v;
-    };
-    if (Operation *op = skipTrunc(pitch).getDefiningOp()) {
-      std::optional<int64_t> pitchConst =
-          mlir::triton::intel::getFoldedConstantValue(op);
-      if (pitchConst.has_value()) {
-        if ((*pitchConst * elemSizeInBits / 8) < 64)
-          return failure();
-      }
+    if (auto pitchConst = tryConstEval(pitch)) {
+      if ((*pitchConst * elemSizeInBits / 8) < 64)
+        return failure();
     }
 
     baseWidth = b.trunc(i32_ty, baseWidth);
     baseHeight = b.trunc(i32_ty, baseHeight);
 
+    if (auto widthConst = tryConstEval(baseWidth)) {
+      if ((*widthConst * elemSizeInBits / 8) < 64)
+        return failure();
+    }
+
     const unsigned originalElemBits = elemSizeInBits;
     if (isTransposeRequired) {
       // adjust the block io parameter to align HW's limitations on