
Commit 03e391f

Merge OpenAI Triton commit 6f0ae97 (#3863)
This PR changes the Triton base from 711caa4 to 6f0ae97 (Apr 7). Pass rate: 90.91% -> 90.79%. Please do not squash and merge this PR.
2 parents 87b1723 + 4d8979f commit 03e391f

12 files changed: +492 -28 lines


include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 21 additions & 0 deletions
@@ -473,6 +473,27 @@ def TTNG_TMEMAllocOp : TTNG_Op<"tmem_alloc", [DeclareOpInterfaceMethods<MemoryEf
   let hasVerifier = 1;
 }
 
+def TTNG_TMEMSubSliceOp : TTNG_Op<"tmem_subslice", [Pure]> {
+  let summary = "Take a subslice of a tensor memory allocation";
+  let description = [{
+    This operation takes a subslice of a tensor memory allocation and returns a new descriptor
+    containing the address and a view of the subslice.
+    It is similar to ttg.memdesc_subview, except that the offset must be static and we can only
+    slice along the inner dimension of a 2D memdesc, as this is the only slice TMem supports.
+  }];
+  let arguments = (ins TTG_MemDescType:$src, I32Attr:$N);
+
+  let assemblyFormat = [{
+    $src attr-dict `:` qualified(type($src)) `->` qualified(type($result))
+  }];
+
+  let builders = [
+    OpBuilder<(ins "Value":$alloc, "int":$offset, "int":$size)>,
+  ];
+  let results = (outs TTG_MemDescType:$result);
+  let hasVerifier = 1;
+}
+
 def TTNG_TMEMCopyOp : TTNG_Op<"tmem_copy", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>]> {
   let summary = "Initiate an asynchronous copy operation from shared memory to the Tensor Memory.";

include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h

Lines changed: 2 additions & 0 deletions
@@ -60,6 +60,8 @@ std::unique_ptr<Pass> createTritonNvidiaGPUPromoteLHSToTMemPass();
 
 std::unique_ptr<Pass> createTritonNvidiaGPUOptimizeDescriptorEncodingPass();
 
+std::unique_ptr<Pass> createTritonNvidiaGPUOptimizeTMemSubtilingPass();
+
 /// Generate the code for registering passes.
 #define GEN_PASS_REGISTRATION
 #define GEN_PASS_DECL_TRITONNVIDIAGPULEGALIZETMALAYOUTS

include/triton/Dialect/TritonNvidiaGPU/Transforms/Passes.td

Lines changed: 12 additions & 0 deletions
@@ -130,4 +130,16 @@ def TritonNvidiaGPUOptimizeDescriptorEncodingPass : Pass<"triton-nvidia-optimize
                            "mlir::triton::TritonDialect"];
 }
 
+def TritonNvidiaGPUOptimizeTMemSubtilingPass : Pass<"triton-nvidia-optimize-tmem-subtiling", "mlir::ModuleOp"> {
+  let summary = "Optimize subtiling.";
+
+  let description = [{
+    Optimize subtiling by trying to split tmem_load when the user splits the loaded tensor.
+  }];
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
+                           "mlir::triton::TritonDialect"];
+}
+
 #endif

lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp

Lines changed: 40 additions & 0 deletions
@@ -485,6 +485,46 @@ void TMEMCopyOp::getEffects(
                        mlir::triton::gpu::SharedMemory::get());
 }
 
+// -- TMEMSubSliceOp --
+LogicalResult TMEMSubSliceOp::verify() {
+  auto srcTy = cast<triton::gpu::MemDescType>(getSrc().getType());
+  auto encoding = dyn_cast<triton::nvidia_gpu::TensorMemoryEncodingAttr>(
+      srcTy.getEncoding());
+  if (!encoding)
+    return emitOpError("The source must be a tensor memory buffer.");
+  if (encoding.getBlockM() != 128)
+    return emitOpError("The source must be a 128xN layout.");
+  auto dstTy = cast<triton::gpu::MemDescType>(getResult().getType());
+  auto dstEncoding = dyn_cast<triton::nvidia_gpu::TensorMemoryEncodingAttr>(
+      dstTy.getEncoding());
+  if (!dstEncoding)
+    return emitOpError("The destination must be a tensor memory buffer.");
+  if (dstEncoding.getBlockM() != encoding.getBlockM() ||
+      dstEncoding.getCTASplitM() != encoding.getCTASplitM() ||
+      dstEncoding.getCTASplitN() != encoding.getCTASplitN() ||
+      dstEncoding.getUnpacked() != encoding.getUnpacked())
+    return emitOpError("The destination must have the same block size and "
+                       "CTASplit size as the source.");
+  return mlir::success();
+}
+
+void TMEMSubSliceOp::build(OpBuilder &builder, OperationState &state,
+                           Value alloc, int offset, int size) {
+  auto allocTy = cast<triton::gpu::MemDescType>(alloc.getType());
+  SmallVector<int64_t> shape(allocTy.getShape());
+  shape.back() = size;
+  auto encoding =
+      cast<triton::nvidia_gpu::TensorMemoryEncodingAttr>(allocTy.getEncoding());
+  unsigned newBlockN = std::min<unsigned>(encoding.getBlockN(), size);
+  auto newEncoding = triton::nvidia_gpu::TensorMemoryEncodingAttr::get(
+      builder.getContext(), encoding.getBlockM(), newBlockN,
+      encoding.getUnpacked(), encoding.getCTASplitM(), encoding.getCTASplitN());
+  auto subsliceType = gpu::MemDescType::get(
+      shape, allocTy.getElementType(), newEncoding, allocTy.getMemorySpace(),
+      allocTy.getMutableMemory());
+  build(builder, state, subsliceType, alloc, offset);
+}
+
 } // namespace nvidia_gpu
 } // namespace triton
 } // namespace mlir

lib/Dialect/TritonNvidiaGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ add_triton_library(TritonNvidiaGPUTransforms
   FenceInsertion.cpp
   MMALowering.cpp
   OptimizeDescriptorEncoding.cpp
+  OptimizeTMemSubtiling.cpp
   PlanCTA.cpp
   PromoteLHSToTMem.cpp
   TensorMemoryAllocation.cpp
lib/Dialect/TritonNvidiaGPU/Transforms/OptimizeTMemSubtiling.cpp

Lines changed: 207 additions & 0 deletions
@@ -0,0 +1,207 @@
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
+#include "triton/Dialect/Triton/IR/Types.h"
+#include "triton/Dialect/TritonGPU/IR/Attributes.h"
+#include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
+#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
+#include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
+
+namespace {
+
+using namespace mlir;
+
+namespace ttng = triton::nvidia_gpu;
+namespace ttg = triton::gpu;
+namespace tt = triton;
+
+#define GEN_PASS_CLASSES
+#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h.inc"
+
+// If we don't know the effects of the op, we add all possible effects.
+static void addAllValuelessEffects(
+    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Read>());
+  effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Write>());
+  effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Allocate>());
+  effects.emplace_back(MemoryEffects::Effect::get<MemoryEffects::Free>());
+}
+
+static bool
+collectEffects(Operation *op,
+               SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+  // Collect the effect instances of the operation. Note that the
+  // implementation of getEffects erases all effect instances whose type
+  // differs from the template parameter, so we collect them first in a local
+  // buffer and then copy.
+  if (auto iface = dyn_cast<MemoryEffectOpInterface>(op)) {
+    SmallVector<MemoryEffects::EffectInstance> localEffects;
+    iface.getEffects(localEffects);
+    llvm::append_range(effects, localEffects);
+    return true;
+  }
+  if (op->hasTrait<OpTrait::HasRecursiveMemoryEffects>()) {
+    for (auto &region : op->getRegions()) {
+      for (auto &block : region) {
+        for (auto &innerOp : block)
+          if (!collectEffects(&innerOp, effects))
+            return false;
+      }
+    }
+    return true;
+  }
+
+  // We need to be conservative here in case the op doesn't have the interface
+  // and assume it can have any possible effect.
+  addAllValuelessEffects(effects);
+  return false;
+}
+
+// Sink tmem_loads as close to their use as possible to reduce register
+// pressure.
+static void sinkLoad(ttng::TMEMLoadOp load, Operation *cvt) {
+  Operation *insertBefore = nullptr;
+  Operation *next = cvt->getNextNode();
+  while (next && !next->hasTrait<OpTrait::IsTerminator>()) {
+    insertBefore = next;
+    bool dep = false;
+    for (auto operand : getNestedOperands(next)) {
+      if (operand == cvt->getResult(0)) {
+        dep = true;
+        break;
+      }
+    }
+    if (!isMemoryEffectFree(next)) {
+      SmallVector<MemoryEffects::EffectInstance> effects;
+      collectEffects(next, effects);
+      for (auto effect : effects) {
+        if (effect.getEffect() ==
+                MemoryEffects::Effect::get<MemoryEffects::Write>() ||
+            effect.getEffect() ==
+                MemoryEffects::Effect::get<MemoryEffects::Allocate>()) {
+          if (effect.getResource() ==
+                  mlir::SideEffects::DefaultResource::get() ||
+              effect.getResource() ==
+                  mlir::triton::nvidia_gpu::TensorMemory::get()) {
+            dep = true;
+            break;
+          }
+        }
+      }
+    }
+    if (dep)
+      break;
+    next = next->getNextNode();
+  }
+  if (insertBefore) {
+    load->moveBefore(insertBefore);
+    cvt->moveBefore(insertBefore);
+  }
+}
+
+// clang-format off
+// Converts:
+// %l = ttng.tmem_load %o : !ttg.memdesc<128x256xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x256xf32, #blocked>
+// %r = tt.reshape %l : tensor<128x256xf32, #blocked> -> tensor<128x2x128xf32, #blocked4>
+// %t = tt.trans %r {order = array<i32: 0, 2, 1>} : tensor<128x2x128xf32, #blocked4> -> tensor<128x128x2xf32, #blocked5>
+// %outLHS, %outRHS = tt.split %t : tensor<128x128x2xf32, #blocked5> -> tensor<128x128xf32, #blocked2>
+// To:
+// %o0 = ttng.tmem_subslice %o { N = 0 }: !ttg.memdesc<128x256xf32, #tmem, #ttng.tensor_memory, mutable> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+// %outLHS = ttng.tmem_load %o0 : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
+// %o1 = ttng.tmem_subslice %o { N = 128 }: !ttg.memdesc<128x256xf32, #tmem, #ttng.tensor_memory, mutable> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
+// %outRHS = ttng.tmem_load %o1 : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xf32, #blocked>
+// clang-format on
+// This will change the layout of the destination tensor to distribute each
+// slice across warps. It currently only supports simple cases where tmem can
+// be sliced easily. This could be extended if needed with more powerful
+// slicing support for tmem.
+class TMemSplitLoadPattern : public OpRewritePattern<tt::SplitOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tt::SplitOp splitOp,
+                                PatternRewriter &rewriter) const override {
+    auto src = splitOp.getSrc();
+    // Skip convert layout ops.
+    while (auto cvt = src.getDefiningOp<ttg::ConvertLayoutOp>()) {
+      src = cvt.getSrc();
+    }
+    // Only support splitting the N dimension at the outermost level.
+    auto transOp = src.getDefiningOp<tt::TransOp>();
+    if (!transOp || transOp.getOrder() != ArrayRef<int>({0, 2, 1}))
+      return failure();
+    auto reshapeOp = transOp.getSrc().getDefiningOp<tt::ReshapeOp>();
+    if (!reshapeOp)
+      return failure();
+    auto shape = reshapeOp.getResult().getType().getShape();
+    if (shape[0] != reshapeOp.getSrc().getType().getShape()[0])
+      return failure();
+    auto tmemLoad = reshapeOp.getSrc().getDefiningOp<ttng::TMEMLoadOp>();
+    if (!tmemLoad)
+      return failure();
+    // We found a tmem_load that is split on the N dimension. We can split it
+    // into multiple tmem_loads.
+    int mDim = getShapePerCTA(tmemLoad.getSrc().getType())[0];
+    // TODO: enable other M cases. (the layout is a bit more complex).
+    if (mDim != 128)
+      return failure();
+    int splitNSize = shape[2];
+    if (splitNSize < 8)
+      return failure();
+    Value tmem = tmemLoad.getSrc();
+    int numWarps = ttg::lookupNumWarps(tmemLoad);
+    rewriter.setInsertionPoint(tmemLoad);
+    // First slice.
+    Value subSlice0 = rewriter.create<ttng::TMEMSubSliceOp>(
+        tmemLoad.getLoc(), tmem, 0, splitNSize);
+    Attribute distLayout = ttng::getTmemCompatibleLayout(
+        mDim, splitNSize, splitOp.getOutLHS().getType(), numWarps);
+    RankedTensorType newLoadType = RankedTensorType::get(
+        splitOp.getOutLHS().getType().getShape(),
+        splitOp.getOutLHS().getType().getElementType(), distLayout);
+    auto load0 = rewriter.create<ttng::TMEMLoadOp>(tmemLoad.getLoc(),
+                                                   newLoadType, subSlice0);
+    auto cvt0 = rewriter.create<ttg::ConvertLayoutOp>(
+        tmemLoad.getLoc(), splitOp.getOutLHS().getType(), load0);
+    // Second slice.
+    Value subSlice1 = rewriter.create<ttng::TMEMSubSliceOp>(
+        tmemLoad.getLoc(), tmem, splitNSize, splitNSize);
+    auto load1 = rewriter.create<ttng::TMEMLoadOp>(tmemLoad.getLoc(),
+                                                   newLoadType, subSlice1);
+    auto cvt1 = rewriter.create<ttg::ConvertLayoutOp>(
+        tmemLoad.getLoc(), splitOp.getOutRHS().getType(), load1);
+    rewriter.replaceOp(splitOp, {cvt0, cvt1});
+    sinkLoad(load0, cvt0);
+    sinkLoad(load1, cvt1);
+    return success();
+  }
+};
+
+class TritonNvidiaGPUOptimizeTMemSubtilingPass
+    : public TritonNvidiaGPUOptimizeTMemSubtilingPassBase<
+          TritonNvidiaGPUOptimizeTMemSubtilingPass> {
+public:
+  using BaseT = TritonNvidiaGPUOptimizeTMemSubtilingPassBase<
+      TritonNvidiaGPUOptimizeTMemSubtilingPass>;
+  using BaseT::BaseT;
+
+  void runOnOperation() override {
+    MLIRContext *context = &getContext();
+    ModuleOp m = getOperation();
+
+    mlir::RewritePatternSet patterns(context);
+    patterns.add<TMemSplitLoadPattern>(context);
+    if (failed(applyPatternsGreedily(m, std::move(patterns))))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::createTritonNvidiaGPUOptimizeTMemSubtilingPass() {
+  return std::make_unique<TritonNvidiaGPUOptimizeTMemSubtilingPass>();
+}

python/test/unit/language/test_matmul.py

Lines changed: 22 additions & 8 deletions
@@ -35,7 +35,7 @@ def matmul_kernel( #
         stride_cm, stride_cn, #
         BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, #
         NUM_STAGES: tl.constexpr, SCALE_A: tl.constexpr = None, PRECISION: tl.constexpr = "ieee",
-        A_TRANS: tl.constexpr = False):
+        A_TRANS: tl.constexpr = False, EPILOGUE_SUBTILE: tl.constexpr = False):
     pid = tl.program_id(axis=0)
     num_pid_m = tl.cdiv(M, BLOCK_M)
     pid_m = pid % num_pid_m
@@ -63,10 +63,21 @@ def matmul_kernel( #
         accumulator = tl.dot(a, b, acc=accumulator, out_dtype=output_ptr.dtype.element_ty, input_precision=PRECISION)
         a_ptrs += BLOCK_K * stride_ak
         b_ptrs += BLOCK_K * stride_bk
-    offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
-    offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
-    output_ptrs = output_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
-    tl.store(output_ptrs, accumulator)
+    if EPILOGUE_SUBTILE:
+        acc = tl.reshape(accumulator, (BLOCK_M, 2, BLOCK_N // 2))
+        acc = tl.permute(acc, (0, 2, 1))
+        acc0, acc1 = tl.split(acc)
+        offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+        offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N // 2)
+        output_ptrs0 = output_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+        output_ptrs1 = output_ptrs0 + stride_cn * (BLOCK_N // 2)
+        tl.store(output_ptrs0, acc0)
+        tl.store(output_ptrs1, acc1)
+    else:
+        offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
+        offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
+        output_ptrs = output_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
+        tl.store(output_ptrs, accumulator)
 
 
 def get_src_element_ty_size(dtype_str):
@@ -86,8 +97,9 @@ def get_src_element_ty_size(dtype_str):
                           (512, 64, 32, 2), (64, 16, 16, 4)])
 @pytest.mark.parametrize("NUM_CTAS", [1, 2])
 @pytest.mark.parametrize("NUM_WARPS", [4, 8])
-def test_simple_matmul(dtype_src_str, dtype_dst_str, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, NUM_WARPS, NUM_CTAS,
-                       device):
+@pytest.mark.parametrize("EPILOGUE_SUBTILE", [True, False])
+def test_simple_matmul(dtype_src_str, dtype_dst_str, BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES, NUM_WARPS, NUM_CTAS, device,
+                       EPILOGUE_SUBTILE):
     if NUM_CTAS > 1 and (not is_cuda() or torch.cuda.get_device_capability()[0] < 9):
         pytest.xfail("Clusters requires nvidia compute capability >= 9")
     if is_hip() and ((BLOCK_K * BLOCK_M + BLOCK_K * BLOCK_N) * NUM_STAGES * get_src_element_ty_size(dtype_src_str)
@@ -105,6 +117,8 @@ def test_simple_matmul(dtype_src_str, dtype_dst_str, BLOCK_M, BLOCK_N, BLOCK_K,
         pytest.skip("FMA matmul not supported for multiple CTAs")
     if (BLOCK_M < 64 or (BLOCK_M == 64 and BLOCK_N == 16)) and NUM_CTAS > 1:
         pytest.skip("multi-CTAs is broken for mmav2")
+    if EPILOGUE_SUBTILE and not is_xpu() and (is_hip() or NUM_CTAS > 1 or BLOCK_N >= 512):
+        pytest.skip("creates convert layout too big to fit in smem")
     M, N, K = 1024, 512, 256
     torch.manual_seed(42)
     precision = "tf32" if dtype_src_str == "tensorfloat32" else "ieee"
@@ -125,7 +139,7 @@ def test_simple_matmul(dtype_src_str, dtype_dst_str, BLOCK_M, BLOCK_N, BLOCK_K,
     grid = (triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N), 1)
     k = matmul_kernel[grid](a, b, output, M, N, K, a.stride(0), a.stride(1), b.stride(0), b.stride(1), output.stride(0),
                             output.stride(1), BLOCK_M, BLOCK_N, BLOCK_K, NUM_STAGES=NUM_STAGES, PRECISION=precision,
-                            num_warps=NUM_WARPS, num_ctas=NUM_CTAS)
+                            num_warps=NUM_WARPS, num_ctas=NUM_CTAS, EPILOGUE_SUBTILE=EPILOGUE_SUBTILE)
     ref_out = torch.matmul(A, B).to(torch.float32)
     output = output.to(torch.float32)
     if dtype_src_str == "float32":
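
Note (not part of the diff): the EPILOGUE_SUBTILE path relies on reshape -> permute -> split decomposing the accumulator into its two contiguous column halves, which is exactly the shape the new tmem-subtiling pass can serve with two ttng.tmem_subslice + ttng.tmem_load pairs. A quick standalone sanity check of that algebra in plain PyTorch (not Triton; the block sizes below are chosen arbitrarily):

import torch

BLOCK_M, BLOCK_N = 128, 256
acc = torch.randn(BLOCK_M, BLOCK_N)

# Mirrors tl.reshape / tl.permute / tl.split from the kernel epilogue.
tmp = acc.reshape(BLOCK_M, 2, BLOCK_N // 2)
tmp = tmp.permute(0, 2, 1)
acc0, acc1 = tmp[..., 0], tmp[..., 1]

# The two halves are the left and right column blocks of the accumulator,
# matching the stores to output_ptrs0 and output_ptrs1 (offset by BLOCK_N // 2).
assert torch.equal(acc0, acc[:, :BLOCK_N // 2])
assert torch.equal(acc1, acc[:, BLOCK_N // 2:])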
