[Gluon] Disable constant CSE before auto layout propagation (#8323)

peterbell10 · web-flow · commit b50872a8be95 · 2025-09-30T20:39:18.000+01:00
Fixes #8229

Background is that we run the gluon inliner prior to auto layout
propagation to enable returning auto layout from a function and having
different calls of the function resolve to different layouts.

However, the inliner calls gluon canonicalize and the
`GreedyPatternRewriter` defaults to CSEing constants. This means that
two distinct constants which could otherwise resolve to different
layouts may be CSEd into a single constant and create a new conflict.

I fix this by changing the inliner to do even less canonicalization, and
only simplify control flow operations. I then add a canonicalization pass
after auto layout resolution to make up for this.
diff --git a/include/triton/Dialect/Gluon/Transforms/Passes.td b/include/triton/Dialect/Gluon/Transforms/Passes.td
@@ -35,4 +35,14 @@ def GluonInline: Pass<"gluon-inline"> {
   let dependentDialects = [];
 }
 
+def GluonSimplifyControlFlow: Pass<"gluon-simplify-control-flow"> {
+  let summary = "simplifications for control flow ops";
+
+  let description = [{
+    The `gluon-simplify-control-flow` pass applies the `scf` and `cf`
+    canonicalization patterns to the module, with constant CSE disabled.
+  }];
+  let dependentDialects = [];
+}
+
 #endif
diff --git a/lib/Dialect/Gluon/Transforms/CMakeLists.txt b/lib/Dialect/Gluon/Transforms/CMakeLists.txt
@@ -2,6 +2,7 @@ add_triton_library(GluonTransforms
   Canonicalize.cpp
   Inline.cpp
   ResolveAutoEncodings.cpp
+  SimplifyControlFlow.cpp
 
   DEPENDS
   GluonTransformsIncGen
diff --git a/lib/Dialect/Gluon/Transforms/Inline.cpp b/lib/Dialect/Gluon/Transforms/Inline.cpp
@@ -22,7 +22,7 @@ struct Inline : public gluon::impl::GluonInlineBase<Inline> {
 void Inline::runOnOperation() {
   mlir::PassManager pm(&getContext());
   pm.addPass(createInlinerPass(/*opPipelines=*/{}, [](OpPassManager &pm) {
-    pm.addPass(gluon::createGluonCanonicalize());
+    pm.addPass(gluon::createGluonSimplifyControlFlow());
   }));
   if (failed(pm.run(getOperation())))
     return signalPassFailure();
diff --git a/lib/Dialect/Gluon/Transforms/SimplifyControlFlow.cpp b/lib/Dialect/Gluon/Transforms/SimplifyControlFlow.cpp
@@ -0,0 +1,49 @@
+#include "mlir/IR/OperationSupport.h"
+#include "triton/Dialect/Gluon/Transforms/Passes.h"
+
+#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+using namespace mlir;
+using namespace triton;
+
+namespace mlir::triton::gluon {
+#define GEN_PASS_DEF_GLUONSIMPLIFYCONTROLFLOW
+#include "triton/Dialect/Gluon/Transforms/Passes.h.inc"
+} // namespace mlir::triton::gluon
+
+namespace {
+struct SimplifyControlFlow // Gluon pass that simplifies control flow ops.
+    : public gluon::impl::GluonSimplifyControlFlowBase<SimplifyControlFlow> {
+  void runOnOperation() override; // Defined out of line, after this namespace.
+};
+} // namespace
+
+void SimplifyControlFlow::runOnOperation() {
+  // Gather the `scf` and `cf` canonicalization patterns: the dialect-level
+  // ones first, then those registered by each operation of either dialect.
+  MLIRContext *context = &getContext();
+  RewritePatternSet rewrites(context);
+  auto *scfDialect = context->getLoadedDialect<scf::SCFDialect>();
+  auto *cfDialect = context->getLoadedDialect<cf::ControlFlowDialect>();
+  scfDialect->getCanonicalizationPatterns(rewrites);
+  cfDialect->getCanonicalizationPatterns(rewrites);
+  for (StringRef dialectName : {scf::SCFDialect::getDialectNamespace(),
+                                cf::ControlFlowDialect::getDialectNamespace()})
+    for (RegisteredOperationName opName :
+         context->getRegisteredOperationsByDialect(dialectName))
+      opName.getCanonicalizationPatterns(rewrites, context);
+  populateForOpDeadArgumentElimination(rewrites);
+
+  // This pass is meant to run before AutoLayouts are resolved, and CSEing
+  // constants at that point can lead to additional layout conflicts, so
+  // constant CSE is switched off in the greedy driver.
+  GreedyRewriteConfig driverCfg;
+  driverCfg.enableConstantCSE(false);
+  (void)applyPatternsGreedily(getOperation(), std::move(rewrites), driverCfg);
+}
diff --git a/python/test/gluon/test_core.py b/python/test/gluon/test_core.py
@@ -1114,3 +1114,28 @@ def kernel(a_ptr, b_ptr, c_ptr, out_ptr):
     out = torch.empty((B, B), dtype=torch.float32, device="cuda")
     kernel[(1, )](a, b, c, out)
     torch.testing.assert_close(out, torch.addmm(c, a, b), atol=1e-2, rtol=1e-2)
+
+
+@gluon.jit
+def kernel_auto_layout_constant(threads_per_warp: ttgl.constexpr):
+    BLOCK: ttgl.constexpr = 16
+    SIZE: ttgl.constexpr = 10
+    # All-True (BLOCK, BLOCK) int1 mask created with an explicit BlockedLayout.
+    mask = ttgl.full(
+        (BLOCK, BLOCK),
+        True,
+        ttgl.int1,
+        ttgl.BlockedLayout(
+            size_per_thread=[1, 1],
+            threads_per_warp=[1, threads_per_warp],
+            warps_per_cta=[1, 4],
+            order=[1, 0],
+        ),
+    )
+    # Two AutoLayout compares against SIZE, expanded on different axes (regression for #8229).
+    mask &= (ttgl.arange(0, BLOCK, ttgl.AutoLayout()) < SIZE).expand_dims(0)
+    mask &= (ttgl.arange(0, BLOCK, ttgl.AutoLayout()) < SIZE).expand_dims(1)
+
+
+def test_auto_layout_constant():  # Regression test for #8229.
+    kernel_auto_layout_constant.warmup(THREADS_PER_WARP, grid=(1, ))
diff --git a/python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py b/python/triton/experimental/gluon/language/nvidia/blackwell/__init__.py
@@ -23,10 +23,10 @@
     "fence_async_shared",
     "get_tmem_32x32b_reg_layout",
     "mbarrier",
+    "mma_v2",
     "tensor_memory_descriptor",
     "TensorMemoryLayout",
     "tma",
-    "mma_v2",
 ]
 
 
diff --git a/python/triton/experimental/gluon/language/nvidia/hopper/__init__.py b/python/triton/experimental/gluon/language/nvidia/hopper/__init__.py
@@ -8,7 +8,7 @@
 if TYPE_CHECKING:
     from triton._C.libtriton import ir
 
-__all__ = ["async_copy", "fence_async_shared", "mbarrier", "tma", "warpgroup_mma", "warpgroup_mma_wait", "mma_v2"]
+__all__ = ["async_copy", "fence_async_shared", "mbarrier", "mma_v2", "tma", "warpgroup_mma", "warpgroup_mma_wait"]
 
 
 @_core.builtin
diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py
@@ -329,6 +329,7 @@ def gluon_to_ttgir(self, src, metadata, options, capability):
 
         passes.gluon.add_inliner(pm)
         passes.gluon.add_resolve_auto_encodings(pm)
+        passes.gluon.add_canonicalizer(pm)
         passes.common.add_sccp(pm)
         passes.ttir.add_loop_aware_cse(pm)
         passes.gluon.add_canonicalizer(pm)