Skip to content

Commit 357f1c0

Browse files
committed
feat: Triton tracing now works
1 parent 30976c0 commit 357f1c0

File tree

4 files changed

+110
-49
lines changed

4 files changed

+110
-49
lines changed

docs/src/.vitepress/config.mts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -131,6 +131,7 @@ export default defineConfig({
131131
{ text: "SparseTensor", link: "/api/dialects/sparsetensor" },
132132
{ text: "StableHLO", link: "/api/dialects/stablehlo" },
133133
{ text: "Triton", link: "/api/dialects/triton" },
134+
{ text: "TritonExt", link: "/api/dialects/tritonext" },
134135
{ text: "TPU", link: "/api/dialects/tpu" },
135136
{ text: "VHLO", link: "/api/dialects/vhlo" },
136137
],
@@ -221,6 +222,7 @@ export default defineConfig({
221222
{ text: "SparseTensor", link: "/api/dialects/sparsetensor" },
222223
{ text: "StableHLO", link: "/api/dialects/stablehlo" },
223224
{ text: "Triton", link: "/api/dialects/triton" },
225+
{ text: "TritonExt", link: "/api/dialects/tritonext" },
224226
{ text: "TPU", link: "/api/dialects/tpu" },
225227
{ text: "VHLO", link: "/api/dialects/vhlo" },
226228
],

docs/src/api/dialects/tritonext.md

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
```@meta
2+
CollapsedDocStrings = true
3+
```
4+
5+
# TritonExt Dialect
6+
7+
Provides extensions to the Triton dialect.
8+
9+
```@autodocs
10+
Modules = [Reactant.MLIR.Dialects.triton_ext]
11+
```

src/Compiler.jl

Lines changed: 59 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1305,57 +1305,89 @@ function optimization_passes(
13051305
end
13061306

13071307
# https://github.com/triton-lang/triton/blob/8ee584014e9570ba608809c42dc2060fdd214a98/python/src/passes.cc
1308+
# To get the latest passes run triton with MLIR_ENABLE_DUMP=1 and then extract the passes
13081309
function triton_optimization_passes()
1309-
# TODO: check that all triton passes are included here
1310-
return join(
1310+
all_passes = join(
13111311
[
1312-
# convert passes
1313-
"convert-scf-to-cf",
1314-
"convert-cf-to-llvm",
1315-
"convert-index-to-llvm",
1316-
"convert-arith-to-llvm",
1317-
"convert-nvvm-to-llvm",
1318-
# common passes
13191312
"canonicalize",
1320-
# ttir passes
1313+
"triton-rewrite-tensor-pointer",
1314+
"canonicalize",
13211315
"triton-combine",
13221316
"triton-reorder-broadcast",
1323-
"triton-rewrite-tensor-pointer",
1324-
"triton-rewrite-tensor-descriptor-to-pointer",
1317+
"cse",
1318+
"symbol-dce",
13251319
"triton-loop-unroll",
1326-
"triton-licm",
1327-
"triton-loop-aware-cse",
1328-
# TODO: should num-warps and num-ctas be set for each kernel?
13291320
"convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
1330-
# ttgir passes
13311321
"tritongpu-coalesce",
1322+
"tritongpu-F32DotTC",
1323+
"triton-nvidia-gpu-plan-cta",
1324+
"tritongpu-remove-layout-conversions",
13321325
"tritongpu-optimize-thread-locality",
1326+
"tritongpu-accelerate-matmul",
1327+
"tritongpu-remove-layout-conversions",
1328+
"tritongpu-optimize-dot-operands",
1329+
"canonicalize",
1330+
"triton-nvidia-optimize-descriptor-encoding",
1331+
"triton-loop-aware-cse",
1332+
"tritongpu-fuse-nested-loops",
1333+
"canonicalize",
1334+
"triton-licm",
1335+
"tritongpu-optimize-accumulator-init",
13331336
"tritongpu-hoist-tmem-alloc",
1337+
"tritongpu-promote-lhs-to-tmem",
13341338
"tritongpu-assign-latencies",
1335-
"tritongpu-pipeline",
13361339
"tritongpu-schedule-loops",
13371340
"tritongpu-automatic-warp-specialization",
1341+
"tritongpu-partition-scheduling",
1342+
"tritongpu-load-mma-specialization",
1343+
"tritongpu-rewrite-partition-dependencies",
1344+
"sccp",
1345+
"cse",
1346+
"tritongpu-partition-loops",
1347+
"tritongpu-optimize-partition-warps",
1348+
"tritongpu-schedule-loops",
1349+
"tritongpu-pipeline",
1350+
"tritongpu-combine-tensor-select-and-if",
1351+
"triton-nvidia-gpu-remove-tmem-tokens",
1352+
"canonicalize",
1353+
"triton-loop-aware-cse",
13381354
"tritongpu-prefetch",
1339-
"tritongpu-accelerate-matmul",
1340-
"tritongpu-reorder-instructions",
1341-
"tritongpu-F32DotTC",
13421355
"tritongpu-optimize-dot-operands",
1356+
"canonicalize",
1357+
"tritongpu-coalesce-async-copy",
1358+
"triton-nvidia-optimize-tmem-layouts",
13431359
"tritongpu-remove-layout-conversions",
1360+
"triton-nvidia-interleave-tmem",
13441361
"tritongpu-reduce-data-duplication",
1345-
"tritongpu-hoist-tmem-alloc",
1346-
"tritongpu-fuse-nested-loops",
1347-
"tritongpu-rewrite-partition-dependencies",
1348-
"tritongpu-partition-loops",
1362+
"tritongpu-reorder-instructions",
1363+
"triton-loop-aware-cse",
1364+
"symbol-dce",
1365+
"triton-nvidia-tma-lowering",
1366+
"triton-nvidia-gpu-fence-insertion",
1367+
"sccp",
1368+
"canonicalize",
1369+
"triton-nvidia-mma-lowering",
13491370
"tritongpu-combine-tensor-select-and-if",
1350-
# ttgir to llvm passes
13511371
"tritongpu-allocate-warp-groups",
1372+
"convert-scf-to-cf",
13521373
"allocate-shared-memory",
1374+
"triton-tensor-memory-allocation",
13531375
"tritongpu-global-scratch-memory-allocation",
1354-
"tritongpu-optimize-accumulator-init",
1355-
"tritongpu-coalesce-async-copy",
1376+
# TODO: register the commented out passes
1377+
# "convert-triton-gpu-to-llvm",
1378+
"canonicalize",
1379+
"cse",
1380+
# "convert-nv-gpu-to-llvm",
1381+
# "convert-warp-specialize-to-llvm",
1382+
"reconcile-unrealized-casts",
1383+
"canonicalize",
1384+
"cse",
1385+
"symbol-dce",
1386+
"enable-line-info",
13561387
],
13571388
",",
13581389
)
1390+
return "triton_ext.module(builtin.module($(all_passes)))"
13591391
end
13601392

13611393
# TODO we want to be able to run the more advanced passes via transform dialect as an enzyme intermediate

src/Ops.jl

Lines changed: 38 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
# Julia and Reactant semantics should be considered on the higher abstractions that use these
44
module Ops
55
using ..MLIR: MLIR
6-
using ..MLIR.Dialects: stablehlo, chlo, enzyme, enzymexla
6+
using ..MLIR.Dialects: stablehlo, chlo, enzyme, enzymexla, triton_ext
77
using ..Reactant:
88
Reactant,
99
TracedRArray,
@@ -1749,32 +1749,52 @@ function _extract_function(
17491749
code::String;
17501750
func_name::String="main",
17511751
func_op_kind::String="func.func",
1752-
nested_module::Bool=false,
17531752
location::MLIR.IR.Location=MLIR.IR.Location(),
17541753
)
17551754
module_suffix = string(hash(code); base=16)
17561755
name_to_call = func_name * "_call_" * module_suffix
17571756
mod_name = func_name * "_module_" * module_suffix
17581757
symbol_attr_name = String(MLIR.API.mlirSymbolTableGetSymbolAttributeName())
17591758

1760-
if nested_module
1759+
use_ttext_module = split(func_op_kind, ".")[1] == "tt"
1760+
1761+
if use_ttext_module
1762+
tt_mod_name = func_name * "_tt_module_" * module_suffix
1763+
tt_region = MLIR.IR.Region()
1764+
tt_block = MLIR.IR.Block()
1765+
push!(tt_region, tt_block)
1766+
triton_mod_op = triton_ext.module_(;
1767+
location, bodyRegion=tt_region, sym_name=tt_mod_name
1768+
)
1769+
MLIR.IR.rmfromparent!(triton_mod_op)
1770+
push!(MLIR.IR.body(MLIR.IR.mmodule()), triton_mod_op) # insert into parent module
1771+
17611772
region = MLIR.IR.Region()
17621773
push!(region, MLIR.IR.Block())
17631774
moduleop = MLIR.Dialects.builtin.module_(;
17641775
location, bodyRegion=region, sym_name=mod_name
17651776
)
17661777
MLIR.IR.rmfromparent!(moduleop)
1767-
push!(MLIR.IR.body(MLIR.IR.mmodule()), moduleop) # insert into parent module
1778+
push!(tt_block, moduleop) # insert into triton module
17681779

17691780
top_level_block = MLIR.IR.Block(
17701781
MLIR.API.mlirModuleGetBody(MLIR.API.mlirModuleFromOperation(moduleop)), false
17711782
)
17721783
fn = nothing
1784+
1785+
symref = MLIR.IR.SymbolRefAttribute(
1786+
tt_mod_name,
1787+
MLIR.IR.Attribute[
1788+
MLIR.IR.FlatSymbolRefAttribute(mod_name),
1789+
MLIR.IR.FlatSymbolRefAttribute(name_to_call),
1790+
],
1791+
)
17731792
else
17741793
current_module = MLIR.IR.mmodule()
17751794
moduleop = MLIR.IR.Operation(current_module)
17761795
top_level_block = MLIR.IR.body(current_module)
17771796
fn = MLIR.IR.lookup(MLIR.IR.SymbolTable(moduleop), name_to_call)
1797+
symref = MLIR.IR.FlatSymbolRefAttribute(name_to_call)
17781798
end
17791799

17801800
if isnothing(fn)
@@ -1795,12 +1815,14 @@ function _extract_function(
17951815
)
17961816
@assert res == MLIR.IR.success() "hlo_call: failed to rename $fn_name"
17971817

1798-
# Set function private
1799-
MLIR.IR.attr!(
1800-
op,
1801-
MLIR.API.mlirSymbolTableGetVisibilityAttributeName(),
1802-
MLIR.IR.Attribute("private"),
1803-
)
1818+
if !use_ttext_module
1819+
# Set function private
1820+
MLIR.IR.attr!(
1821+
op,
1822+
MLIR.API.mlirSymbolTableGetVisibilityAttributeName(),
1823+
MLIR.IR.Attribute("private"),
1824+
)
1825+
end
18041826

18051827
# Change function name
18061828
MLIR.IR.attr!(op, symbol_attr_name, MLIR.IR.Attribute(name_to_call))
@@ -1815,7 +1837,7 @@ function _extract_function(
18151837
error("hlo_call: could not find function $func_name in the provided module")
18161838
end
18171839

1818-
return fn, name_to_call, mod_name
1840+
return fn, symref
18191841
end
18201842

18211843
function triton_call(
@@ -1829,19 +1851,15 @@ function triton_call(
18291851
location=mlir_stacktrace("triton_call", @__FILE__, @__LINE__),
18301852
# TODO: other kwargs
18311853
)
1832-
_, name_to_call, mod_name = _extract_function(
1833-
mlir_code; func_name, func_op_kind="tt.func", nested_module=true, location
1834-
)
1854+
_, symref = _extract_function(mlir_code; func_name, func_op_kind="tt.func", location)
18351855

1836-
enzymexla.triton_call(
1856+
triton_ext.call(
18371857
grid_x.mlir_data,
18381858
grid_y.mlir_data,
18391859
grid_z.mlir_data,
18401860
shmem.mlir_data,
18411861
[Reactant.TracedUtils.get_mlir_data(a) for a in args];
1842-
fn=MLIR.IR.SymbolRefAttribute(
1843-
mod_name, MLIR.IR.Attribute[MLIR.IR.FlatSymbolRefAttribute(name_to_call)]
1844-
),
1862+
fn=symref,
18451863
result_0=MLIR.IR.Type[],
18461864
location,
18471865
)
@@ -1879,9 +1897,7 @@ julia> Reactant.@jit(
18791897
func_name="main",
18801898
location=mlir_stacktrace("hlo_call", @__FILE__, @__LINE__),
18811899
)
1882-
fn, name_to_call, _ = _extract_function(
1883-
code; func_name, func_op_kind="func.func", location
1884-
)
1900+
fn, symref = _extract_function(code; func_name, func_op_kind="func.func", location)
18851901

18861902
ftype_attr = MLIR.IR.attr(fn, "function_type")
18871903
ftype = MLIR.IR.Type(ftype_attr)
@@ -1898,7 +1914,7 @@ julia> Reactant.@jit(
18981914
call = MLIR.Dialects.func.call(
18991915
operands;
19001916
result_0=[MLIR.IR.result(ftype, i) for i in 1:MLIR.IR.nresults(ftype)],
1901-
callee=MLIR.IR.FlatSymbolRefAttribute(name_to_call),
1917+
callee=symref,
19021918
location,
19031919
)
19041920

0 commit comments

Comments
 (0)