@@ -1298,9 +1298,66 @@ function optimization_passes(
         push!(passes, "remove-duplicate-func-def")
     end
     push!(passes, func_passes)
+    if backend == "cuda"
+        push!(passes, triton_optimization_passes())
+    end
     return join(passes, ',')
 end

+# https://github.com/triton-lang/triton/blob/8ee584014e9570ba608809c42dc2060fdd214a98/python/src/passes.cc
+function triton_optimization_passes()
+    # TODO: check that all triton passes are included here
+    return join(
+        [
+            # convert passes
+            "convert-scf-to-cf",
+            "convert-cf-to-llvm",
+            "convert-index-to-llvm",
+            "convert-arith-to-llvm",
+            "convert-nvvm-to-llvm",
+            # common passes
+            "canonicalize",
+            # # ttir passes
+            # "triton-combine",
+            # "triton-reorder-broadcast",
+            # "triton-rewrite-tensor-pointer",
+            # "triton-rewrite-tensor-descriptor-to-pointer",
+            # "triton-loop-unroll",
+            # "triton-licm",
+            # "triton-loop-aware-cse",
+            # # TODO: should num-warps and num-ctas be set for each kernel?
+            # "convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
+            # # ttgir passes
+            # "tritongpu-coalesce",
+            # "tritongpu-optimize-thread-locality",
+            # "tritongpu-hoist-tmem-alloc",
+            # "tritongpu-assign-latencies",
+            # "tritongpu-pipeline",
+            # "tritongpu-schedule-loops",
+            # "tritongpu-automatic-warp-specialization",
+            # "tritongpu-prefetch",
+            # "tritongpu-accelerate-matmul",
+            # "tritongpu-reorder-instructions",
+            # "tritongpu-F32DotTC",
+            # "tritongpu-optimize-dot-operands",
+            # "tritongpu-remove-layout-conversions",
+            # "tritongpu-reduce-data-duplication",
+            # "tritongpu-hoist-tmem-alloc",
+            # "tritongpu-fuse-nested-loops",
+            # "tritongpu-rewrite-partition-dependencies",
+            # "tritongpu-partition-loops",
+            # "tritongpu-combine-tensor-select-and-if",
+            # # ttgir to llvm passes
+            # "tritongpu-allocate-warp-groups",
+            # "allocate-shared-memory",
+            # "tritongpu-global-scratch-memory-allocation",
+            # "tritongpu-optimize-accumulator-init",
+            # "tritongpu-coalesce-async-copy",
+        ],
+        ",",
+    )
+end
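+
+# NOTE: illustrative usage sketch (exact call site and argument names are assumed,
+# not taken from this commit). The comma-separated string returned above is pushed
+# onto the pass list built by `optimization_passes` when `backend == "cuda"`, and
+# the combined pipeline is then executed with `run_pass_pipeline!`, roughly:
+#
+#     pipeline = optimization_passes(...)  # with backend set to "cuda"
+#     run_pass_pipeline!(mod, pipeline, "optimization_passes")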
+
 
 # TODO we want to be able to run the more advanced passes via transform dialect as an enzyme intermediate
 # However, this errs as we cannot attach the transform to the funcop itself [as we run a functionpass].
 const enzyme_pass::String = "enzyme{postpasses=\"arith-raise{stablehlo=true},canonicalize,cse,canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math,canonicalize,cse,canonicalize\"}"
@@ -2254,7 +2311,8 @@ function compile_mlir!(
         end
     end

-    run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
+    # XXX: re-enable this pass
+    # run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")

     func_op = MLIR.API.mlirSymbolTableLookup(
         MLIR.IR.SymbolTable(MLIR.IR.Operation(mod)), fnname