Commit 5b37e06

feat: more triton passes + keep triton func in a separate module

1 parent: 4e953a5
9 files changed (+136, -18 lines)

deps/ReactantExtra/API.cpp

Lines changed: 42 additions & 2 deletions
@@ -487,8 +487,8 @@ MakeGPUClient(int node_id, int num_nodes, int64_t *allowed_devices,
   return client.release();
 }
 #else
-  *error = "ReactantExtra was not built with GPU support";
-  return nullptr;
+  *error = "ReactantExtra was not built with GPU support";
+  return nullptr;
 #endif
 }

@@ -716,16 +716,56 @@ std::vector<int64_t> row_major(int64_t dim) {
 static void noop() {}
 
 #ifdef REACTANT_CUDA
+
 #include "third_party/gpus/cuda/include/cuda.h"
+
 REACTANT_ABI int32_t ReactantCudaDriverGetVersion() {
   int32_t data;
   ReactantHandleCuResult(cuDriverGetVersion(&data));
   return data;
 }
+
 REACTANT_ABI int32_t ReactantHermeticCudaGetVersion() { return CUDA_VERSION; }
+
+REACTANT_ABI int32_t ReactantCudaDeviceGetComputeCapalilityMajor() {
+  CUdevice cuDevice;
+  ReactantHandleCuResult(cuDeviceGet(&cuDevice, 0));
+  int major;
+  ReactantHandleCuResult(cuDeviceGetAttribute(
+      &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+  return major;
+}
+
+REACTANT_ABI int32_t ReactantCudaDeviceGetComputeCapalilityMinor() {
+  CUdevice cuDevice;
+  ReactantHandleCuResult(cuDeviceGet(&cuDevice, 0));
+  int minor;
+  ReactantHandleCuResult(cuDeviceGetAttribute(
+      &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+  return minor;
+}
+
+REACTANT_ABI int32_t ReactantCudaDeviceGetWarpSizeInThreads() {
+  CUdevice cuDevice;
+  ReactantHandleCuResult(cuDeviceGet(&cuDevice, 0));
+  int warpSize;
+  ReactantHandleCuResult(cuDeviceGetAttribute(
+      &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice));
+  return warpSize;
+}
+
 #else
+
 REACTANT_ABI int32_t ReactantCudaDriverGetVersion() { return 0; }
+
 REACTANT_ABI int32_t ReactantHermeticCudaGetVersion() { return 0; }
+
+REACTANT_ABI int32_t ReactantCudaDeviceGetComputeCapalilityMajor() { return 0; }
+
+REACTANT_ABI int32_t ReactantCudaDeviceGetComputeCapalilityMinor() { return 0; }
+
+REACTANT_ABI int32_t ReactantCudaDeviceGetWarpSizeInThreads() { return 0; }
+
 #endif
 
 REACTANT_ABI void *UnsafeBufferPointer(PjRtBuffer *buffer) {
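
The new entry points are meant to be reached over the C ABI from Julia (the src/xla/XLA.jl hunk below does exactly this through MLIR.API.mlir_c). A minimal sketch of querying them directly; the library constant here is only a placeholder for however the caller loads ReactantExtra:

    # Sketch: query the default CUDA device via the new ReactantExtra symbols.
    # `libReactantExtra` is a hypothetical library name; Reactant itself goes
    # through MLIR.API.mlir_c instead.
    const libReactantExtra = "libReactantExtra.so"

    cc_major = @ccall libReactantExtra.ReactantCudaDeviceGetComputeCapalilityMajor()::Int32
    cc_minor = @ccall libReactantExtra.ReactantCudaDeviceGetComputeCapalilityMinor()::Int32
    warp     = @ccall libReactantExtra.ReactantCudaDeviceGetWarpSizeInThreads()::Int32

    # The non-CUDA stubs all return 0, so (0, 0, 0) means "built without GPU support".
    println("compute capability sm_$(cc_major)$(cc_minor), warp size = $warp")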

deps/ReactantExtra/BUILD

Lines changed: 3 additions & 0 deletions
@@ -979,6 +979,9 @@ cc_library(
         "-Wl,-exported_symbol,_ReactantFuncSetArgAttr",
         "-Wl,-exported_symbol,_ReactantHermeticCudaGetVersion",
         "-Wl,-exported_symbol,_ReactantCudaDriverGetVersion",
+        "-Wl,-exported_symbol,_ReactantCudaDeviceGetComputeCapalilityMajor",
+        "-Wl,-exported_symbol,_ReactantCudaDeviceGetComputeCapalilityMinor",
+        "-Wl,-exported_symbol,_ReactantCudaDeviceGetWarpSizeInThreads",
         "-Wl,-exported_symbol,_ReactantLLVMParseCommandLineOptions",
         "-Wl,-exported_symbol,_PjRtDeviceGetLocalDeviceId",
         "-Wl,-exported_symbol,_PjRtDeviceGetGlobalDeviceId",

deps/ReactantExtra/WORKSPACE

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ NSYNC_COMMIT = "82b118aa7ace3132e517e2c467f8732978cf4023"
 
 NSYNC_SHA256 = ""
 
-ENZYMEXLA_COMMIT = "fba59b4000ea352b145a14e1384b8c2940299987"
+ENZYMEXLA_COMMIT = "b59185c7586783a17d9486e682307ae89c713964"
 
 ENZYMEXLA_SHA256 = ""

ext/ReactantCUDAExt.jl

Lines changed: 0 additions & 8 deletions
@@ -1460,14 +1460,6 @@ function Reactant.make_tracer(
     return newa
 end
 
-function __init__()
-    if CUDA.functional() && !Reactant.precompiling()
-        cap = CUDA.capability(CUDA.device())
-        Reactant.Compiler.cubinChip[] = "sm_$(cap.major)$(cap.minor)"
-    end
-    return nothing
-end
-
 # In Julia v1.11.3 precompiling this module caches bad code:
 # <https://github.com/EnzymeAD/Reactant.jl/issues/614>.
 @static if !Sys.isapple()

ext/ReactantPythonCallExt/pycall.jl

Lines changed: 7 additions & 3 deletions
@@ -60,6 +60,7 @@ signature_string(::TracedRNumber{T}) where {T} = "$(MLIR_TYPE_STRING[T])", nothing
 signature_string(x::T) where {T<:Number} = string(x), x
 signature_string(x) = error("Unsupported argument type: $(typeof(x))")
 
+# TODO: better name for hints?
 function overlayed_pycall_with_triton(
     kernel::Py, args...; grid, num_warps::Integer=1, num_stages::Integer=3, hints=nothing
 )

@@ -95,8 +96,11 @@
         fn=kernel, constexprs=constants, signature=sigmap, attrs=attrs
     )
 
-    # TODO: check that we are using CUDA. Get compute_capability from the target
-    target = triton.backends.compiler.GPUTarget("cuda", 80, 32)
+    target = triton.backends.compiler.GPUTarget(
+        "cuda",
+        parse(Int, Reactant.Compiler.cubinChip[][4:end]),
+        Reactant.Compiler.cuWarpSize[],
+    )
     backend = triton.compiler.make_backend(target)
     options = backend.parse_options(
         pydict(

@@ -111,7 +115,7 @@
     ccinfo = triton.compile(src; target=target, options=options.__dict__)
 
     @opcall triton_call(
-        pyconvert(String, ccinfo.asm["ttir"]),
+        pyconvert(String, ccinfo.asm["source"]),
        filter(x -> x isa Reactant.TracedType, args)...;
        func_name=pyconvert(String, ccinfo.metadata.name),
        grid_x=@opcall(constant(grid[1])),
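
The GPUTarget is now built from the values recorded during XLA client initialization (see src/xla/XLA.jl below) instead of the hard-coded (80, 32). A small self-contained illustration of the string slicing, assuming cubinChip holds the usual "sm_<major><minor>" form:

    # Illustration only: recovering the numeric compute capability from the
    # "sm_XY" string stored in Reactant.Compiler.cubinChip[].
    cubin_chip = "sm_80"                                 # e.g. what an A100 would record
    compute_capability = parse(Int, cubin_chip[4:end])   # "80" -> 80
    warp_size = 32                                       # Reactant.Compiler.cuWarpSize[] default

    @assert (compute_capability, warp_size) == (80, 32)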

src/Compiler.jl

Lines changed: 60 additions & 1 deletion
@@ -1291,9 +1291,66 @@ function optimization_passes(
         push!(passes, "remove-duplicate-func-def")
     end
     push!(passes, func_passes)
+    if backend == "cuda"
+        push!(passes, triton_optimization_passes())
+    end
     return join(passes, ',')
 end
 
+# https://github.com/triton-lang/triton/blob/8ee584014e9570ba608809c42dc2060fdd214a98/python/src/passes.cc
+function triton_optimization_passes()
+    # TODO: check that all triton passes are included here
+    return join(
+        [
+            # convert passes
+            "convert-scf-to-cf",
+            "convert-cf-to-llvm",
+            "convert-index-to-llvm",
+            "convert-arith-to-llvm",
+            "convert-nvvm-to-llvm",
+            # common passes
+            "canonicalize",
+            # # ttir passes
+            # "triton-combine",
+            # "triton-reorder-broadcast",
+            # "triton-rewrite-tensor-pointer",
+            # "triton-rewrite-tensor-descriptor-to-pointer",
+            # "triton-loop-unroll",
+            # "triton-licm",
+            # "triton-loop-aware-cse",
+            # # TODO: should num-warps and num-ctas be set for each kernel?
+            # "convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
+            # # ttgir passes
+            # "tritongpu-coalesce",
+            # "tritongpu-optimize-thread-locality",
+            # "tritongpu-hoist-tmem-alloc",
+            # "tritongpu-assign-latencies",
+            # "tritongpu-pipeline",
+            # "tritongpu-schedule-loops",
+            # "tritongpu-automatic-warp-specialization",
+            # "tritongpu-prefetch",
+            # "tritongpu-accelerate-matmul",
+            # "tritongpu-reorder-instructions",
+            # "tritongpu-F32DotTC",
+            # "tritongpu-optimize-dot-operands",
+            # "tritongpu-remove-layout-conversions",
+            # "tritongpu-reduce-data-duplication",
+            # "tritongpu-hoist-tmem-alloc",
+            # "tritongpu-fuse-nested-loops",
+            # "tritongpu-rewrite-partition-dependencies",
+            # "tritongpu-partition-loops",
+            # "tritongpu-combine-tensor-select-and-if",
+            # # ttgir to llvm passes
+            # "tritongpu-allocate-warp-groups",
+            # "allocate-shared-memory",
+            # "tritongpu-global-scratch-memory-allocation",
+            # "tritongpu-optimize-accumulator-init",
+            # "tritongpu-coalesce-async-copy",
+        ],
+        ",",
+    )
+end
+
 # TODO we want to be able to run the more advanced passes via transform dialect as an enzyme intermediate
 # However, this errs as we cannot attach the transform with to the funcop itself [as we run a functionpass].
 const enzyme_pass::String = "enzyme{postpasses=\"arith-raise{stablehlo=true},canonicalize,cse,canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math,canonicalize,cse,canonicalize\"}"

@@ -1425,6 +1482,7 @@ const cubinChip = Ref{String}("sm_60")
 const cubinFormat = Ref{String}("bin")
 const cuindexBitWidth = Ref{Int}(32)
 const cuOptLevel = Ref{Int}(2)
+const cuWarpSize = Ref{Int}(32)
 # Wgatever the relevant highest version from our LLVM is within NVPTX.td
 # Or more specifically looking at clang/lib/Driver/ToolChains/Cuda.cpp:684
 # We see relevant ptx version is CUDA 12.6 -> 85

@@ -2245,7 +2303,8 @@ function compile_mlir!(
         end
     end
 
-    run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
+    # XXX: re-enable this pass
+    # run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
 
     func_op = MLIR.API.mlirSymbolTableLookup(
         MLIR.IR.SymbolTable(MLIR.IR.Operation(mod)), fnname
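
triton_optimization_passes() only assembles a comma-separated pass list; optimization_passes appends it when the backend is "cuda", and the combined string is what ultimately reaches the pass pipeline. A rough sketch of what the enabled portion evaluates to (pass names copied from the hunk above; the commented-out ttir/ttgir passes are not included yet, and the surrounding pass list here is a stand-in):

    # Sketch: the currently enabled Triton-related passes, joined the same way
    # triton_optimization_passes() joins them.
    triton_passes = join(
        [
            "convert-scf-to-cf",
            "convert-cf-to-llvm",
            "convert-index-to-llvm",
            "convert-arith-to-llvm",
            "convert-nvvm-to-llvm",
            "canonicalize",
        ],
        ",",
    )

    # optimization_passes(...) effectively does this for backend == "cuda":
    passes = ["canonicalize", "remove-duplicate-func-def"]  # stand-ins for the earlier entries
    push!(passes, triton_passes)
    pipeline = join(passes, ',')
    println(pipeline)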

src/Ops.jl

Lines changed: 12 additions & 2 deletions
@@ -1701,12 +1701,20 @@ end
 _new_function_name(orig_name, module_suffix) = orig_name * "_call_" * module_suffix
 
 function _extract_function(
-    code::String; func_name::String="main", func_op_kind::String="func.func"
+    code::String;
+    func_name::String="main",
+    func_op_kind::String="func.func",
+    nested_module::Bool=false,
 )
     module_suffix = string(hash(code); base=16)
     name_to_call = _new_function_name(func_name, module_suffix)
 
     current_module = MLIR.IR.mmodule()
+    if nested_module
+        new_module = MLIR.IR.Module()
+        push!(MLIR.IR.body(current_module), MLIR.IR.Operation(new_module, true))
+        current_module = new_module
+    end
     top_level_block = MLIR.IR.body(current_module)
 
     symbol_attr_name = String(MLIR.API.mlirSymbolTableGetSymbolAttributeName())

@@ -1770,7 +1778,9 @@ function triton_call(
     location=mlir_stacktrace("triton_call", @__FILE__, @__LINE__),
     # TODO: other kwargs
 )
-    _, name_to_call = _extract_function(mlir_code; func_name, func_op_kind="tt.func")
+    _, name_to_call = _extract_function(
+        mlir_code; func_name, func_op_kind="tt.func", nested_module=true
+    )
 
     enzymexla.triton_call(
         grid_x.mlir_data,
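
With nested_module=true the extracted function is parsed into a fresh module that is spliced into the current one, so the tt.func lives in its own nested module rather than in the top-level symbol table (the "separate module" part of the commit message). A hedged sketch of the call as triton_call now makes it, assuming the internal helper is reachable as Reactant.Ops._extract_function and that mlir_code/func_name are the TTIR text and kernel name obtained from Triton in pycall.jl:

    # Sketch (internal API, must run inside an active MLIR module context).
    _, name_to_call = Reactant.Ops._extract_function(
        mlir_code;
        func_name=func_name,
        func_op_kind="tt.func",
        nested_module=true,   # parse into a nested module instead of the top level
    )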

src/mlir/IR/Module.jl

Lines changed: 2 additions & 1 deletion
@@ -52,7 +52,8 @@ body(module_) = Block(API.mlirModuleGetBody(module_), false)
 
 Views the module as a generic operation.
 """
-Operation(module_::Module) = Operation(API.mlirModuleGetOperation(module_), false)
+Operation(module_::Module, owned::Bool=false) =
+    Operation(API.mlirModuleGetOperation(module_), owned)
 
 function Base.show(io::IO, module_::Module)
     return show(io, Operation(module_))
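
The new owned flag matters when a module's operation is handed to another block: passing owned=true transfers ownership so the wrapper does not free it a second time. A minimal sketch of the pattern Ops.jl uses above (assuming an active MLIR module, e.g. during tracing):

    # Sketch: splice a fresh module into the one currently being built.
    parent = MLIR.IR.mmodule()     # the module under construction
    child  = MLIR.IR.Module()      # new, empty module
    # owned=true hands ownership of the child's op to the parent block.
    push!(MLIR.IR.body(parent), MLIR.IR.Operation(child, true))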

src/xla/XLA.jl

Lines changed: 9 additions & 0 deletions
@@ -234,6 +234,15 @@ for runtime in (:PJRT, :IFRT)
            )
            state.clients["cuda"] = gpu
            state.default_client = gpu
+
+            # set values for cuda. This is being done here since we need cuda
+            # to be initialized before we can use it. initializing the devices
+            # implicitly initializes cuda.
+            cc_major = @ccall MLIR.API.mlir_c.ReactantCudaDeviceGetComputeCapalilityMajor()::Int32
+            cc_minor = @ccall MLIR.API.mlir_c.ReactantCudaDeviceGetComputeCapalilityMinor()::Int32
+            Reactant.Compiler.cubinChip[] = "sm_$(cc_major)$(cc_minor)"
+
+            Reactant.Compiler.cuWarpSize[] = @ccall MLIR.API.mlir_c.ReactantCudaDeviceGetWarpSizeInThreads()::Int32
        catch e
            println(stdout, e)
        end
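
Net effect: once the first CUDA client is created, cubinChip and cuWarpSize describe the actual device instead of the sm_60 / 32 defaults, and the Triton GPUTarget in pycall.jl reads from them. An illustrative check after initialization (the printed values depend on the GPU; sm_80 and 32 are what an A100 would report):

    # Illustrative only: inspect the values populated during client init.
    using Reactant
    @show Reactant.Compiler.cubinChip[]    # e.g. "sm_80"
    @show Reactant.Compiler.cuWarpSize[]   # 32 on current NVIDIA hardware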
