Skip to content

Commit 04cbf60

Browse files
committed
feat: use new device properties [skip ci]
1 parent 827fbf4 commit 04cbf60

File tree

4 files changed

+55
-20
lines changed

4 files changed

+55
-20
lines changed

deps/ReactantExtra/WORKSPACE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ NSYNC_COMMIT = "82b118aa7ace3132e517e2c467f8732978cf4023"
44

55
NSYNC_SHA256 = ""
66

7-
ENZYMEXLA_COMMIT = "f2072aa2031eb6a1d5d1972d3a95340fb67c9480"
7+
ENZYMEXLA_COMMIT = "8221b6147f497592205e6f558b1609e2964f3330"
88

99
ENZYMEXLA_SHA256 = ""
1010

ext/ReactantPythonCallExt/pycall.jl

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,16 @@ function overlayed_pycall_with_jax_tracing(f::Py, args...)
4747
return length(res) == 0 ? nothing : (length(res) == 1 ? res[1] : res)
4848
end
4949

50-
function normalize_grid_and_blocks(grid_fn, metadata)
51-
return normalize_grid_and_blocks(grid_fn(metadata), metadata)
50+
function normalize_grid_and_blocks(grid_fn, metadata, device_properties)
51+
return normalize_grid_and_blocks(
52+
grid_fn(metadata, device_properties), metadata, device_properties
53+
)
5254
end
5355

54-
function normalize_grid_and_blocks(grid::Integer, metadata)
55-
return normalize_grid_and_blocks((grid,), metadata)
56+
function normalize_grid_and_blocks(grid::Integer, metadata, device_properties)
57+
return normalize_grid_and_blocks((grid,), metadata, device_properties)
5658
end
57-
function normalize_grid_and_blocks(grid::Dims{N}, metadata) where {N}
59+
function normalize_grid_and_blocks(grid::Dims{N}, metadata, device_properties) where {N}
5860
@assert N <= 3
5961
@assert all(grid .> 0)
6062
return (grid..., ntuple(_ -> 1, 3 - N)...)
@@ -71,8 +73,9 @@ function overlayed_pycall_with_triton(
7173
args...;
7274
grid,
7375
blocks,
74-
num_warps::Integer=1,
76+
num_warps::Integer=4,
7577
num_stages::Integer=3,
78+
num_ctas::Integer=1,
7679
hints=nothing,
7780
)
7881
triton = tritonptr[]
@@ -105,16 +108,23 @@ function overlayed_pycall_with_triton(
105108
fn=kernel, constexprs=constants, signature=sigmap, attrs=attrs
106109
)
107110

111+
# TODO: pass the device/client here from `compile`
112+
client = Reactant.XLA.default_backend()
113+
@assert Reactant.XLA.platform_name(client) == "cuda"
114+
device = Reactant.XLA.default_device(client)
115+
device_properties = Reactant.XLA.device_properties(device)
116+
108117
target = triton.backends.compiler.GPUTarget(
109-
"cuda",
110-
parse(Int, Reactant.Compiler.cubinChip[][4:end]),
111-
Reactant.Compiler.cuWarpSize[],
118+
Reactant.XLA.platform_name(client),
119+
parse(Int, "$(device_properties.major)$(device_properties.minor)"),
120+
device_properties.warp_size,
112121
)
113122
backend = triton.compiler.make_backend(target)
114123
options = backend.parse_options(
115124
pydict(
116125
"num_warps" => num_warps,
117126
"num_stages" => num_stages,
127+
"num_ctas" => num_ctas,
118128
"extern_libs" => pytuple((pytuple(("libdevice", Reactant_jll.libdevice)),)),
119129
),
120130
)
@@ -123,8 +133,8 @@ function overlayed_pycall_with_triton(
123133
# we are compiling here + lowering again inside enzymejax
124134
ccinfo = triton.compile(src; target=target, options=options.__dict__)
125135

126-
grid = normalize_grid_and_blocks(grid, ccinfo.metadata)
127-
blocks = normalize_grid_and_blocks(blocks, ccinfo.metadata)
136+
grid = normalize_grid_and_blocks(grid, ccinfo.metadata, device_properties)
137+
blocks = normalize_grid_and_blocks(blocks, ccinfo.metadata, device_properties)
128138

129139
return @opcall triton_call(
130140
pyconvert(String, ccinfo.asm["source"]),
@@ -136,5 +146,8 @@ function overlayed_pycall_with_triton(
136146
block_x=@opcall(constant(blocks[1])),
137147
block_y=@opcall(constant(blocks[2])),
138148
block_z=@opcall(constant(blocks[3])),
149+
# The following are written to module attributes and restored later on
150+
num_ctas,
151+
num_warps,
139152
)
140153
end

src/Compiler.jl

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,7 @@ function optimization_passes(
703703
max_constant_threshold::Int=1024,
704704
backend::String="gpu",
705705
enable_triton_passes::Bool=false,
706+
device_properties::Union{Nothing,XLA.DeviceProperties}=nothing,
706707
)
707708
transform_passes_list = [
708709
"patterns=compare_op_canon<16>",
@@ -1302,14 +1303,20 @@ function optimization_passes(
13021303
end
13031304
push!(passes, func_passes)
13041305
if enable_triton_passes && backend == "cuda"
1305-
push!(passes, triton_optimization_passes())
1306+
push!(passes, triton_optimization_passes(device_properties))
13061307
end
13071308
return join(passes, ',')
13081309
end
13091310

13101311
# https://github.com/triton-lang/triton/blob/8ee584014e9570ba608809c42dc2060fdd214a98/python/src/passes.cc
13111312
# To get the latest passes run triton with MLIR_ENABLE_DUMP=1 and then extract the passes
1312-
function triton_optimization_passes()
1313+
function triton_optimization_passes(device_properties)
1314+
@assert device_properties !== nothing "Device properties must be provided to run \
1315+
triton passes. This might happen if you are \
1316+
compiling a triton kernel for non-cuda backend."
1317+
major_version = device_properties.major
1318+
minor_version = device_properties.minor
1319+
13131320
all_passes = join(
13141321
[
13151322
"canonicalize",
@@ -1320,7 +1327,9 @@ function triton_optimization_passes()
13201327
"cse",
13211328
"symbol-dce",
13221329
"triton-loop-unroll",
1323-
"convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
1330+
"preserve-triton-warps-ctas{save=true restore=false}",
1331+
"convert-triton-to-tritongpu{target=cuda:$(major_version)$(minor_version)}",
1332+
"preserve-triton-warps-ctas{save=false restore=true}",
13241333
"tritongpu-coalesce",
13251334
"tritongpu-F32DotTC",
13261335
"triton-nvidia-gpu-plan-cta",
@@ -1740,6 +1749,9 @@ function compile_mlir!(
17401749

17411750
toolkit = XLA.CUDA_DATA_DIR[]
17421751

1752+
default_device = XLA.default_device(client)
1753+
device_properties = XLA.device_properties(default_device)
1754+
17431755
if backend == "cpu" || backend == "tpu"
17441756
kern = "lower-kernel{backend=cpu},canonicalize"
17451757
if backend == "tpu"
@@ -1754,9 +1766,7 @@ function compile_mlir!(
17541766
"lower-kernel,canonicalize"
17551767
end
17561768

1757-
device_properties = XLA.device_properties(XLA.default_device(client))
17581769
cubinChip = "sm_$(device_properties.major)$(device_properties.minor)"
1759-
17601770
if DEBUG_KERNEL[]
17611771
curesulthandler = dlsym(
17621772
Reactant_jll.libReactantExtra_handle, "ReactantHandleCuResult"
@@ -1787,6 +1797,7 @@ function compile_mlir!(
17871797
lower_comms,
17881798
backend,
17891799
enable_triton_passes=false,
1800+
device_properties,
17901801
)
17911802
opt_passes2 = optimization_passes(
17921803
compile_options;
@@ -1795,6 +1806,7 @@ function compile_mlir!(
17951806
lower_comms,
17961807
backend,
17971808
enable_triton_passes=false,
1809+
device_properties,
17981810
)
17991811
opt_passes_with_triton = optimization_passes(
18001812
compile_options;
@@ -1803,6 +1815,7 @@ function compile_mlir!(
18031815
lower_comms,
18041816
backend,
18051817
enable_triton_passes=true,
1818+
device_properties,
18061819
)
18071820

18081821
raise_passes = if raise isa String
@@ -1824,6 +1837,7 @@ function compile_mlir!(
18241837
recognize_comms,
18251838
lower_comms,
18261839
backend,
1840+
device_properties,
18271841
)
18281842
result = result * "," * opt_passes_dus_to_concat
18291843
end
@@ -2148,6 +2162,7 @@ function compile_mlir!(
21482162
recognize_comms,
21492163
lower_comms,
21502164
backend,
2165+
device_properties,
21512166
),
21522167
"post_op_transpose_reshape",
21532168
)

src/Ops.jl

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1837,7 +1837,7 @@ function _extract_function(
18371837
error("hlo_call: could not find function $func_name in the provided module")
18381838
end
18391839

1840-
return fn, symref
1840+
return fn, symref, moduleop
18411841
end
18421842

18431843
function triton_call(
@@ -1850,9 +1850,16 @@ function triton_call(
18501850
block_x::TracedRNumber{<:Integer},
18511851
block_y::TracedRNumber{<:Integer},
18521852
block_z::TracedRNumber{<:Integer},
1853+
num_ctas::Integer=1,
1854+
num_warps::Integer=4,
18531855
location=mlir_stacktrace("triton_call", @__FILE__, @__LINE__),
18541856
)
1855-
_, symref = _extract_function(mlir_code; func_name, func_op_kind="tt.func", location)
1857+
_, symref, modop = _extract_function(
1858+
mlir_code; func_name, func_op_kind="tt.func", location
1859+
)
1860+
1861+
MLIR.IR.attr!(modop, "ttg.num-warps", MLIR.IR.Attribute(Int32(num_warps)))
1862+
MLIR.IR.attr!(modop, "ttg.num-ctas", MLIR.IR.Attribute(Int32(num_ctas)))
18561863

18571864
result_types = MLIR.IR.Type[]
18581865
output_operand_aliases = MLIR.IR.Attribute[]
@@ -1929,7 +1936,7 @@ julia> Reactant.@jit(
19291936
func_name="main",
19301937
location=mlir_stacktrace("hlo_call", @__FILE__, @__LINE__),
19311938
)
1932-
fn, symref = _extract_function(code; func_name, func_op_kind="func.func", location)
1939+
fn, symref, _ = _extract_function(code; func_name, func_op_kind="func.func", location)
19331940

19341941
ftype_attr = MLIR.IR.attr(fn, "function_type")
19351942
ftype = MLIR.IR.Type(ftype_attr)

0 commit comments

Comments
 (0)