Skip to content

Commit f81c38a

Browse files
committed
fix: correct launch configuration
1 parent b34a9a0 commit f81c38a

File tree

6 files changed

+52
-17
lines changed

6 files changed

+52
-17
lines changed

ext/ReactantPythonCallExt/pycall.jl

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ struct TritonMetadata{CK,MD,DP}
5959
max_num_threads::Int
6060
end
6161

62-
normalize_grid(grid_fn, metadata) = normalize_grid(grid_fn(metadata), metadata)
63-
normalize_grid(grid::Integer, metadata) = normalize_grid((grid,), metadata)
64-
function normalize_grid(grid::Dims{N}, metadata) where {N}
62+
canonicalize_grid(grid_fn, metadata) = canonicalize_grid(grid_fn(metadata), metadata)
63+
canonicalize_grid(grid::Integer, metadata) = canonicalize_grid((grid,), metadata)
64+
function canonicalize_grid(grid::Dims{N}, metadata) where {N}
6565
@assert N <= 3
6666
@assert all(grid .> 0)
6767
return (grid..., ntuple(_ -> 1, 3 - N)...)
@@ -82,6 +82,7 @@ function overlayed_pycall_with_triton(
8282
num_ctas::Integer=1,
8383
hints=nothing,
8484
)
85+
@assert num_ctas == 1 "TODO: num_ctas > 1 not supported"
8586
triton = tritonptr[]
8687

8788
mapped = map(signature_string, args)
@@ -163,7 +164,7 @@ function overlayed_pycall_with_triton(
163164
Int(n_max_threads[]),
164165
)
165166

166-
grid = normalize_grid(grid, metadata)
167+
grid = canonicalize_grid(grid, metadata)
167168

168169
return @opcall triton_call(
169170
pyconvert(String, compiled_kernel.asm["source"]),
@@ -177,5 +178,7 @@ function overlayed_pycall_with_triton(
177178
block_z=@opcall(constant(1)),
178179
num_ctas,
179180
num_warps,
181+
threads_per_warp=device_properties.warp_size,
182+
enable_source_remat=false,
180183
)
181184
end

src/CompileOptions.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ function CompileOptions(;
230230
:just_batch,
231231
:none,
232232
:no_triton,
233+
:before_triton_lowering,
233234
]
234235
end
235236

src/Compiler.jl

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1330,9 +1330,7 @@ function triton_optimization_passes(device_properties)
13301330
"cse",
13311331
"symbol-dce",
13321332
"triton-loop-unroll",
1333-
"preserve-triton-warps-ctas{save=true restore=false}",
1334-
"convert-triton-to-tritongpu{target=cuda:$(major_version)$(minor_version)}",
1335-
"preserve-triton-warps-ctas{save=false restore=true}",
1333+
"convert-triton-to-triton-gpu-preserving-module-attributes{target=cuda:$(major_version)$(minor_version)}",
13361334
"tritongpu-coalesce",
13371335
"tritongpu-F32DotTC",
13381336
"triton-nvidia-gpu-plan-cta",
@@ -1933,6 +1931,31 @@ function compile_mlir!(
19331931
),
19341932
"no_triton",
19351933
)
1934+
elseif compile_options.optimization_passes === :before_triton_lowering
1935+
run_pass_pipeline!(
1936+
mod,
1937+
join(
1938+
if compile_options.raise_first
1939+
["mark-func-memory-effects", opt_passes]
1940+
else
1941+
[
1942+
"mark-func-memory-effects",
1943+
opt_passes,
1944+
"enzyme-batch",
1945+
opt_passes2,
1946+
enzyme_pass,
1947+
opt_passes_with_triton,
1948+
"canonicalize",
1949+
"remove-unnecessary-enzyme-ops",
1950+
"enzyme-simplify-math",
1951+
legalize_chlo_to_stablehlo...,
1952+
opt_passes2,
1953+
]
1954+
end,
1955+
',',
1956+
),
1957+
"before_triton_lowering",
1958+
)
19361959
elseif compile_options.optimization_passes === :before_kernel
19371960
run_pass_pipeline!(
19381961
mod,

src/Ops.jl

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1852,14 +1852,22 @@ function triton_call(
18521852
block_z::TracedRNumber{<:Integer},
18531853
num_ctas::Integer=1,
18541854
num_warps::Integer=4,
1855+
threads_per_warp::Integer=32,
1856+
enable_source_remat::Bool=false,
18551857
location=mlir_stacktrace("triton_call", @__FILE__, @__LINE__),
18561858
)
18571859
_, symref, modop = _extract_function(
18581860
mlir_code; func_name, func_op_kind="tt.func", location
18591861
)
18601862

1861-
MLIR.IR.attr!(modop, "ttg.num-wraps", MLIR.IR.Attribute(Int32(num_warps)))
1862-
MLIR.IR.attr!(modop, "ttg.num-ctas", MLIR.IR.Attribute(Int32(num_ctas)))
1863+
MLIR.IR.attr!(modop, "enzymexla.ttg.num-warps", MLIR.IR.Attribute(Int32(num_warps)))
1864+
MLIR.IR.attr!(modop, "enzymexla.ttg.num-ctas", MLIR.IR.Attribute(Int32(num_ctas)))
1865+
MLIR.IR.attr!(
1866+
modop, "enzymexla.ttg.threads-per-warp", MLIR.IR.Attribute(Int32(threads_per_warp))
1867+
)
1868+
if enable_source_remat
1869+
MLIR.IR.attr!(modop, "enzymexla.ttg.enable-source-remat", MLIR.IR.UnitAttribute())
1870+
end
18631871

18641872
result_types = MLIR.IR.Type[]
18651873
output_operand_aliases = MLIR.IR.Attribute[]

test/integration/triton/layer_norm.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,19 +53,19 @@ end
5353

5454
@testset "fused_layer_norm" begin
5555
if RunningOnCUDA
56-
x_ra = Reactant.to_rarray(rand(Float32, 256, 2056))
57-
weight_ra = Reactant.to_rarray(rand(Float32, 256))
58-
bias_ra = Reactant.to_rarray(rand(Float32, 256))
56+
x_ra = Reactant.to_rarray(rand(Float32, 257, 2056))
57+
weight_ra = Reactant.to_rarray(rand(Float32, 257))
58+
bias_ra = Reactant.to_rarray(rand(Float32, 257))
5959

6060
y_ra1, mean_ra1, rstd_ra1 = @jit layer_norm_triton(x_ra, weight_ra, bias_ra, false)
6161
y_ra2, mean_ra2, rstd_ra2 = @jit layer_norm_naive(x_ra, weight_ra, bias_ra)
6262
y_ra3, mean_ra3, rstd_ra3 = @jit layer_norm_triton(x_ra, weight_ra, bias_ra, true)
6363

64-
@test_broken y_ra1 ≈ y_ra2
65-
@test_broken y_ra2 ≈ y_ra3
66-
@test_broken mean_ra1 ≈ mean_ra2
64+
@test y_ra1 ≈ y_ra2
65+
@test y_ra2 ≈ y_ra3
66+
@test mean_ra1 ≈ mean_ra2
6767
@test mean_ra2 ≈ mean_ra3
68-
@test_broken rstd_ra1 ≈ rstd_ra2
68+
@test rstd_ra1 ≈ rstd_ra2
6969
@test rstd_ra2 ≈ rstd_ra3
7070
end
7171
end

test/runtests.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ const REACTANT_TEST_GROUP = lowercase(get(ENV, "REACTANT_TEST_GROUP", "all"))
6666
@safetestset "low_memory_dropout" include(
6767
"integration/triton/low_memory_dropout.jl"
6868
)
69-
@safetestset "layer norm" include("integration/triton/layer_norm.jl") # XXX
69+
@safetestset "layer norm" include("integration/triton/layer_norm.jl")
7070
# @safetestset "attention" include("integration/triton/attention.jl")
7171
@safetestset "libdevice" include("integration/triton/libdevice.jl")
7272
# @safetestset "grouped gemm" include("integration/triton/grouped_gemm.jl")

0 commit comments

Comments
 (0)