Commit 7f0afd8

feat: put the tt func in a separate module and use symbol ref

1 parent: 5b37e06

3 files changed: +88 −79 lines changed

deps/ReactantExtra/WORKSPACE

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ NSYNC_COMMIT = "82b118aa7ace3132e517e2c467f8732978cf4023"
 
 NSYNC_SHA256 = ""
 
-ENZYMEXLA_COMMIT = "b59185c7586783a17d9486e682307ae89c713964"
+ENZYMEXLA_COMMIT = "52ae936cae8f7050adc26c4ed5e755200497dc86"
 
 ENZYMEXLA_SHA256 = ""
 
src/Compiler.jl

Lines changed: 37 additions & 38 deletions
@@ -1310,42 +1310,42 @@ function triton_optimization_passes()
             "convert-nvvm-to-llvm",
             # common passes
             "canonicalize",
-            # # ttir passes
-            # "triton-combine",
-            # "triton-reorder-broadcast",
-            # "triton-rewrite-tensor-pointer",
-            # "triton-rewrite-tensor-descriptor-to-pointer",
-            # "triton-loop-unroll",
-            # "triton-licm",
-            # "triton-loop-aware-cse",
-            # # TODO: should num-warps and num-ctas be set for each kernel?
-            # "convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
-            # # ttgir passes
-            # "tritongpu-coalesce",
-            # "tritongpu-optimize-thread-locality",
-            # "tritongpu-hoist-tmem-alloc",
-            # "tritongpu-assign-latencies",
-            # "tritongpu-pipeline",
-            # "tritongpu-schedule-loops",
-            # "tritongpu-automatic-warp-specialization",
-            # "tritongpu-prefetch",
-            # "tritongpu-accelerate-matmul",
-            # "tritongpu-reorder-instructions",
-            # "tritongpu-F32DotTC",
-            # "tritongpu-optimize-dot-operands",
-            # "tritongpu-remove-layout-conversions",
-            # "tritongpu-reduce-data-duplication",
-            # "tritongpu-hoist-tmem-alloc",
-            # "tritongpu-fuse-nested-loops",
-            # "tritongpu-rewrite-partition-dependencies",
-            # "tritongpu-partition-loops",
-            # "tritongpu-combine-tensor-select-and-if",
-            # # ttgir to llvm passes
-            # "tritongpu-allocate-warp-groups",
-            # "allocate-shared-memory",
-            # "tritongpu-global-scratch-memory-allocation",
-            # "tritongpu-optimize-accumulator-init",
-            # "tritongpu-coalesce-async-copy",
+            # ttir passes
+            "triton-combine",
+            "triton-reorder-broadcast",
+            "triton-rewrite-tensor-pointer",
+            "triton-rewrite-tensor-descriptor-to-pointer",
+            "triton-loop-unroll",
+            "triton-licm",
+            "triton-loop-aware-cse",
+            # TODO: should num-warps and num-ctas be set for each kernel?
+            "convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}",
+            # ttgir passes
+            "tritongpu-coalesce",
+            "tritongpu-optimize-thread-locality",
+            "tritongpu-hoist-tmem-alloc",
+            "tritongpu-assign-latencies",
+            "tritongpu-pipeline",
+            "tritongpu-schedule-loops",
+            "tritongpu-automatic-warp-specialization",
+            "tritongpu-prefetch",
+            "tritongpu-accelerate-matmul",
+            "tritongpu-reorder-instructions",
+            "tritongpu-F32DotTC",
+            "tritongpu-optimize-dot-operands",
+            "tritongpu-remove-layout-conversions",
+            "tritongpu-reduce-data-duplication",
+            "tritongpu-hoist-tmem-alloc",
+            "tritongpu-fuse-nested-loops",
+            "tritongpu-rewrite-partition-dependencies",
+            "tritongpu-partition-loops",
+            "tritongpu-combine-tensor-select-and-if",
+            # ttgir to llvm passes
+            "tritongpu-allocate-warp-groups",
+            "allocate-shared-memory",
+            "tritongpu-global-scratch-memory-allocation",
+            "tritongpu-optimize-accumulator-init",
+            "tritongpu-coalesce-async-copy",
         ],
         ",",
     )
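For reference, a minimal sketch (with hypothetical values, not taken from this commit) of how the interpolated convert-triton-to-tritongpu entry above expands at runtime:

# Hypothetical stand-ins for the Compiler.jl globals; the real values come from the detected GPU.
cubinChip = Ref("sm_80")
cuWarpSize = Ref(32)

# "sm_80"[4:end] == "80", so the option string interpolates to:
pass = "convert-triton-to-tritongpu{target=cuda:$(cubinChip[][4:end]) num-warps=1 threads-per-warp=$(cuWarpSize[]) num-ctas=1}"
# -> "convert-triton-to-tritongpu{target=cuda:80 num-warps=1 threads-per-warp=32 num-ctas=1}"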
@@ -2303,8 +2303,7 @@ function compile_mlir!(
         end
     end
 
-    # XXX: re-enable this pass
-    # run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
+    run_pass_pipeline!(mod, "mark-func-memory-effects", "mark-func-memory-effects")
 
     func_op = MLIR.API.mlirSymbolTableLookup(
         MLIR.IR.SymbolTable(MLIR.IR.Operation(mod)), fnname

src/Ops.jl

Lines changed: 50 additions & 40 deletions
@@ -1705,54 +1705,60 @@ function _extract_function(
     func_name::String="main",
     func_op_kind::String="func.func",
     nested_module::Bool=false,
+    location::MLIR.IR.Location=MLIR.IR.Location(),
 )
     module_suffix = string(hash(code); base=16)
-    name_to_call = _new_function_name(func_name, module_suffix)
+    name_to_call = func_name * "_call_" * module_suffix
+    mod_name = func_name * "_module_" * module_suffix
+    symbol_attr_name = String(MLIR.API.mlirSymbolTableGetSymbolAttributeName())
 
-    current_module = MLIR.IR.mmodule()
     if nested_module
-        new_module = MLIR.IR.Module()
-        push!(MLIR.IR.body(current_module), MLIR.IR.Operation(new_module, true))
-        current_module = new_module
-    end
-    top_level_block = MLIR.IR.body(current_module)
+        region = MLIR.IR.Region()
+        push!(region, MLIR.IR.Block())
+        moduleop = MLIR.Dialects.builtin.module_(;
+            location, bodyRegion=region, sym_name=mod_name
+        )
+        MLIR.IR.rmfromparent!(moduleop)
+        push!(MLIR.IR.body(MLIR.IR.mmodule()), moduleop) # insert into parent module
 
-    symbol_attr_name = String(MLIR.API.mlirSymbolTableGetSymbolAttributeName())
-    fn = MLIR.IR.lookup(
-        MLIR.IR.SymbolTable(MLIR.IR.Operation(current_module)), name_to_call
-    )
+        top_level_block = MLIR.IR.Block(
+            MLIR.API.mlirModuleGetBody(MLIR.API.mlirModuleFromOperation(moduleop)), false
+        )
+        fn = nothing
+    else
+        current_module = MLIR.IR.mmodule()
+        moduleop = MLIR.IR.Operation(current_module)
+        top_level_block = MLIR.IR.body(current_module)
+        fn = MLIR.IR.lookup(MLIR.IR.SymbolTable(moduleop), name_to_call)
+    end
 
     if isnothing(fn)
         new_mod = parse(MLIR.IR.Module, code)
         new_mod_op = MLIR.IR.Operation(new_mod)
         body = MLIR.IR.body(new_mod)
 
         operations = collect(MLIR.IR.OperationIterator(body))
-        for op in operations
-            if MLIR.IR.name(op) == func_op_kind
-                fn_name = String(MLIR.IR.attr(op, symbol_attr_name))
-                if fn_name == func_name
-                    fn = op
-                end
+        idx = Base.findfirst(op -> MLIR.IR.name(op) == func_op_kind, operations)
+        @assert idx !== nothing
+        op = operations[idx]
 
-                res = MLIR.IR.LogicalResult(
-                    MLIR.API.mlirSymbolTableReplaceAllSymbolUses(
-                        fn_name, name_to_call, new_mod_op
-                    ),
-                )
-                @assert res == MLIR.IR.success() "hlo_call: failed to rename $fn_name"
-
-                # Set function private
-                MLIR.IR.attr!(
-                    op,
-                    MLIR.API.mlirSymbolTableGetVisibilityAttributeName(),
-                    MLIR.IR.Attribute("private"),
-                )
-
-                # Change function name
-                MLIR.IR.attr!(op, symbol_attr_name, MLIR.IR.Attribute(name_to_call))
-            end
-        end
+        fn_name = String(MLIR.IR.attr(op, symbol_attr_name))
+        fn_name == func_name && (fn = op)
+
+        res = MLIR.IR.LogicalResult(
+            MLIR.API.mlirSymbolTableReplaceAllSymbolUses(fn_name, name_to_call, new_mod_op)
+        )
+        @assert res == MLIR.IR.success() "hlo_call: failed to rename $fn_name"
+
+        # Set function private
+        MLIR.IR.attr!(
+            op,
+            MLIR.API.mlirSymbolTableGetVisibilityAttributeName(),
+            MLIR.IR.Attribute("private"),
+        )
+
+        # Change function name
+        MLIR.IR.attr!(op, symbol_attr_name, MLIR.IR.Attribute(name_to_call))
 
         for op in operations
            MLIR.IR.rmfromparent!(op)
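A rough sketch of what this hunk produces (hypothetical names, not from this commit): with nested_module=true, the parsed tt.func is renamed, marked private, and moved into its own symbol-named builtin.module, which is then pushed into the parent module.

# Hypothetical inputs for illustration only.
func_name = "add_kernel"
module_suffix = string(hash("...kernel source..."); base=16)  # value depends on the code string
name_to_call = func_name * "_call_" * module_suffix            # "add_kernel_call_<suffix>"
mod_name = func_name * "_module_" * module_suffix              # "add_kernel_module_<suffix>"

# Resulting layout inside the traced module (schematic MLIR):
#   module {                                     # parent module (MLIR.IR.mmodule())
#     module @add_kernel_module_<suffix> {
#       tt.func private @add_kernel_call_<suffix>(...) { ... }
#     }
#     ...
#   }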
@@ -1764,7 +1770,7 @@ function _extract_function(
         error("hlo_call: could not find function $func_name in the provided module")
     end
 
-    return fn, name_to_call
+    return fn, name_to_call, mod_name
 end
 
 function triton_call(
@@ -1778,8 +1784,8 @@ function triton_call(
     location=mlir_stacktrace("triton_call", @__FILE__, @__LINE__),
     # TODO: other kwargs
 )
-    _, name_to_call = _extract_function(
-        mlir_code; func_name, func_op_kind="tt.func", nested_module=true
+    _, name_to_call, mod_name = _extract_function(
+        mlir_code; func_name, func_op_kind="tt.func", nested_module=true, location
     )
 
     enzymexla.triton_call(
@@ -1788,7 +1794,9 @@ function triton_call(
         grid_z.mlir_data,
         shmem.mlir_data,
         [Reactant.TracedUtils.get_mlir_data(a) for a in args];
-        fn=MLIR.IR.FlatSymbolRefAttribute(name_to_call),
+        fn=MLIR.IR.SymbolRefAttribute(
+            mod_name, MLIR.IR.Attribute[MLIR.IR.FlatSymbolRefAttribute(name_to_call)]
+        ),
         result_0=MLIR.IR.Type[],
         location,
     )
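A minimal sketch (hypothetical symbol names; constructing attributes requires an active MLIR context, e.g. during tracing) of the nested symbol reference the call now emits. A SymbolRefAttribute carrying one nested FlatSymbolRefAttribute renders as @<module sym_name>::@<function sym_name>, so enzymexla.triton_call resolves to the tt.func inside the nested module rather than to a top-level symbol:

mod_name = "add_kernel_module_<suffix>"    # hypothetical, as returned by _extract_function
name_to_call = "add_kernel_call_<suffix>"  # hypothetical
fn_ref = MLIR.IR.SymbolRefAttribute(
    mod_name, MLIR.IR.Attribute[MLIR.IR.FlatSymbolRefAttribute(name_to_call)]
)
# fn_ref prints roughly as: @add_kernel_module_<suffix>::@add_kernel_call_<suffix>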
@@ -1826,7 +1834,9 @@ julia> Reactant.@jit(
     func_name="main",
     location=mlir_stacktrace("hlo_call", @__FILE__, @__LINE__),
 )
-    fn, name_to_call = _extract_function(code; func_name, func_op_kind="func.func")
+    fn, name_to_call, _ = _extract_function(
+        code; func_name, func_op_kind="func.func", location
+    )
 
     ftype_attr = MLIR.IR.attr(fn, "function_type")
     ftype = MLIR.IR.Type(ftype_attr)
