feat: correctly set strides + get n_regs

avik-pal · avik-pal · commit 954f2575639f · 2025-10-17T08:00:24.000-05:00
diff --git a/deps/ReactantExtra/API.cpp b/deps/ReactantExtra/API.cpp
@@ -816,6 +816,26 @@ REACTANT_ABI void ReactantCudaDeviceGetProperties(DeviceProperties *jlprops,
   jlprops->maxThreadsPerMultiProcessor = props.maxThreadsPerMultiProcessor;
 }
 
+// Loads the GPU image `binary`, looks up kernel `fnname` and writes its register
+// count, local-memory size in 4-byte words, and max threads per block.
+REACTANT_ABI void ReactantCudaGetRegsSpillsMaxThreadsFromBinary(
+    const char *binary, const char *fnname, int32_t *regs, int32_t *spills,
+    int32_t *maxThreads) {
+  CUfunction fun;
+  CUmodule mod;
+  ReactantHandleCuResult(cuModuleLoadData(&mod, binary));
+  ReactantHandleCuResult(cuModuleGetFunction(&fun, mod, fnname));
+  ReactantHandleCuResult(
+      cuFuncGetAttribute(regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
+  ReactantHandleCuResult(
+      cuFuncGetAttribute(spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
+  *spills /= 4; // bytes -> 4-byte words
+  ReactantHandleCuResult(cuFuncGetAttribute(
+      maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
+  // The module was only needed for the queries above; unload it so that
+  // repeated calls do not leak one loaded module each.
+  ReactantHandleCuResult(cuModuleUnload(mod));
+}
+
 #else
 
 REACTANT_ABI int32_t ReactantCudaDriverGetVersion() { return 0; }
@@ -831,6 +851,10 @@ REACTANT_ABI int32_t ReactantCudaDeviceGetWarpSizeInThreads() { return 0; }
 REACTANT_ABI void ReactantCudaDeviceGetProperties(DeviceProperties *jlprops,
                                                   int32_t device_id) {}
 
+REACTANT_ABI void ReactantCudaGetRegsSpillsMaxThreadsFromBinary(
+    const char *binary, const char *fnname, int32_t *regs, int32_t *spills,
+    int32_t *maxThreads) { *regs = *spills = *maxThreads = 0; /* no CUDA */ }
+
 #endif
 
 REACTANT_ABI void *UnsafeBufferPointer(PjRtBuffer *buffer) {
diff --git a/deps/ReactantExtra/BUILD b/deps/ReactantExtra/BUILD
@@ -984,6 +984,7 @@ cc_library(
             "-Wl,-exported_symbol,_ReactantCudaDeviceGetComputeCapalilityMinor",
             "-Wl,-exported_symbol,_ReactantCudaDeviceGetWarpSizeInThreads",
             "-Wl,-exported_symbol,_ReactantCudaDeviceGetProperties",
+            "-Wl,-exported_symbol,_ReactantCudaGetRegsSpillsMaxThreadsFromBinary",
             "-Wl,-exported_symbol,_PjRtDeviceGetLocalDeviceId",
             "-Wl,-exported_symbol,_PjRtDeviceGetGlobalDeviceId",
             "-Wl,-exported_symbol,_PjRtDeviceGetLocalHardwareId",
diff --git a/ext/ReactantPythonCallExt/pycall.jl b/ext/ReactantPythonCallExt/pycall.jl
@@ -47,16 +47,25 @@ function overlayed_pycall_with_jax_tracing(f::Py, args...)
     return length(res) == 0 ? nothing : (length(res) == 1 ? res[1] : res)
 end
 
-function normalize_grid_and_blocks(grid_fn, metadata, device_properties)
-    return normalize_grid_and_blocks(
-        grid_fn(metadata, device_properties), metadata, device_properties
-    )
+struct TritonMetadata{CK,MD,DP}  # per-kernel info passed to `normalize_grid_and_blocks`
+    compiled_kernel::CK  # result of `triton.compile`
+    metadata::MD  # `compiled_kernel.metadata`
+    device_properties::DP  # forwarded unchanged from the caller
+    num_warps::Int  # caller's `num_warps` — presumably the compile option; TODO confirm
+    num_stages::Int  # caller's `num_stages` — TODO confirm origin
+    num_ctas::Int  # caller's `num_ctas` — TODO confirm origin
+    num_regs::Int  # CU_FUNC_ATTRIBUTE_NUM_REGS queried from the compiled cubin
+    num_spills::Int  # CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES of the kernel, divided by 4
+    max_num_threads::Int  # CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK of the kernel
 end
 
-function normalize_grid_and_blocks(grid::Integer, metadata, device_properties)
-    return normalize_grid_and_blocks((grid,), metadata, device_properties)
+function normalize_grid_and_blocks(grid_fn, metadata)  # untyped method: callable spec
+    return normalize_grid_and_blocks(grid_fn(metadata), metadata)  # normalize its result
+end
+function normalize_grid_and_blocks(grid::Integer, metadata)  # single integer extent
+    return normalize_grid_and_blocks((grid,), metadata)  # wrap in a 1-tuple, re-dispatch
 end
-function normalize_grid_and_blocks(grid::Dims{N}, metadata, device_properties) where {N}
+function normalize_grid_and_blocks(grid::Dims{N}, metadata) where {N}
     @assert N <= 3
     @assert all(grid .> 0)
     return (grid..., ntuple(_ -> 1, 3 - N)...)
@@ -131,15 +140,40 @@ function overlayed_pycall_with_triton(
 
     # Currently we are doing a double compilation here. can we do better?
     # we are compiling here + lowering again inside enzymejax
-    ccinfo = triton.compile(src; target=target, options=options.__dict__)
+    compiled_kernel = triton.compile(src; target=target, options=options.__dict__)
+
+    cubin = pyconvert(Vector{UInt8}, compiled_kernel.asm["cubin"])
+    fname = pyconvert(String, compiled_kernel.metadata.name)
+    n_regs, n_spills, n_max_threads = Ref{Int32}(), Ref{Int32}(), Ref{Int32}()
+    GC.@preserve cubin fname n_regs n_spills n_max_threads begin
+        @ccall Reactant.MLIR.API.mlir_c.ReactantCudaGetRegsSpillsMaxThreadsFromBinary(
+            cubin::Ptr{Cvoid},
+            fname::Cstring,
+            n_regs::Ptr{Int32},
+            n_spills::Ptr{Int32},
+            n_max_threads::Ptr{Int32},
+        )::Cvoid
+    end
+
+    metadata = TritonMetadata(
+        compiled_kernel,
+        compiled_kernel.metadata,
+        device_properties,
+        num_warps,
+        num_stages,
+        num_ctas,
+        Int(n_regs[]),
+        Int(n_spills[]),
+        Int(n_max_threads[]),
+    )
 
-    grid = normalize_grid_and_blocks(grid, ccinfo.metadata, device_properties)
-    blocks = normalize_grid_and_blocks(blocks, ccinfo.metadata, device_properties)
+    grid = normalize_grid_and_blocks(grid, metadata)
+    blocks = normalize_grid_and_blocks(blocks, metadata)
 
     return @opcall triton_call(
-        pyconvert(String, ccinfo.asm["source"]),
+        pyconvert(String, compiled_kernel.asm["source"]),
         filter(x -> x isa Reactant.TracedType, args)...;
-        func_name=pyconvert(String, ccinfo.metadata.name),
+        func_name=fname,
         grid_x=@opcall(constant(grid[1])),
         grid_y=@opcall(constant(grid[2])),
         grid_z=@opcall(constant(grid[3])),
diff --git a/src/Reactant.jl b/src/Reactant.jl
@@ -236,6 +236,35 @@ include("stdlibs/Base.jl")
 # Other Integrations
 include("Enzyme.jl")
 
+"""
+    rowmajor_strides(x::AbstractArray)
+    rowmajor_strides(sz::NTuple{N,Int})
+
+Return the strides of an array of size `sz` (or `size(x)`) stored in row-major order,
+i.e. dimension `d` gets stride `prod(sz[(d + 1):end])` and the last dimension gets 1.
+"""
+function rowmajor_strides(x::AbstractArray)
+    return rowmajor_strides(size(x))
+end
+function rowmajor_strides(sz::NTuple{N,Int}) where {N}
+    return ntuple(d -> prod(sz[(d + 1):N]; init=1), N)
+end
+
+"""
+    rowmajor_stride(x::AbstractArray, i::Integer)
+    rowmajor_stride(sz::NTuple{N,Int}, i::Integer)
+
+Return the stride along dimension `i` of an array of size `sz` (or `size(x)`) stored in
+row-major order, i.e. the product of the extents of all dimensions after `i`. The
+upper end of `i` is not range-checked: any `i >= N` yields 1.
+"""
+function rowmajor_stride(x::AbstractArray, i::Integer)
+    return rowmajor_stride(size(x), i)
+end
+function rowmajor_stride(sz::NTuple{N,Int}, i::Integer) where {N}
+    return mapreduce(j -> sz[j], *, (i + 1):N; init=1)
+end
+
 export StackedBatchDuplicated, StackedBatchDuplicatedNoNeed
 
 const TracedType = Union{TracedRArray,TracedRNumber,MissingTracedValue}