+import triton
+import triton.language as tl
+
+
+@triton.jit
+def softmax_kernel(
+    output_ptr,
+    input_ptr,
+    input_row_stride,
+    output_row_stride,
+    n_rows,
+    n_cols,
+    BLOCK_SIZE: tl.constexpr,
+    num_stages: tl.constexpr,
+):
+    # Starting row of the program
+    row_start = tl.program_id(0)
+    row_step = tl.num_programs(0)
+    for row_idx in tl.range(row_start, n_rows, row_step, num_stages=num_stages):
+        # The stride represents how much we need to increase the pointer to advance 1 row
+        row_start_ptr = input_ptr + row_idx * input_row_stride
+        # The block size is the smallest power of two greater than or equal to n_cols,
+        # so we can fit each row in a single block
+        col_offsets = tl.arange(0, BLOCK_SIZE)
+        input_ptrs = row_start_ptr + col_offsets
+        # Load the row into SRAM, using a mask since BLOCK_SIZE may be greater than n_cols
+        mask = col_offsets < n_cols
+        row = tl.load(input_ptrs, mask=mask, other=-float("inf"))
+        # Subtract the maximum for numerical stability
+        row_minus_max = row - tl.max(row, axis=0)
+        # Note that exponentiation in Triton is fast but approximate (i.e., think __expf in CUDA)
+        numerator = tl.exp(row_minus_max)
+        denominator = tl.sum(numerator, axis=0)
+        softmax_output = numerator / denominator
+        # Write back output to DRAM
+        output_row_start_ptr = output_ptr + row_idx * output_row_stride
+        output_ptrs = output_row_start_ptr + col_offsets
+        tl.store(output_ptrs, softmax_output, mask=mask)
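For context, here is a minimal host-side wrapper sketch (not part of the diff) showing how the kernel above might be launched from PyTorch. The helper name softmax, the one-program-per-row grid, and the num_stages value of 4 are assumptions for illustration; the Triton tutorial this kernel comes from instead sizes a persistent grid from device occupancy.

# Hypothetical host-side wrapper; illustrative sketch only, not from the diff.
import torch
import triton

def softmax(x: torch.Tensor) -> torch.Tensor:
    n_rows, n_cols = x.shape
    # BLOCK_SIZE must be a power of two >= n_cols so one block covers a full row.
    BLOCK_SIZE = triton.next_power_of_2(n_cols)
    y = torch.empty_like(x)
    # Simplest grid choice: one program per row, so the kernel's row loop runs once
    # per program. (A persistent, occupancy-sized grid is the tutorial's choice.)
    softmax_kernel[(n_rows,)](
        y, x,
        x.stride(0), y.stride(0),
        n_rows, n_cols,
        BLOCK_SIZE=BLOCK_SIZE,
        num_stages=4,  # binds the kernel's constexpr pipelining hint; 4 is an assumed value
    )
    return y

# Usage (assumes a CUDA-capable device):
# x = torch.randn(1823, 781, device="cuda")
# y = softmax(x)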