test: layer_norm + libdevice

avik-pal · avik-pal · commit 5fec1e040201 · 2025-10-16T22:11:39.000-05:00
diff --git a/test/integration/triton/layer_norm.jl b/test/integration/triton/layer_norm.jl
@@ -0,0 +1,67 @@
+using PythonCall, Reactant, Test
+
+# Put the directory containing this file on Python's module search path so the
+# sibling `layer_norm.py` can be imported below.
+pyimport("sys").path.append(@__DIR__)
+
+# Handle to the Triton kernel `layer_norm_fwd_fused` defined in `layer_norm.py`.
+layer_norm_kernel = pyimport("layer_norm").layer_norm_fwd_fused
+
+# Gate for the test set at the bottom of this file. It has to be defined here:
+# the file is included through `@safetestset`, so names defined by other test
+# files (e.g. `libdevice.jl`) are not visible and the gate would otherwise
+# raise `UndefVarError`.
+const RunningOnCUDA = contains(string(Reactant.devices()[1]), "CUDA")
+
+"""
+    layer_norm_triton(x, weight, bias)
+
+Run the fused Triton layer-norm forward kernel (`layer_norm_kernel`) on `x`.
+
+`x` is permuted to `(2, 1)` before the launch so that its layout matches what the
+Python kernel expects, and the normalized output is permuted back before being
+returned. One kernel program is launched per row of the permuted array.
+
+# Returns
+A tuple `(y, mean, rstd)`: the normalized output in the layout of `x`, and the
+`Float32` per-row mean and reciprocal standard deviation buffers filled by the
+kernel.
+
+# Throws
+- `ArgumentError` if the row length exceeds the largest block the wrapper is
+  willing to launch (64KB worth of elements).
+"""
+function layer_norm_triton(
+    x::AbstractMatrix{T}, weight::AbstractVector{T}, bias::AbstractVector{T}
+) where {T}
+    xt = permutedims(x, (2, 1)) # match python array layout
+    n_rows, n_cols = size(xt)
+
+    # Output buffers: `out` mirrors the permuted input, statistics are Float32.
+    out = similar(xt)
+    row_mean = similar(xt, Float32, n_rows)
+    row_rstd = similar(xt, Float32, n_rows)
+
+    # A single block must cover a whole row, capped at 64KB worth of elements.
+    # NOTE(review): assumes `sizeof(T)` is the per-element byte size; confirm that
+    # this still holds for the element type seen while tracing under `@jit`.
+    block_size = min(65536 ÷ sizeof(T), nextpow(2, n_cols))
+    n_cols <= block_size ||
+        throw(ArgumentError("This layer norm doesn't support feature dim >= 64KB."))
+
+    # One warp per 256 block elements, kept within 1..8.
+    num_warps = clamp(block_size ÷ 256, 1, 8)
+
+    layer_norm_kernel(
+        xt,
+        out,
+        weight,
+        bias,
+        row_mean,
+        row_rstd,
+        Reactant.rowmajor_stride(xt, 1),
+        n_cols,
+        1.0f-5,
+        block_size;
+        num_warps=num_warps,
+        num_ctas=1,
+        grid=(n_rows,),
+        blocks=(block_size,),
+    )
+
+    return permutedims(out, (2, 1)), row_mean, row_rstd
+end
+
+"""
+    layer_norm_naive(x, weight, bias)
+
+Reference layer normalization written with plain broadcasts and reductions.
+
+Statistics are reduced over the first dimension of `x`; the variance is the
+biased one (divided by `size(x, 1)`). `weight` and `bias` are broadcast along the
+first dimension.
+
+# Returns
+A tuple `(y, mean, rstd)` where `y` has the shape of `x` and `mean` / `rstd` are
+vectors with one entry per column of `x`.
+"""
+function layer_norm_naive(
+    x::AbstractMatrix{T}, weight::AbstractVector{T}, bias::AbstractVector{T}
+) where {T}
+    # Float32 literal on purpose: it is the same epsilon that `layer_norm_triton`
+    # hands to the kernel (`1.0f-5`), and unlike the Float64 literal `1e-5` it
+    # does not promote a Float32 computation to Float64.
+    epsilon = 1.0f-5
+    n_features = size(x, 1)
+
+    mean = sum(x; dims=1) ./ n_features
+    centered = x .- mean
+    rstd = 1 ./ sqrt.(sum(abs2, centered; dims=1) ./ n_features .+ epsilon)
+    x_hat = centered .* rstd
+    return x_hat .* weight .+ bias, vec(mean), vec(rstd)
+end
+
+# Compare the Triton-backed layer norm against the plain-Julia reference on
+# random Float32 data. The body only runs when `RunningOnCUDA` is true.
+@testset "fused_layer_norm" begin
+    if RunningOnCUDA
+        n_features, n_samples = 256, 2056
+
+        x_ra = Reactant.to_rarray(rand(Float32, n_features, n_samples))
+        weight_ra = Reactant.to_rarray(rand(Float32, n_features))
+        bias_ra = Reactant.to_rarray(rand(Float32, n_features))
+
+        # Each call returns `(y, mean, rstd)`.
+        triton_out = @jit layer_norm_triton(x_ra, weight_ra, bias_ra)
+        naive_out = @jit layer_norm_naive(x_ra, weight_ra, bias_ra)
+
+        @test triton_out[1] ≈ naive_out[1]
+        @test triton_out[2] ≈ naive_out[2]
+        @test triton_out[3] ≈ naive_out[3]
+    end
+end
diff --git a/test/integration/triton/layer_norm.py b/test/integration/triton/layer_norm.py
@@ -0,0 +1,52 @@
+import triton
+import triton.language as tl
+
+
+# Fused layer-norm forward kernel.
+#
+# Each program instance handles one row (selected by `tl.program_id(0)`) and makes
+# three passes over that row in chunks of BLOCK_SIZE columns:
+#   1. accumulate the sum of the elements and derive the mean,
+#   2. accumulate the squared deviations and derive the biased variance and 1/std,
+#   3. normalize, apply the affine transform `x_hat * w + b`, and store the result.
+# The per-row mean and 1/std are also written to `Mean[row]` and `Rstd[row]`.
+# Loads of X are converted to float32 and the accumulators are float32.
+@triton.jit
+def layer_norm_fwd_fused(
+    X,  # pointer to the input
+    Y,  # pointer to the output
+    W,  # pointer to the weights
+    B,  # pointer to the biases
+    Mean,  # pointer to the mean
+    Rstd,  # pointer to the 1/std
+    stride,  # how much to increase the pointer when moving by 1 row
+    N,  # number of columns in X
+    eps,  # epsilon to avoid division by zero
+    BLOCK_SIZE: tl.constexpr,
+):
+    # Map the program id to the row of X and Y it should compute.
+    row = tl.program_id(0)
+    # Advance both base pointers to the start of this program's row.
+    Y += row * stride
+    X += row * stride
+    # Compute mean
+    # `mean` is initialised here and reassigned after the reduction loop below.
+    mean = 0
+    _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        # Columns past N are masked out and read as 0.0, so they do not affect the sum.
+        a = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
+        _mean += a
+    mean = tl.sum(_mean, axis=0) / N
+    # Compute variance
+    _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
+        # Out-of-range lanes must contribute 0, not (0 - mean), to the variance.
+        x = tl.where(cols < N, x - mean, 0.0)
+        _var += x * x
+    # Biased variance: the sum of squared deviations is divided by N.
+    var = tl.sum(_var, axis=0) / N
+    rstd = 1 / tl.sqrt(var + eps)
+    # Write mean / rstd
+    tl.store(Mean + row, mean)
+    tl.store(Rstd + row, rstd)
+    # Normalize and apply linear transformation
+    for off in range(0, N, BLOCK_SIZE):
+        cols = off + tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        # W and B are loaded without an `other` value; the same mask guards the
+        # store below, so masked-out lanes are never written to Y.
+        w = tl.load(W + cols, mask=mask)
+        b = tl.load(B + cols, mask=mask)
+        x = tl.load(X + cols, mask=mask, other=0.0).to(tl.float32)
+        x_hat = (x - mean) * rstd
+        y = x_hat * w + b
+        # Write output
+        tl.store(Y + cols, y, mask=mask)
diff --git a/test/integration/triton/libdevice.jl b/test/integration/triton/libdevice.jl
@@ -0,0 +1,21 @@
+using PythonCall, Reactant, Test
+
+# Put the directory containing this file on Python's module search path so the
+# sibling `libdevice.py` can be imported below.
+pyimport("sys").path.append(@__DIR__)
+
+# Handle to the Triton kernel `asin_kernel` defined in `libdevice.py`.
+asin_kernel = pyimport("libdevice").asin_kernel
+
+# True when the string form of the first Reactant device mentions "CUDA"; the
+# test set below only runs its body in that case.
+const RunningOnCUDA = contains(string(Reactant.devices()[1]), "CUDA")
+
+# Apply the Triton `asin_kernel` to every element of `x` and return the result in
+# a freshly allocated array of the same type and length. The launch uses blocks of
+# 1024 elements and enough of them to cover `length(x)`.
+function asin_triton(x::AbstractVector{T}) where {T}
+    block_size = 1024
+    n = length(x)
+    result = similar(x)
+    asin_kernel(
+        x, result, n, block_size; grid=(cld(n, block_size),), blocks=(block_size,)
+    )
+    return result
+end
+
+# Check the Triton libdevice `asin` against a broadcast `asin` on random Float32
+# input. The body only runs when `RunningOnCUDA` is true.
+@testset "libdevice asin" begin
+    if RunningOnCUDA
+        x_ra = Reactant.to_rarray(rand(Float32, 2096))
+
+        triton_result = @jit(asin_triton(x_ra))
+        reference = @jit(asin.(x_ra))
+
+        @test triton_result ≈ reference
+    end
+end
diff --git a/test/integration/triton/libdevice.py b/test/integration/triton/libdevice.py
@@ -0,0 +1,19 @@
+import triton
+import triton.language as tl
+from triton.language.extra import libdevice
+
+
+@triton.jit
+def asin_kernel(
+    x_ptr,
+    y_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    x = libdevice.asin(x)
+    tl.store(y_ptr + offsets, x, mask=mask)
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -66,9 +66,9 @@ const REACTANT_TEST_GROUP = lowercase(get(ENV, "REACTANT_TEST_GROUP", "all"))
             @safetestset "low_memory_dropout" include(
                 "integration/triton/low_memory_dropout.jl"
             )
-            # @safetestset "layer norm" include("integration/triton/layer_norm.jl")
+            @safetestset "layer norm" include("integration/triton/layer_norm.jl")
             # @safetestset "attention" include("integration/triton/attention.jl")
-            # @safetestset "libdevice" include("integration/triton/libdevice.jl")
+            @safetestset "libdevice" include("integration/triton/libdevice.jl")
             # @safetestset "grouped gemm" include("integration/triton/grouped_gemm.jl")
             # @safetestset "persistant matmul" include(
             #     "integration/triton/persistant_matmul.jl"