From 3652aeb306a66f3971fa98b9eecd64ab17c56aaa Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:10:41 -0300 Subject: [PATCH 01/23] KernelIntrinsics --- Project.toml | 2 +- src/CUDAKernels.jl | 60 +++++++++++++++++++++++++++++++++------------- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index c0fb240200..5dba52ac6f 100644 --- a/Project.toml +++ b/Project.toml @@ -67,7 +67,7 @@ ExprTools = "0.1" GPUArrays = "11.2.4" GPUCompiler = "1.4" GPUToolbox = "0.3, 1" -KernelAbstractions = "0.9.38" +KernelAbstractions = "0.10" LLVM = "9.3.1" LLVMLoopInfo = "1" LazyArtifacts = "1" diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index 5a36ed5eaa..b6f89aed90 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -4,6 +4,7 @@ using ..CUDA using ..CUDA: @device_override, CUSPARSE, default_memory, UnifiedMemory import KernelAbstractions as KA +import KernelAbstractions: KernelIntrinsics as KI import StaticArrays import SparseArrays: AbstractSparseArray @@ -157,37 +158,61 @@ function (obj::KA.Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=n return nothing end + +function KI.KIKernel(::CUDABackend, f, args...; kwargs...) + kern = eval(quote + @cuda launch=false $(kwargs...) $(f)($(args...)) + end) + KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern) +end + +function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + threadsPerThreadgroup = isnothing(workgroupsize) ? 1 : workgroupsize + threadgroupsPerGrid = isnothing(numworkgroups) ? 1 : numworkgroups + + obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid) +end + + +function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int + Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items)) +end +function KI.max_work_group_size(::CUDABackend)::Int + Int(attribute(device(), DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)) +end +function KI.multiprocessor_count(::CUDABackend)::Int + Int(attribute(device(), DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)) +end + ## indexing ## COV_EXCL_START -@device_override @inline function KA.__index_Local_Linear(ctx) - return threadIdx().x +@device_override @inline function KI.get_local_id() + return (; x = Int(threadIdx().x), y = Int(threadIdx().y), z = Int(threadIdx().z)) end -@device_override @inline function KA.__index_Group_Linear(ctx) - return blockIdx().x +@device_override @inline function KI.get_group_id() + return (; x = Int(blockIdx().x), y = Int(blockIdx().y), z = Int(blockIdx().z)) end -@device_override @inline function KA.__index_Global_Linear(ctx) - I = @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x) - # TODO: This is unfortunate, can we get the linear index cheaper - @inbounds LinearIndices(KA.__ndrange(ctx))[I] +@device_override @inline function KI.get_global_id() + return (; x = Int(blockDim().x), y = Int(blockDim().y), z = Int(blockDim().z)) end -@device_override @inline function KA.__index_Local_Cartesian(ctx) - @inbounds KA.workitems(KA.__iterspace(ctx))[threadIdx().x] +@device_override @inline function KI.get_local_size() + return (; x = Int((blockDim().x-1)*blockDim().x + threadIdx().x), y = Int((blockDim().y-1)*blockDim().y + threadIdx().y), z = Int((blockDim().z-1)*blockDim().z + threadIdx().z)) end -@device_override @inline function KA.__index_Group_Cartesian(ctx) - @inbounds 
KA.blocks(KA.__iterspace(ctx))[blockIdx().x] +@device_override @inline function KI.get_num_grouups() + return (; x = Int(gridDim().x), y = Int(gridDim().y), z = Int(gridDim().z)) end -@device_override @inline function KA.__index_Global_Cartesian(ctx) - return @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x) +@device_override @inline function KI.get_global_size() + return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * gridDim().y), z = Int(lockDim().z * gridDim().z)) end -@device_override @inline function KA.__validindex(ctx) +@device_override @inline function KI.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x) return I in KA.__ndrange(ctx) @@ -198,7 +223,8 @@ end ## shared and scratch memory -@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +# @device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims} CuStaticSharedArray(T, Dims) end @@ -208,7 +234,7 @@ end ## synchronization and printing -@device_override @inline function KA.__synchronize() +@device_override @inline function KI.barrier() sync_threads() end From 05bdd5ca3b2a64162829f83f760f4b31bd8e5172 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:51:35 -0300 Subject: [PATCH 02/23] Tweak --- src/CUDAKernels.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index b6f89aed90..7f836d39fd 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -166,11 +166,11 @@ function KI.KIKernel(::CUDABackend, f, args...; kwargs...) KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern) end -function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing) +function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) threadsPerThreadgroup = isnothing(workgroupsize) ? 1 : workgroupsize threadgroupsPerGrid = isnothing(numworkgroups) ? 1 : numworkgroups - obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid) + obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid, kwargs...) 
end From b86ce952e4c2374d1bc10fd8c0f20c59a374c11b Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:55:16 -0300 Subject: [PATCH 03/23] dogfood [only tests] [only benchmarks] --- src/CUDA.jl | 1 + src/accumulate.jl | 26 +++++++++++++------------- src/device/random.jl | 8 +++----- src/indexing.jl | 2 +- src/mapreduce.jl | 40 ++++++++++++++++++++-------------------- 5 files changed, 38 insertions(+), 39 deletions(-) diff --git a/src/CUDA.jl b/src/CUDA.jl index 8a82201a0a..bbe06a0bba 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -3,6 +3,7 @@ module CUDA using GPUCompiler using GPUArrays +import KernelAbstractions: KernelIntrinsics as KI using GPUToolbox diff --git a/src/accumulate.jl b/src/accumulate.jl index 1ec21f20ea..051ecc11ef 100644 --- a/src/accumulate.jl +++ b/src/accumulate.jl @@ -15,16 +15,16 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArray, Rdim, Rpre, Rpost, Rother, neutral, init, ::Val{inclusive}=Val(true)) where {T, inclusive} - threads = blockDim().x - thread = threadIdx().x - block = blockIdx().x + threads = KI.get_local_size().x + thread = KI.get_local_id().x + block = KI.get_group_id().x temp = CuDynamicSharedArray(T, (2*threads,)) # iterate the main dimension using threads and the first block dimension - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x # iterate the other dimensions using the remaining block dimensions - j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y + j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y if j > length(Rother) return @@ -47,7 +47,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr offset = 1 d = threads>>1 while d > 0 - sync_threads() + KI.barrier() @inbounds if thread <= d ai = offset * (2*thread-1) bi = offset * (2*thread) @@ -66,7 +66,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr d = 1 while d < threads offset >>= 1 - sync_threads() + KI.barrier() @inbounds if thread <= d ai = offset * (2*thread-1) bi = offset * (2*thread) @@ -78,7 +78,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr d *= 2 end - sync_threads() + KI.barrier() # write results to device memory @inbounds if i <= length(Rdim) @@ -100,14 +100,14 @@ end function aggregate_partial_scan(op::Function, output::AbstractArray, aggregates::AbstractArray, Rdim, Rpre, Rpost, Rother, init) - threads = blockDim().x - thread = threadIdx().x - block = blockIdx().x + threads = KI.get_local_size().x + thread = KI.get_local_id().x + block = KI.get_group_id().x # iterate the main dimension using threads and the first block dimension - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x # iterate the other dimensions using the remaining block dimensions - j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y + j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y @inbounds if i <= length(Rdim) && j <= length(Rother) I = Rother[j] diff --git a/src/device/random.jl b/src/device/random.jl index d776bf886d..0cbedef6ff 100644 --- a/src/device/random.jl +++ b/src/device/random.jl @@ -63,8 +63,6 @@ end @inline Philox2x32() = Philox2x32{7}() @inline function Base.getproperty(rng::Philox2x32, field::Symbol) - threadId = threadIdx().x + (threadIdx().y - 1i32) * 
blockDim().x + - (threadIdx().z - 1i32) * blockDim().x * blockDim().y warpId = (threadId - 1i32) >> 0x5 + 1i32 # fld1 if field === :key @@ -72,9 +70,9 @@ end elseif field === :ctr1 @inbounds global_random_counters()[warpId] elseif field === :ctr2 - blockId = blockIdx().x + (blockIdx().y - 1i32) * gridDim().x + - (blockIdx().z - 1i32) * gridDim().x * gridDim().y - globalId = threadId + (blockId - 1i32) * (blockDim().x * blockDim().y * blockDim().z) + globalId = KI.get_global_id().x + + (KI.get_global_id().y - 1i32) * KI.get_global_size().x + + (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y globalId%UInt32 end::UInt32 end diff --git a/src/indexing.jl b/src/indexing.jl index b958dc02ec..f0fab5711c 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -33,7 +33,7 @@ function Base.findall(bools::AnyCuArray{Bool}) if n > 0 ## COV_EXCL_START function kernel(ys::CuDeviceArray, bools, indices) - i = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x + i = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x @inbounds if i <= length(bools) && bools[i] i′ = CartesianIndices(bools)[i] diff --git a/src/mapreduce.jl b/src/mapreduce.jl index d796b5dae1..c7ec3cf21f 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -19,9 +19,9 @@ end @inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T # shared mem for partial sums assume(warpsize() == 32) - shared = CuStaticSharedArray(T, 32) + shared = KI.localmemory(T, 32) - wid, lane = fldmod1(threadIdx().x, warpsize()) + wid, lane = fldmod1(KI.get_local_id().x, warpsize()) # each warp performs partial reduction val = reduce_warp(op, val) @@ -32,10 +32,10 @@ end end # wait for all partial reductions - sync_threads() + KI.barrier() # read from shared memory only if that warp existed - val = if threadIdx().x <= fld1(blockDim().x, warpsize()) + val = if KI.get_local_id().x <= fld1(KI.get_local_size().x, warpsize()) @inbounds shared[lane] else neutral @@ -49,8 +49,8 @@ end return val end @inline function reduce_block(op, val::T, neutral, shuffle::Val{false}) where T - threads = blockDim().x - thread = threadIdx().x + threads = KI.get_local_size().x + thread = KI.get_local_id().x # shared mem for a complete reduction shared = CuDynamicSharedArray(T, (threads,)) @@ -59,7 +59,7 @@ end # perform a reduction d = 1 while d < threads - sync_threads() + KI.barrier() index = 2 * d * (thread-1) + 1 @inbounds if index <= threads other_val = if index + d <= threads @@ -92,10 +92,10 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs # decompose the 1D hardware indices into separate ones for reduction (across threads # and possibly blocks if it doesn't fit) and other elements (remaining blocks) - threadIdx_reduce = threadIdx().x - blockDim_reduce = blockDim().x - blockIdx_reduce, blockIdx_other = fldmod1(blockIdx().x, length(Rother)) - gridDim_reduce = gridDim().x ÷ length(Rother) + threadIdx_reduce = KI.get_local_id().x + blockDim_reduce = KI.get_local_size().x + blockIdx_reduce, blockIdx_other = fldmod1(KI.get_group_id().x, length(Rother)) + gridDim_reduce = KI.get_num_groups().x ÷ length(Rother) # block-based indexing into the values outside of the reduction dimension # (that means we can safely synchronize threads within this block) @@ -134,7 +134,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs end function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As) - grid_idx = threadIdx().x + (blockIdx().x - 1i32) * 
blockDim().x + grid_idx = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x @inbounds if grid_idx <= length(Rother) Iother = Rother[grid_idx] @@ -160,14 +160,14 @@ end # factored out for use in tests function serial_mapreduce_threshold(dev) - max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) * - attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) + max_concurrency = KI.max_work_group_size(CUDABackend()) * KI.multiprocessor_count(CUDABackend()) return max_concurrency end function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, A::Union{AbstractArray,Broadcast.Broadcasted}; init=nothing) where {F, OP, T} + backend = CUDABackend() if !isa(A, Broadcast.Broadcasted) # XXX: Base.axes isn't defined anymore for Broadcasted, breaking this check Base.check_reducedims(R, A) @@ -201,11 +201,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # If `Rother` is large enough, then a naive loop is more efficient than partial reductions. if length(Rother) >= serial_mapreduce_threshold(dev) args = (f, op, init, Rreduce, Rother, R, A) - kernel = @cuda launch=false serial_mapreduce_kernel(args...) + kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...) kernel_config = launch_configuration(kernel.fun) threads = kernel_config.threads blocks = cld(length(Rother), threads) - kernel(args...; threads, blocks) + kernel(args...; workgroupsize=threads, numworkgroups=blocks) return R end @@ -228,9 +228,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # we might not be able to launch all those threads to reduce each slice in one go. # that's why each threads also loops across their inputs, processing multiple values # so that we can span the entire reduction dimension using a single thread block. - kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A) + kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A) compute_shmem(threads) = shuffle ? 
0 : threads*sizeof(T) - kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads) + kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads) reduce_threads = compute_threads(kernel_config.threads) reduce_shmem = compute_shmem(reduce_threads) @@ -255,7 +255,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # perform the actual reduction if reduce_blocks == 1 # we can cover the dimensions to reduce using a single block - kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem) + kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; ; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem) else # TODO: provide a version that atomically reduces from different blocks @@ -286,7 +286,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, end partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A; - threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem) + ; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) GPUArrays.mapreducedim!(identity, op, R, partial; init) end From 70944da4801b4742db4c5285a99c9f7c68a1c8d7 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:48:00 -0300 Subject: [PATCH 04/23] CI --- .buildkite/pipeline.yml | 813 ++++++++++++++++---------------- test/base/kernelabstractions.jl | 8 +- 2 files changed, 419 insertions(+), 402 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a21abe8026..ae6aea7a2d 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -7,9 +7,6 @@ steps: plugins: - JuliaCI/julia#v1: version: "{{matrix.julia}}" - - JuliaCI/julia-test#v1: - test_args: "--quickfail" - allow_reresolve: false - JuliaCI/julia-coverage#v1: dirs: - src @@ -20,6 +17,18 @@ steps: cuda: "*" commands: | echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml + + julia -e 'println("--- :julia: Developing KernelAbstractions") + using Pkg + Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' + + julia -e 'println("--- :julia: Instantiating project") + using Pkg + Pkg.develop(; path=pwd())' || exit 3 + + julia -e 'println("+++ :julia: Running tests") + using Pkg + Pkg.test("CUDA"; coverage=true)' if: | build.message =~ /\[only tests\]/ || build.message =~ /\[only julia\]/ || @@ -39,428 +48,434 @@ steps: julia: "nightly" soft_fail: true - # then, test supported CUDA toolkits (installed through the artifact system) - - group: "CUDA" - key: "cuda" - depends_on: "julia" - steps: - - label: "CUDA {{matrix.cuda}}" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--quickfail core base libraries" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only cuda\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip cuda\]/ - timeout_in_minutes: 60 - matrix: - setup: - cuda: - - "13.0" - - "12.9" - - "12.8" - - "12.6" - - "12.5" - - "12.4" - - "12.3" - - "12.2" - - "12.1" - - "12.0" - adjustments: - - with: - cuda: "13.0" - soft_fail: true - commands: | - echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml + # # then, test supported CUDA toolkits (installed through the 
artifact system) + # - group: "CUDA" + # key: "cuda" + # depends_on: "julia" + # steps: + # - label: "CUDA {{matrix.cuda}}" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail core base libraries" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only cuda\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip cuda\]/ + # timeout_in_minutes: 60 + # matrix: + # setup: + # cuda: + # - "13.0" + # - "12.9" + # - "12.8" + # - "12.6" + # - "12.5" + # - "12.4" + # - "12.3" + # - "12.2" + # - "12.1" + # - "12.0" + # adjustments: + # - with: + # cuda: "13.0" + # soft_fail: true + # commands: | + # echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml - - group: ":nesting_dolls: Subpackages" - depends_on: "cuda" - steps: - - label: "{{matrix.package}} on CUDA {{matrix.cuda}}" - matrix: - setup: - cuda: - - "12.0" - - "13.0" - package: - - "cuDNN" - - "cuTENSOR" - - "cuStateVec" - - "cuTensorNet" - adjustments: - - with: - package: "cuStateVec" - cuda: "12.0" - soft_fail: true - - with: - package: "cuTensorNet" - cuda: "12.0" - soft_fail: true - - with: - package: "cuStateVec" - cuda: "13.0" - soft_fail: true - - with: - package: "cuTensorNet" - cuda: "13.0" - soft_fail: true - plugins: - - JuliaCI/julia#v1: - version: "1.10" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only subpackages\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip subpackages\]/ - timeout_in_minutes: 30 - commands: | - julia -e ' - using Pkg + # - group: ":nesting_dolls: Subpackages" + # depends_on: "cuda" + # steps: + # - label: "{{matrix.package}} on CUDA {{matrix.cuda}}" + # matrix: + # setup: + # cuda: + # - "12.0" + # - "13.0" + # package: + # - "cuDNN" + # - "cuTENSOR" + # - "cuStateVec" + # - "cuTensorNet" + # adjustments: + # - with: + # package: "cuStateVec" + # cuda: "12.0" + # soft_fail: true + # - with: + # package: "cuTensorNet" + # cuda: "12.0" + # soft_fail: true + # - with: + # package: "cuStateVec" + # cuda: "13.0" + # soft_fail: true + # - with: + # package: "cuTensorNet" + # cuda: "13.0" + # soft_fail: true + # plugins: + # - JuliaCI/julia#v1: + # version: "1.10" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only subpackages\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip subpackages\]/ + # timeout_in_minutes: 30 + # commands: | + # julia -e ' + # using Pkg - println("--- :julia: Instantiating project") - withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do - Pkg.activate(joinpath("lib", lowercase("{{matrix.package}}"))) - try - Pkg.instantiate() - catch - # if we fail to instantiate, assume that we need newer dependencies - deps = [PackageSpec(path=".")] - if "{{matrix.package}}" == "cuTensorNet" - push!(deps, PackageSpec(path="lib/cutensor")) - end - Pkg.develop(deps) - end + # println("--- :julia: Instantiating project") + # 
withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do + # Pkg.activate(joinpath("lib", lowercase("{{matrix.package}}"))) + # try + # Pkg.instantiate() + # catch + # # if we fail to instantiate, assume that we need newer dependencies + # deps = [PackageSpec(path=".")] + # if "{{matrix.package}}" == "cuTensorNet" + # push!(deps, PackageSpec(path="lib/cutensor")) + # end + # Pkg.develop(deps) + # end - Pkg.add("CUDA_Runtime_jll") - write(joinpath("lib", lowercase("{{matrix.package}}"), "LocalPreferences.toml"), - "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"") - end + # Pkg.add("CUDA_Runtime_jll") + # write(joinpath("lib", lowercase("{{matrix.package}}"), "LocalPreferences.toml"), + # "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"") + # end - println("+++ :julia: Running tests") - Pkg.test(; coverage=true)' + # println("+++ :julia: Running tests") + # Pkg.test(; coverage=true)' - - group: ":telescope: Downstream" - depends_on: "cuda" - steps: - #- label: "NNlib.jl" - # plugins: - # - JuliaCI/julia#v1: - # version: "1.11" - # - JuliaCI/julia-coverage#v1: - # dirs: - # - src - # - lib - # - examples - # command: | - # julia --project -e ' - # using Pkg - # - # cuda = pwd() - # cudnn = joinpath(cuda, "lib", "cudnn") - # devdir = mktempdir() - # nnlib = joinpath(devdir, "NNlib") - # - # println("--- :julia: Installing TestEnv") - # Pkg.activate(; temp=true) - # Pkg.add("TestEnv") - # using TestEnv - # - # println("--- :julia: Installing NNlib") - # withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0, - # "JULIA_PKG_DEVDIR" => devdir) do - # Pkg.develop("NNlib") - # Pkg.activate(nnlib) - # - # try - # Pkg.develop([PackageSpec(path=cuda), PackageSpec(path=cudnn)]) - # TestEnv.activate() - # catch err - # @error "Could not install NNlib" exception=(err,catch_backtrace()) - # exit(3) - # finally - # Pkg.activate(nnlib) - # end - # end - # - # println("+++ :julia: Running tests") - # Pkg.test(; coverage=true)' - # env: - # NNLIB_TEST_CUDA: "true" - # NNLIB_TEST_CPU: "false" - # agents: - # queue: "juliagpu" - # cuda: "*" - # if: | - # build.message =~ /\[only tests\]/ || - # build.message =~ /\[only downstream\]/ || - # build.message !~ /\[only/ && !build.pull_request.draft && - # build.message !~ /\[skip tests\]/ && - # build.message !~ /\[skip downstream\]/ - # timeout_in_minutes: 30 - # soft_fail: - # - exit_status: 3 - - label: "Enzyme.jl" - plugins: - - JuliaCI/julia#v1: - version: "1.10" # XXX: Enzyme.jl is broken on 1.11 - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - command: | - julia -e ' - using Pkg + # - group: ":telescope: Downstream" + # depends_on: "cuda" + # steps: + # #- label: "NNlib.jl" + # # plugins: + # # - JuliaCI/julia#v1: + # # version: "1.11" + # # - JuliaCI/julia-coverage#v1: + # # dirs: + # # - src + # # - lib + # # - examples + # # command: | + # # julia --project -e ' + # # using Pkg + # # + # # cuda = pwd() + # # cudnn = joinpath(cuda, "lib", "cudnn") + # # devdir = mktempdir() + # # nnlib = joinpath(devdir, "NNlib") + # # + # # println("--- :julia: Installing TestEnv") + # # Pkg.activate(; temp=true) + # # Pkg.add("TestEnv") + # # using TestEnv + # # + # # println("--- :julia: Installing NNlib") + # # withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0, + # # "JULIA_PKG_DEVDIR" => devdir) do + # # Pkg.develop("NNlib") + # # Pkg.activate(nnlib) + # # + # # try + # # Pkg.develop([PackageSpec(path=cuda), PackageSpec(path=cudnn)]) + # # TestEnv.activate() + # # catch err + # # @error "Could not install NNlib" exception=(err,catch_backtrace()) + # # exit(3) + # # finally + # 
# Pkg.activate(nnlib) + # # end + # # end + # # + # # println("+++ :julia: Running tests") + # # Pkg.test(; coverage=true)' + # # env: + # # NNLIB_TEST_CUDA: "true" + # # NNLIB_TEST_CPU: "false" + # # agents: + # # queue: "juliagpu" + # # cuda: "*" + # # if: | + # # build.message =~ /\[only tests\]/ || + # # build.message =~ /\[only downstream\]/ || + # # build.message !~ /\[only/ && !build.pull_request.draft && + # # build.message !~ /\[skip tests\]/ && + # # build.message !~ /\[skip downstream\]/ + # # timeout_in_minutes: 30 + # # soft_fail: + # # - exit_status: 3 + # - label: "Enzyme.jl" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.10" # XXX: Enzyme.jl is broken on 1.11 + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # command: | + # julia -e ' + # using Pkg - println("--- :julia: Instantiating project") - withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do - # add Enzyme to the test deps - Pkg.activate("test") - Pkg.add(["Enzyme", "EnzymeCore"]) + # println("--- :julia: Instantiating project") + # withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do + # # add Enzyme to the test deps + # Pkg.activate("test") + # Pkg.add(["Enzyme", "EnzymeCore"]) - # to check compatibility, also add Enzyme to the main environment - # (or Pkg.test, which merges both environments, could fail) - Pkg.activate(".") - # Try to co-develop Enzyme and KA, if that fails, try just to dev Enzyme - try - Pkg.develop([PackageSpec("Enzyme"), PackageSpec("KernelAbstractions")]) - catch err - try - Pkg.develop([PackageSpec("Enzyme")]) - catch err - @error "Could not install Enzyme" exception=(err,catch_backtrace()) - exit(3) - end - end - end + # # to check compatibility, also add Enzyme to the main environment + # # (or Pkg.test, which merges both environments, could fail) + # Pkg.activate(".") + # # Try to co-develop Enzyme and KA, if that fails, try just to dev Enzyme + # try + # Pkg.develop([PackageSpec("Enzyme"), PackageSpec("KernelAbstractions")]) + # catch err + # try + # Pkg.develop([PackageSpec("Enzyme")]) + # catch err + # @error "Could not install Enzyme" exception=(err,catch_backtrace()) + # exit(3) + # end + # end + # end - println("+++ :julia: Running tests") - Pkg.test(; coverage=true, test_args=`extensions/enzyme`)' - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only downstream\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip downstream\]/ - timeout_in_minutes: 60 - soft_fail: true + # println("+++ :julia: Running tests") + # Pkg.test(; coverage=true, test_args=`extensions/enzyme`)' + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only downstream\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip downstream\]/ + # timeout_in_minutes: 60 + # soft_fail: true - - group: ":eyes: Special" - depends_on: "cuda" - steps: - - label: "GPU-less environment" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - - JuliaCI/julia-test#v1: - run_tests: false - command: | - julia --project -e ' - using CUDA - @assert !CUDA.functional() - @assert !isdefined(CUDA, :libcudart) - CUDA.set_runtime_version!(v"11.6")' - julia --project -e ' - using CUDA - @assert !CUDA.functional() - @assert isdefined(CUDA, :libcudart)' - agents: - 
queue: "juliagpu" - intel: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 5 + # - group: ":eyes: Special" + # depends_on: "cuda" + # steps: + # - label: "GPU-less environment" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # - JuliaCI/julia-test#v1: + # run_tests: false + # command: | + # julia --project -e ' + # using CUDA + # @assert !CUDA.functional() + # @assert !isdefined(CUDA, :libcudart) + # CUDA.set_runtime_version!(v"11.6")' + # julia --project -e ' + # using CUDA + # @assert !CUDA.functional() + # @assert isdefined(CUDA, :libcudart)' + # agents: + # queue: "juliagpu" + # intel: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 5 - - label: "Compute sanitizer" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--sanitize core base" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 60 + # - label: "Compute sanitizer" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--sanitize core base" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 60 - - label: "Legacy memory allocator" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--quickfail core base" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - env: - JULIA_CUDA_MEMORY_POOL: 'none' - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 30 + # - label: "Legacy memory allocator" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail core base" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # env: + # JULIA_CUDA_MEMORY_POOL: 'none' + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 30 - - label: "CuArray with {{matrix.memory}} memory" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--quickfail core base libraries" - - 
JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 45 - matrix: - setup: - memory: - - "unified" - - "host" - commands: | - echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml + # - label: "CuArray with {{matrix.memory}} memory" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail core base libraries" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 45 + # matrix: + # setup: + # memory: + # - "unified" + # - "host" + # commands: | + # echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml - - label: "MultiGPU" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--gpu=0,1 --quickfail core base libraries" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - multigpu: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 45 + # - label: "MultiGPU" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--gpu=0,1 --quickfail core base libraries" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # multigpu: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 45 - - label: ":older_man: Old dependencies" - plugins: - - JuliaCI/julia#v1: - version: "1.10" # use the oldest supported Julia version (and update below) - - JuliaCI/julia-test#v1: - test_args: "--quickfail core base" - allow_reresolve: false - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - commands: | - git clone https://github.com/StefanKarpinski/Resolver.jl /tmp/Resolver - julia -e 'using Pkg; Pkg.activate("/tmp/Resolver/bin"); Pkg.instantiate()' - julia /tmp/Resolver/bin/resolve.jl . 
--min=@alldeps --julia="1.10" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 30 + # - label: ":older_man: Old dependencies" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.10" # use the oldest supported Julia version (and update below) + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail core base" + # allow_reresolve: false + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # commands: | + # git clone https://github.com/StefanKarpinski/Resolver.jl /tmp/Resolver + # julia -e 'using Pkg; Pkg.activate("/tmp/Resolver/bin"); Pkg.instantiate()' + # julia /tmp/Resolver/bin/resolve.jl . --min=@alldeps --julia="1.10" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 30 - wait: ~ continue_on_failure: true - - label: ":books: Documentation" + # - label: ":books: Documentation" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # command: | + # julia --project -e ' + # println("--- :julia: Instantiating project") + # using Pkg + # Pkg.instantiate() + # Pkg.activate("docs") + # Pkg.instantiate() + # push!(LOAD_PATH, @__DIR__) + + # println("+++ :julia: Building documentation") + # include("docs/make.jl")' + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only docs\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip docs\]/ + # timeout_in_minutes: 15 + + - label: ":racehorse: Benchmarks" plugins: - JuliaCI/julia#v1: version: "1.11" command: | julia --project -e ' - println("--- :julia: Instantiating project") using Pkg - Pkg.instantiate() - Pkg.activate("docs") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) - println("+++ :julia: Building documentation") - include("docs/make.jl")' - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only docs\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip docs\]/ - timeout_in_minutes: 15 + println("--- :julia: Developing KernelAbstractions") + Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' - - label: ":racehorse: Benchmarks" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - command: | julia --project=perf -e ' using Pkg diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl index 2cb607ee3e..2f2c4300b5 100644 --- a/test/base/kernelabstractions.jl +++ b/test/base/kernelabstractions.jl @@ -4,7 +4,9 @@ using SparseArrays include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl")) -Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray) +Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([ + "CPU synchronization", + "fallback test: callable types",])) for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false)) Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray) end @@ -16,7 +18,7 @@ end @testset "CUDA Backend Adapt Tests" begin # CPU → GPU A = sprand(Float32, 10, 10, 0.5) #CSC - A_d = 
adapt(CUDABackend(), A) + A_d = adapt(CUDABackend(), A) @test A_d isa CUSPARSE.CuSparseMatrixCSC @test adapt(CUDABackend(), A_d) |> typeof == typeof(A_d) @@ -24,5 +26,5 @@ end B_d = A |> cu # CuCSC B = adapt(KA.CPU(), A_d) @test B isa SparseMatrixCSC - @test adapt(KA.CPU(), B) |> typeof == typeof(B) + @test adapt(KA.CPU(), B) |> typeof == typeof(B) end From 0d49f177272c546b074ae94b5ea32afcafcef1f9 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:27:10 -0300 Subject: [PATCH 05/23] Apply suggestion from @christiangnrd --- src/mapreduce.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index c7ec3cf21f..b32a87aaff 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -286,7 +286,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, end partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A; - ; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) + workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) GPUArrays.mapreducedim!(identity, op, R, partial; init) end From 762e591e7f17d7d9d66039e70849e546cfab2552 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:27:49 -0300 Subject: [PATCH 06/23] Apply suggestion from @christiangnrd --- src/mapreduce.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index b32a87aaff..8f75fcce59 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -255,7 +255,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # perform the actual reduction if reduce_blocks == 1 # we can cover the dimensions to reduce using a single block - kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; ; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem) + kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem) else # TODO: provide a version that atomically reduces from different blocks From e133fbdd8f65e09e58c23ad7654bc0c9c98a0727 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:36:35 -0300 Subject: [PATCH 07/23] Apply suggestion from @christiangnrd --- src/CUDAKernels.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index 7f836d39fd..abc7dbe906 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -204,7 +204,7 @@ end return (; x = Int((blockDim().x-1)*blockDim().x + threadIdx().x), y = Int((blockDim().y-1)*blockDim().y + threadIdx().y), z = Int((blockDim().z-1)*blockDim().z + threadIdx().z)) end -@device_override @inline function KI.get_num_grouups() +@device_override @inline function KI.get_num_groups() return (; x = Int(gridDim().x), y = Int(gridDim().y), z = Int(gridDim().z)) end From f3413babd7b70f9ef2e72c7d8aa1df51b880afa2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:22:49 -0300 Subject: [PATCH 08/23] Fix --- src/CUDAKernels.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index abc7dbe906..c99ca7e723 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -212,7 +212,7 @@ end return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * 
gridDim().y), z = Int(lockDim().z * gridDim().z)) end -@device_override @inline function KI.__validindex(ctx) +@device_override @inline function KA.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x) return I in KA.__ndrange(ctx) From 0474a67eaa0f79184a5d64bc584d936955dbea5a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:22:54 -0300 Subject: [PATCH 09/23] shtgbr --- .buildkite/pipeline.yml | 88 ++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ae6aea7a2d..f7cc0f5ef0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,52 +1,52 @@ steps: # first, test supported Julia versions (using local CUDA as installed on the system) - - group: ":julia: Julia" - key: "julia" - steps: - - label: "Julia {{matrix.julia}}" - plugins: - - JuliaCI/julia#v1: - version: "{{matrix.julia}}" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - commands: | - echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml + # - group: ":julia: Julia" + # key: "julia" + # steps: + # - label: "Julia {{matrix.julia}}" + # plugins: + # - JuliaCI/julia#v1: + # version: "{{matrix.julia}}" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # commands: | + # echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml - julia -e 'println("--- :julia: Developing KernelAbstractions") - using Pkg - Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' + # julia -e 'println("--- :julia: Developing KernelAbstractions") + # using Pkg + # Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' - julia -e 'println("--- :julia: Instantiating project") - using Pkg - Pkg.develop(; path=pwd())' || exit 3 + # julia -e 'println("--- :julia: Instantiating project") + # using Pkg + # Pkg.develop(; path=pwd())' || exit 3 - julia -e 'println("+++ :julia: Running tests") - using Pkg - Pkg.test("CUDA"; coverage=true)' - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only julia\]/ || - build.message !~ /\[only/ && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip julia\]/ - timeout_in_minutes: 90 - matrix: - setup: - julia: - - "1.10" - - "1.11" - - "1.12" - - "nightly" - adjustments: - - with: - julia: "nightly" - soft_fail: true + # julia -e 'println("+++ :julia: Running tests") + # using Pkg + # Pkg.test("CUDA"; coverage=true)' + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only julia\]/ || + # build.message !~ /\[only/ && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip julia\]/ + # timeout_in_minutes: 90 + # matrix: + # setup: + # julia: + # - "1.10" + # - "1.11" + # - "1.12" + # - "nightly" + # adjustments: + # - with: + # julia: "nightly" + # soft_fail: true # # then, test supported CUDA toolkits (installed through the artifact system) # - group: "CUDA" From 0c1ea747e13e6c357299e16627cd9ad662e42e29 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:32:51 -0300 Subject: [PATCH 10/23] Fix --- src/device/random.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/device/random.jl 
b/src/device/random.jl index 0cbedef6ff..7d72d90a1a 100644 --- a/src/device/random.jl +++ b/src/device/random.jl @@ -63,6 +63,8 @@ end @inline Philox2x32() = Philox2x32{7}() @inline function Base.getproperty(rng::Philox2x32, field::Symbol) + threadId = threadIdx().x + (threadIdx().y - 1i32) * blockDim().x + + (threadIdx().z - 1i32) * blockDim().x * blockDim().y warpId = (threadId - 1i32) >> 0x5 + 1i32 # fld1 if field === :key From 14f1c275a4e07122993ecdaf7ac54dbc3c50b0cc Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:40:49 -0300 Subject: [PATCH 11/23] another fix --- src/CUDAKernels.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index c99ca7e723..b9e6cc94f8 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -209,7 +209,7 @@ end end @device_override @inline function KI.get_global_size() - return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * gridDim().y), z = Int(lockDim().z * gridDim().z)) + return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * gridDim().y), z = Int(blockDim().z * gridDim().z)) end @device_override @inline function KA.__validindex(ctx) From 0a3e029fdeebc54020e6cb429b5b4db1d0c7791a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:01:03 -0300 Subject: [PATCH 12/23] fix --- src/CUDAKernels.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index b9e6cc94f8..fb5079373b 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -197,11 +197,11 @@ end end @device_override @inline function KI.get_global_id() - return (; x = Int(blockDim().x), y = Int(blockDim().y), z = Int(blockDim().z)) + return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z)) end @device_override @inline function KI.get_local_size() - return (; x = Int((blockDim().x-1)*blockDim().x + threadIdx().x), y = Int((blockDim().y-1)*blockDim().y + threadIdx().y), z = Int((blockDim().z-1)*blockDim().z + threadIdx().z)) + return (; x = Int(blockDim().x), y = Int(blockDim().y), z = Int(blockDim().z)) end @device_override @inline function KI.get_num_groups() From d652d0be6fb8bb612533e9bcd0ff055802b312d5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:18:12 -0300 Subject: [PATCH 13/23] dfgbsg --- src/CUDAKernels.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index fb5079373b..bf4c269023 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -178,10 +178,10 @@ function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDA Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items)) end function KI.max_work_group_size(::CUDABackend)::Int - Int(attribute(device(), DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)) + Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)) end function KI.multiprocessor_count(::CUDABackend)::Int - Int(attribute(device(), DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)) + Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)) end ## indexing From 53dfbf0f0b2bc82fc78bb8ada5fec208f51b610f Mon Sep 17 00:00:00 2001 From: Christian Guinard 
<28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:36:23 -0300 Subject: [PATCH 14/23] dgbsg --- src/mapreduce.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 8f75fcce59..f0f096b646 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -265,8 +265,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # NOTE: we can't use the previously-compiled kernel, or its launch configuration, # since the type of `partial` might not match the original output container # (e.g. if that was a view). - partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A) - partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads) + partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A) + partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads) partial_reduce_threads = compute_threads(partial_kernel_config.threads) partial_reduce_shmem = compute_shmem(partial_reduce_threads) partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks From 08baa0f28661adf8577d85105efa9aed325ea82c Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:34:36 -0300 Subject: [PATCH 15/23] Finally? --- src/mapreduce.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index f0f096b646..3af00be038 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -202,7 +202,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, if length(Rother) >= serial_mapreduce_threshold(dev) args = (f, op, init, Rreduce, Rother, R, A) kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...) - kernel_config = launch_configuration(kernel.fun) + kernel_config = launch_configuration(kernel.kern.fun) threads = kernel_config.threads blocks = cld(length(Rother), threads) kernel(args...; workgroupsize=threads, numworkgroups=blocks) @@ -255,7 +255,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # perform the actual reduction if reduce_blocks == 1 # we can cover the dimensions to reduce using a single block - kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem) + kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem) else # TODO: provide a version that atomically reduces from different blocks From fed145c9ed694ee105b03a0c8132255d775be321 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:59:15 -0300 Subject: [PATCH 16/23] Revert "shtgbr" This reverts commit 3fba9403abf18d842105f76124e511fad0fe4231. 
--- .buildkite/pipeline.yml | 88 ++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index f7cc0f5ef0..ae6aea7a2d 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,52 +1,52 @@ steps: # first, test supported Julia versions (using local CUDA as installed on the system) - # - group: ":julia: Julia" - # key: "julia" - # steps: - # - label: "Julia {{matrix.julia}}" - # plugins: - # - JuliaCI/julia#v1: - # version: "{{matrix.julia}}" - # - JuliaCI/julia-coverage#v1: - # dirs: - # - src - # - lib - # - examples - # agents: - # queue: "juliagpu" - # cuda: "*" - # commands: | - # echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml + - group: ":julia: Julia" + key: "julia" + steps: + - label: "Julia {{matrix.julia}}" + plugins: + - JuliaCI/julia#v1: + version: "{{matrix.julia}}" + - JuliaCI/julia-coverage#v1: + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + commands: | + echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml - # julia -e 'println("--- :julia: Developing KernelAbstractions") - # using Pkg - # Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' + julia -e 'println("--- :julia: Developing KernelAbstractions") + using Pkg + Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' - # julia -e 'println("--- :julia: Instantiating project") - # using Pkg - # Pkg.develop(; path=pwd())' || exit 3 + julia -e 'println("--- :julia: Instantiating project") + using Pkg + Pkg.develop(; path=pwd())' || exit 3 - # julia -e 'println("+++ :julia: Running tests") - # using Pkg - # Pkg.test("CUDA"; coverage=true)' - # if: | - # build.message =~ /\[only tests\]/ || - # build.message =~ /\[only julia\]/ || - # build.message !~ /\[only/ && - # build.message !~ /\[skip tests\]/ && - # build.message !~ /\[skip julia\]/ - # timeout_in_minutes: 90 - # matrix: - # setup: - # julia: - # - "1.10" - # - "1.11" - # - "1.12" - # - "nightly" - # adjustments: - # - with: - # julia: "nightly" - # soft_fail: true + julia -e 'println("+++ :julia: Running tests") + using Pkg + Pkg.test("CUDA"; coverage=true)' + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only julia\]/ || + build.message !~ /\[only/ && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip julia\]/ + timeout_in_minutes: 90 + matrix: + setup: + julia: + - "1.10" + - "1.11" + - "1.12" + - "nightly" + adjustments: + - with: + julia: "nightly" + soft_fail: true # # then, test supported CUDA toolkits (installed through the artifact system) # - group: "CUDA" From 46cbc05c0c3736b5f9c5d1013b33a3d6dc279256 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:21:41 -0300 Subject: [PATCH 17/23] Reapply "shtgbr" This reverts commit fc12cd16974cff27fdc4cd53aa335ed64052a2b5. 
--- .buildkite/pipeline.yml | 88 ++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ae6aea7a2d..f7cc0f5ef0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,52 +1,52 @@ steps: # first, test supported Julia versions (using local CUDA as installed on the system) - - group: ":julia: Julia" - key: "julia" - steps: - - label: "Julia {{matrix.julia}}" - plugins: - - JuliaCI/julia#v1: - version: "{{matrix.julia}}" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - commands: | - echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml + # - group: ":julia: Julia" + # key: "julia" + # steps: + # - label: "Julia {{matrix.julia}}" + # plugins: + # - JuliaCI/julia#v1: + # version: "{{matrix.julia}}" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # commands: | + # echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml - julia -e 'println("--- :julia: Developing KernelAbstractions") - using Pkg - Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' + # julia -e 'println("--- :julia: Developing KernelAbstractions") + # using Pkg + # Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' - julia -e 'println("--- :julia: Instantiating project") - using Pkg - Pkg.develop(; path=pwd())' || exit 3 + # julia -e 'println("--- :julia: Instantiating project") + # using Pkg + # Pkg.develop(; path=pwd())' || exit 3 - julia -e 'println("+++ :julia: Running tests") - using Pkg - Pkg.test("CUDA"; coverage=true)' - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only julia\]/ || - build.message !~ /\[only/ && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip julia\]/ - timeout_in_minutes: 90 - matrix: - setup: - julia: - - "1.10" - - "1.11" - - "1.12" - - "nightly" - adjustments: - - with: - julia: "nightly" - soft_fail: true + # julia -e 'println("+++ :julia: Running tests") + # using Pkg + # Pkg.test("CUDA"; coverage=true)' + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only julia\]/ || + # build.message !~ /\[only/ && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip julia\]/ + # timeout_in_minutes: 90 + # matrix: + # setup: + # julia: + # - "1.10" + # - "1.11" + # - "1.12" + # - "nightly" + # adjustments: + # - with: + # julia: "nightly" + # soft_fail: true # # then, test supported CUDA toolkits (installed through the artifact system) # - group: "CUDA" From d341f2d399a2f6a2a92d08a5870fa167d74edd15 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:26:59 -0300 Subject: [PATCH 18/23] Revert KIKernel --- src/mapreduce.jl | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 3af00be038..97a4176b41 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -201,11 +201,14 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # If `Rother` is large enough, then a naive loop is more efficient than partial reductions. if length(Rother) >= serial_mapreduce_threshold(dev) args = (f, op, init, Rreduce, Rother, R, A) - kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...) 
- kernel_config = launch_configuration(kernel.kern.fun) + # kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...) + kernel = @cuda launch=false serial_mapreduce_kernel(args...) + # kernel_config = launch_configuration(kernel.kern.fun) + kernel_config = launch_configuration(kernel.fun) threads = kernel_config.threads blocks = cld(length(Rother), threads) - kernel(args...; workgroupsize=threads, numworkgroups=blocks) + # kernel(args...; workgroupsize=threads, numworkgroups=blocks) + kernel(args...; threads, blocks) return R end @@ -228,9 +231,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # we might not be able to launch all those threads to reduce each slice in one go. # that's why each threads also loops across their inputs, processing multiple values # so that we can span the entire reduction dimension using a single thread block. - kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A) + # kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A) + kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A) compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T) - kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads) + # kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads) + kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads) reduce_threads = compute_threads(kernel_config.threads) reduce_shmem = compute_shmem(reduce_threads) @@ -255,7 +260,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # perform the actual reduction if reduce_blocks == 1 # we can cover the dimensions to reduce using a single block - kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem) + # kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem) + kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem) else # TODO: provide a version that atomically reduces from different blocks @@ -265,8 +271,10 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # NOTE: we can't use the previously-compiled kernel, or its launch configuration, # since the type of `partial` might not match the original output container # (e.g. if that was a view). 
- partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A) - partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads) + # partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A) + partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A) + # partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads) + partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads) partial_reduce_threads = compute_threads(partial_kernel_config.threads) partial_reduce_shmem = compute_shmem(partial_reduce_threads) partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks @@ -286,7 +294,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, end partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A; - workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) + threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem) + # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) GPUArrays.mapreducedim!(identity, op, R, partial; init) end From 6a2c51ba94c06cb95af7af2b5eecc7b3a62c5b3f Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:26:15 -0400 Subject: [PATCH 19/23] fix --- src/CUDAKernels.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index bf4c269023..0ef0a7d089 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -175,7 +175,9 @@ end function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int - Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items)) + kernel_config = launch_configuration(kikern.kern.fun) + + Int(min(kernel_config.threads, max_work_items)) end function KI.max_work_group_size(::CUDABackend)::Int Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)) From b63f875f099c811e752e5a1683dac615e8e94878 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 4 Nov 2025 13:26:20 -0400 Subject: [PATCH 20/23] New --- src/CUDAKernels.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index 0ef0a7d089..995728201d 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -1,7 +1,7 @@ module CUDAKernels using ..CUDA -using ..CUDA: @device_override, CUSPARSE, default_memory, UnifiedMemory +using ..CUDA: @device_override, CUSPARSE, default_memory, UnifiedMemory, cufunction, cudaconvert import KernelAbstractions as KA import KernelAbstractions: KernelIntrinsics as KI @@ -158,11 +158,10 @@ function (obj::KA.Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=n return nothing end +KI.kiconvert(::CUDABackend, arg) = cudaconvert(arg) -function KI.KIKernel(::CUDABackend, f, args...; kwargs...) - kern = eval(quote - @cuda launch=false $(kwargs...) $(f)($(args...)) - end) +function KI.kifunction(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + kern = cufunction(f, tt; name, kwargs...) 
KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern) end From f61fd57e6b41b30f01f6234a7d2f156f2d470d3d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 4 Aug 2025 16:54:37 -0300 Subject: [PATCH 21/23] Test GPUArrays `reverse` [only julia] [only benchmarks] --- perf/Project.toml | 1 + perf/runbenchmarks.jl | 2 + src/reverse.jl | 280 +++++++++++++++++++++--------------------- test/base/array.jl | 88 ++++++------- test/runtests.jl | 3 + 5 files changed, 190 insertions(+), 184 deletions(-) diff --git a/perf/Project.toml b/perf/Project.toml index 8314c7b285..9b028ce66a 100644 --- a/perf/Project.toml +++ b/perf/Project.toml @@ -2,5 +2,6 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 72d8b1e76a..21e9a96aac 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -1,4 +1,6 @@ # benchmark suite execution and codespeed submission +using Pkg +Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="reverse") using CUDA diff --git a/src/reverse.jl b/src/reverse.jl index d0c73da2ec..94914941f5 100644 --- a/src/reverse.jl +++ b/src/reverse.jl @@ -1,155 +1,155 @@ # reversing -# the kernel works by treating the array as 1d. after reversing by dimension x an element at -# pos [i1, i2, i3, ... , i{x}, ..., i{n}] will be at -# pos [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}] where d{x} is the size of dimension x - -# out-of-place version, copying a single value per thread from input to output -function _reverse(input::AnyCuArray{T, N}, output::AnyCuArray{T, N}; - dims=1:ndims(input)) where {T, N} - @assert size(input) == size(output) - rev_dims = ntuple((d)-> d in dims && size(input, d) > 1, N) - ref = size(input) .+ 1 - # converts an ND-index in the data array to the linear index - lin_idx = LinearIndices(input) - # converts a linear index in a reduced array to an ND-index, but using the reduced size - nd_idx = CartesianIndices(input) - - ## COV_EXCL_START - function kernel(input::AbstractArray{T, N}, output::AbstractArray{T, N}) where {T, N} - offset_in = blockDim().x * (blockIdx().x - 1i32) - index_in = offset_in + threadIdx().x - - @inbounds if index_in <= length(input) - idx = Tuple(nd_idx[index_in]) - idx = ifelse.(rev_dims, ref .- idx, idx) - index_out = lin_idx[idx...] - output[index_out] = input[index_in] - end - - return - end - ## COV_EXCL_STOP - - nthreads = 256 - nblocks = cld(length(input), nthreads) - - @cuda threads=nthreads blocks=nblocks kernel(input, output) -end - -# in-place version, swapping elements on half the number of threads -function _reverse!(data::AnyCuArray{T, N}; dims=1:ndims(data)) where {T, N} - rev_dims = ntuple((d)-> d in dims && size(data, d) > 1, N) - half_dim = findlast(rev_dims) - if isnothing(half_dim) - # no reverse operation needed at all in this case. 
- return - end - ref = size(data) .+ 1 - # converts an ND-index in the data array to the linear index - lin_idx = LinearIndices(data) - reduced_size = ntuple((d)->ifelse(d==half_dim, cld(size(data,d),2), size(data,d)), N) - reduced_length = prod(reduced_size) - # converts a linear index in a reduced array to an ND-index, but using the reduced size - nd_idx = CartesianIndices(reduced_size) - - ## COV_EXCL_START - function kernel(data::AbstractArray{T, N}) where {T, N} - offset_in = blockDim().x * (blockIdx().x - 1i32) - - index_in = offset_in + threadIdx().x - - @inbounds if index_in <= reduced_length - idx = Tuple(nd_idx[index_in]) - index_in = lin_idx[idx...] - idx = ifelse.(rev_dims, ref .- idx, idx) - index_out = lin_idx[idx...] - - if index_in < index_out - temp = data[index_out] - data[index_out] = data[index_in] - data[index_in] = temp - end - end - - return - end - ## COV_EXCL_STOP - - # NOTE: we launch slightly more than half the number of elements in the array as threads. - # The last non-singleton dimension along which to reverse is used to define how the array is split. - # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by - # ignoring the threads that cross the mid-point - - nthreads = 256 - nblocks = cld(prod(reduced_size), nthreads) - - @cuda threads=nthreads blocks=nblocks kernel(data) -end +# # the kernel works by treating the array as 1d. after reversing by dimension x an element at +# # pos [i1, i2, i3, ... , i{x}, ..., i{n}] will be at +# # pos [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}] where d{x} is the size of dimension x + +# # out-of-place version, copying a single value per thread from input to output +# function _reverse(input::AnyCuArray{T, N}, output::AnyCuArray{T, N}; +# dims=1:ndims(input)) where {T, N} +# @assert size(input) == size(output) +# rev_dims = ntuple((d)-> d in dims && size(input, d) > 1, N) +# ref = size(input) .+ 1 +# # converts an ND-index in the data array to the linear index +# lin_idx = LinearIndices(input) +# # converts a linear index in a reduced array to an ND-index, but using the reduced size +# nd_idx = CartesianIndices(input) + +# ## COV_EXCL_START +# function kernel(input::AbstractArray{T, N}, output::AbstractArray{T, N}) where {T, N} +# offset_in = blockDim().x * (blockIdx().x - 1i32) +# index_in = offset_in + threadIdx().x + +# @inbounds if index_in <= length(input) +# idx = Tuple(nd_idx[index_in]) +# idx = ifelse.(rev_dims, ref .- idx, idx) +# index_out = lin_idx[idx...] +# output[index_out] = input[index_in] +# end + +# return +# end +# ## COV_EXCL_STOP + +# nthreads = 256 +# nblocks = cld(length(input), nthreads) + +# @cuda threads=nthreads blocks=nblocks kernel(input, output) +# end + +# # in-place version, swapping elements on half the number of threads +# function _reverse!(data::AnyCuArray{T, N}; dims=1:ndims(data)) where {T, N} +# rev_dims = ntuple((d)-> d in dims && size(data, d) > 1, N) +# half_dim = findlast(rev_dims) +# if isnothing(half_dim) +# # no reverse operation needed at all in this case. 
+# return +# end +# ref = size(data) .+ 1 +# # converts an ND-index in the data array to the linear index +# lin_idx = LinearIndices(data) +# reduced_size = ntuple((d)->ifelse(d==half_dim, cld(size(data,d),2), size(data,d)), N) +# reduced_length = prod(reduced_size) +# # converts a linear index in a reduced array to an ND-index, but using the reduced size +# nd_idx = CartesianIndices(reduced_size) + +# ## COV_EXCL_START +# function kernel(data::AbstractArray{T, N}) where {T, N} +# offset_in = blockDim().x * (blockIdx().x - 1i32) + +# index_in = offset_in + threadIdx().x + +# @inbounds if index_in <= reduced_length +# idx = Tuple(nd_idx[index_in]) +# index_in = lin_idx[idx...] +# idx = ifelse.(rev_dims, ref .- idx, idx) +# index_out = lin_idx[idx...] + +# if index_in < index_out +# temp = data[index_out] +# data[index_out] = data[index_in] +# data[index_in] = temp +# end +# end + +# return +# end +# ## COV_EXCL_STOP + +# # NOTE: we launch slightly more than half the number of elements in the array as threads. +# # The last non-singleton dimension along which to reverse is used to define how the array is split. +# # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by +# # ignoring the threads that cross the mid-point + +# nthreads = 256 +# nblocks = cld(prod(reduced_size), nthreads) + +# @cuda threads=nthreads blocks=nblocks kernel(data) +# end # n-dimensional API -function Base.reverse!(data::AnyCuArray{T, N}; dims=:) where {T, N} - if isa(dims, Colon) - dims = 1:ndims(data) - end - if !applicable(iterate, dims) - throw(ArgumentError("dimension $dims is not an iterable")) - end - if !all(1 .≤ dims .≤ ndims(data)) - throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))")) - end - - _reverse!(data; dims=dims) - - return data -end - -# out-of-place -function Base.reverse(input::AnyCuArray{T, N}; dims=:) where {T, N} - if isa(dims, Colon) - dims = 1:ndims(input) - end - if !applicable(iterate, dims) - throw(ArgumentError("dimension $dims is not an iterable")) - end - if !all(1 .≤ dims .≤ ndims(input)) - throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))")) - end - - if all(size(input)[[dims...]].==1) - # no reverse operation needed at all in this case. - return copy(input) - else - output = similar(input) - _reverse(input, output; dims=dims) - return output - end -end +# function Base.reverse!(data::AnyCuArray{T, N}; dims=:) where {T, N} +# if isa(dims, Colon) +# dims = 1:ndims(data) +# end +# if !applicable(iterate, dims) +# throw(ArgumentError("dimension $dims is not an iterable")) +# end +# if !all(1 .≤ dims .≤ ndims(data)) +# throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))")) +# end + +# _reverse!(data; dims=dims) + +# return data +# end + +# # out-of-place +# function Base.reverse(input::AnyCuArray{T, N}; dims=:) where {T, N} +# if isa(dims, Colon) +# dims = 1:ndims(input) +# end +# if !applicable(iterate, dims) +# throw(ArgumentError("dimension $dims is not an iterable")) +# end +# if !all(1 .≤ dims .≤ ndims(input)) +# throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))")) +# end + +# if all(size(input)[[dims...]].==1) +# # no reverse operation needed at all in this case. 
+# return copy(input) +# else +# output = similar(input) +# _reverse(input, output; dims=dims) +# return output +# end +# end # 1-dimensional API -# in-place -Base.@propagate_inbounds function Base.reverse!(data::AnyCuVector{T}, start::Integer, - stop::Integer=length(data)) where {T} - _reverse!(view(data, start:stop)) - return data -end +# # in-place +# Base.@propagate_inbounds function Base.reverse!(data::AnyCuVector{T}, start::Integer, +# stop::Integer=length(data)) where {T} +# _reverse!(view(data, start:stop)) +# return data +# end -Base.reverse!(data::AnyCuVector{T}) where {T} = @inbounds reverse!(data, 1, length(data)) +# Base.reverse!(data::AnyCuVector{T}) where {T} = @inbounds reverse!(data, 1, length(data)) -# out-of-place -Base.@propagate_inbounds function Base.reverse(input::AnyCuVector{T}, start::Integer, - stop::Integer=length(input)) where {T} - output = similar(input) +# # out-of-place +# Base.@propagate_inbounds function Base.reverse(input::AnyCuVector{T}, start::Integer, +# stop::Integer=length(input)) where {T} +# output = similar(input) - start > 1 && copyto!(output, 1, input, 1, start-1) - _reverse(view(input, start:stop), view(output, start:stop)) - stop < length(input) && copyto!(output, stop+1, input, stop+1) +# start > 1 && copyto!(output, 1, input, 1, start-1) +# _reverse(view(input, start:stop), view(output, start:stop)) +# stop < length(input) && copyto!(output, stop+1, input, stop+1) - return output -end +# return output +# end -Base.reverse(data::AnyCuVector{T}) where {T} = @inbounds reverse(data, 1, length(data)) +# Base.reverse(data::AnyCuVector{T}) where {T} = @inbounds reverse(data, 1, length(data)) diff --git a/test/base/array.jl b/test/base/array.jl index 4c0ebca8da..a42963f634 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -478,50 +478,50 @@ end @test Array(x) == zeros(4) end -@testset "reverse" begin - # 1-d out-of-place - @test testf(x->reverse(x), rand(1000)) - @test testf(x->reverse(x, 10), rand(1000)) - @test testf(x->reverse(x, 10, 90), rand(1000)) - - # 1-d in-place - @test testf(x->reverse!(x), rand(1000)) - @test testf(x->reverse!(x, 10), rand(1000)) - @test testf(x->reverse!(x, 10, 90), rand(1000)) - - # n-d out-of-place - for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]), - dim in 1:length(shape) - @test testf(x->reverse(x; dims=dim), rand(shape...)) - - cpu = rand(shape...) - gpu = CuArray(cpu) - reverse!(gpu; dims=dim) - @test Array(gpu) == reverse(cpu; dims=dim) - end - - # supports multidimensional reverse - for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]), - dim in ((1,2),(2,3),(1,3),:) - @test testf(x->reverse(x; dims=dim), rand(shape...)) - - cpu = rand(shape...) 
- gpu = CuArray(cpu) - reverse!(gpu; dims=dim) - @test Array(gpu) == reverse(cpu; dims=dim) - end - - # wrapped array - @test testf(x->reverse(x), reshape(rand(2,2), 4)) - - # error throwing - cpu = rand(1,2,3,4) - gpu = CuArray(cpu) - @test_throws ArgumentError reverse!(gpu, dims=5) - @test_throws ArgumentError reverse!(gpu, dims=0) - @test_throws ArgumentError reverse(gpu, dims=5) - @test_throws ArgumentError reverse(gpu, dims=0) -end +# @testset "reverse" begin +# # 1-d out-of-place +# @test testf(x->reverse(x), rand(1000)) +# @test testf(x->reverse(x, 10), rand(1000)) +# @test testf(x->reverse(x, 10, 90), rand(1000)) + +# # 1-d in-place +# @test testf(x->reverse!(x), rand(1000)) +# @test testf(x->reverse!(x, 10), rand(1000)) +# @test testf(x->reverse!(x, 10, 90), rand(1000)) + +# # n-d out-of-place +# for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]), +# dim in 1:length(shape) +# @test testf(x->reverse(x; dims=dim), rand(shape...)) + +# cpu = rand(shape...) +# gpu = CuArray(cpu) +# reverse!(gpu; dims=dim) +# @test Array(gpu) == reverse(cpu; dims=dim) +# end + +# # supports multidimensional reverse +# for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]), +# dim in ((1,2),(2,3),(1,3),:) +# @test testf(x->reverse(x; dims=dim), rand(shape...)) + +# cpu = rand(shape...) +# gpu = CuArray(cpu) +# reverse!(gpu; dims=dim) +# @test Array(gpu) == reverse(cpu; dims=dim) +# end + +# # wrapped array +# @test testf(x->reverse(x), reshape(rand(2,2), 4)) + +# # error throwing +# cpu = rand(1,2,3,4) +# gpu = CuArray(cpu) +# @test_throws ArgumentError reverse!(gpu, dims=5) +# @test_throws ArgumentError reverse!(gpu, dims=0) +# @test_throws ArgumentError reverse(gpu, dims=5) +# @test_throws ArgumentError reverse(gpu, dims=0) +# end @testset "findall" begin # 1D diff --git a/test/runtests.jl b/test/runtests.jl index 1f651bc78d..bffaea670c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,6 @@ +using Pkg +Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="reverse") + using Distributed using Dates import REPL From e2f192a55b811b51cc2517b039c3d86d4931cfcb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:19:24 -0300 Subject: [PATCH 22/23] temp --- perf/array.jl | 230 +++++++++++++++++++++--------------------- perf/runbenchmarks.jl | 26 ++--- 2 files changed, 128 insertions(+), 128 deletions(-) diff --git a/perf/array.jl b/perf/array.jl index 30348a5120..d2df04f371 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -19,37 +19,37 @@ gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) gpu_mat_bools = CuArray(rand(rng, Bool, m, n)) gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) -group["construct"] = @benchmarkable CuArray{Int}(undef, 1) +# group["construct"] = @benchmarkable CuArray{Int}(undef, 1) -group["copy"] = @async_benchmarkable copy($gpu_mat) +# group["copy"] = @async_benchmarkable copy($gpu_mat) -gpu_mat2 = copy(gpu_mat) -let group = addgroup!(group, "copyto!") - group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat) - group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat) - group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat) -end +# gpu_mat2 = copy(gpu_mat) +# let group = addgroup!(group, "copyto!") +# group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat) +# group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat) +# group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat) +# end -let 
group = addgroup!(group, "iteration") - group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10] +# let group = addgroup!(group, "iteration") +# group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10] - group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] +# group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] - let group = addgroup!(group, "findall") - group["bool"] = @benchmarkable findall($gpu_vec_bools) - group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) - end +# let group = addgroup!(group, "findall") +# group["bool"] = @benchmarkable findall($gpu_vec_bools) +# group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) +# end - let group = addgroup!(group, "findfirst") - group["bool"] = @benchmarkable findfirst($gpu_vec_bools) - group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) - end +# let group = addgroup!(group, "findfirst") +# group["bool"] = @benchmarkable findfirst($gpu_vec_bools) +# group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) +# end - let group = addgroup!(group, "findmin") # findmax - group["1d"] = @async_benchmarkable findmin($gpu_vec) - group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1) - end -end +# let group = addgroup!(group, "findmin") # findmax +# group["1d"] = @async_benchmarkable findmin($gpu_vec) +# group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1) +# end +# end let group = addgroup!(group, "reverse") group["1d"] = @async_benchmarkable reverse($gpu_vec) @@ -62,94 +62,94 @@ let group = addgroup!(group, "reverse") group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2) end -group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 - -# no need to test inplace version, which performs the same operation (but with an alloc) -let group = addgroup!(group, "accumulate") - let group = addgroup!(group, "Float32") - group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec) - group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1) - group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2) - - group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1) - group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2) - end - let group = addgroup!(group, "Int64") - group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints) - group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1) - group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2) - - group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1) - group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2) - end -end - -let group = addgroup!(group, "reductions") - let group = addgroup!(group, "reduce") - let group = addgroup!(group, "Float32") - group["1d"] = @async_benchmarkable reduce(+, $gpu_vec) - group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1) - group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2) - group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1) - group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2) - end - let group = addgroup!(group, "Int64") - group["1d"] = @async_benchmarkable reduce(+, $gpu_vec_ints) - group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1) - group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2) - group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1) - group["dims=2L"] = 
@async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2) - end - end - - let group = addgroup!(group, "mapreduce") - let group = addgroup!(group, "Float32") - group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec) - group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1) - group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2) - group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1) - group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2) - end - let group = addgroup!(group, "Int64") - group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints) - group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1) - group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2) - group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1) - group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2) - end - end - - # used by sum, prod, minimum, maximum, all, any, count -end - -let group = addgroup!(group, "random") - let group = addgroup!(group, "rand") - group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n) - group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n) - end - - let group = addgroup!(group, "rand!") - group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec) - group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints) - end - - let group = addgroup!(group, "randn") - group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n) - end - - let group = addgroup!(group, "randn!") - group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec) - end -end - -let group = addgroup!(group, "sorting") - group["1d"] = @async_benchmarkable sort($gpu_vec) - group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) - group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) -end - -let group = addgroup!(group, "permutedims") - group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) - group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2)) - group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3)) -end +# group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 + +# # no need to test inplace version, which performs the same operation (but with an alloc) +# let group = addgroup!(group, "accumulate") +# let group = addgroup!(group, "Float32") +# group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec) +# group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1) +# group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2) + +# group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1) +# group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2) +# end +# let group = addgroup!(group, "Int64") +# group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints) +# group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1) +# group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2) + +# group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1) +# group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2) +# end +# end + +# let group = addgroup!(group, "reductions") +# let group = addgroup!(group, "reduce") +# let group = addgroup!(group, "Float32") +# group["1d"] = @async_benchmarkable reduce(+, $gpu_vec) +# group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1) +# 
group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2) +# group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1) +# group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2) +# end +# let group = addgroup!(group, "Int64") +# group["1d"] = @async_benchmarkable reduce(+, $gpu_vec_ints) +# group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1) +# group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2) +# group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1) +# group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2) +# end +# end + +# let group = addgroup!(group, "mapreduce") +# let group = addgroup!(group, "Float32") +# group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec) +# group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1) +# group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2) +# group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1) +# group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2) +# end +# let group = addgroup!(group, "Int64") +# group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints) +# group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1) +# group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2) +# group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1) +# group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2) +# end +# end + +# # used by sum, prod, minimum, maximum, all, any, count +# end + +# let group = addgroup!(group, "random") +# let group = addgroup!(group, "rand") +# group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n) +# group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n) +# end + +# let group = addgroup!(group, "rand!") +# group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec) +# group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints) +# end + +# let group = addgroup!(group, "randn") +# group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n) +# end + +# let group = addgroup!(group, "randn!") +# group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec) +# end +# end + +# let group = addgroup!(group, "sorting") +# group["1d"] = @async_benchmarkable sort($gpu_vec) +# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) +# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) +# end + +# let group = addgroup!(group, "permutedims") +# group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) +# group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2)) +# group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3)) +# end diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 21e9a96aac..f66a1a8a9b 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -18,13 +18,13 @@ end # before anything else, run latency benchmarks. these spawn subprocesses, so we don't want # to do so after regular benchmarks have caused the memory allocator to reserve memory. 
-@info "Running latency benchmarks" -latency_results = include("latency.jl") +# @info "Running latency benchmarks" +# latency_results = include("latency.jl") SUITE = BenchmarkGroup() -include("cuda.jl") -include("kernel.jl") +# include("cuda.jl") +# include("kernel.jl") include("array.jl") @info "Preparing main benchmarks" @@ -36,20 +36,20 @@ GC.gc(true) CUDA.reclaim() # benchmark groups that aren't part of the suite -addgroup!(SUITE, "integration") +# addgroup!(SUITE, "integration") @info "Running main benchmarks" results = run(SUITE, verbose=true) # integration tests (that do nasty things, so need to be run last) -@info "Running integration benchmarks" -integration_results = BenchmarkGroup() -integration_results["volumerhs"] = include("volumerhs.jl") -integration_results["byval"] = include("byval.jl") -integration_results["cudadevrt"] = include("cudadevrt.jl") - -results["latency"] = latency_results -results["integration"] = integration_results +# @info "Running integration benchmarks" +# integration_results = BenchmarkGroup() +# integration_results["volumerhs"] = include("volumerhs.jl") +# integration_results["byval"] = include("byval.jl") +# integration_results["cudadevrt"] = include("cudadevrt.jl") + +# results["latency"] = latency_results +# results["integration"] = integration_results # write out the results result_file = length(ARGS) >= 1 ? ARGS[1] : "benchmarkresults.json" From 69871e593e6e1b878ac4fde022481e2959cedbc2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:05:40 -0400 Subject: [PATCH 23/23] trhws --- Project.toml | 4 ++++ perf/runbenchmarks.jl | 2 -- test/runtests.jl | 3 --- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index 5dba52ac6f..4c7d6e5bce 100644 --- a/Project.toml +++ b/Project.toml @@ -44,6 +44,10 @@ EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +[sources] +GPUArrays = {rev = "reverse", url = "https://github.com/christiangnrd/GPUArrays.jl"} +KernelAbstractions = {rev = "intrinsics", url = "https://github.com/christiangnrd/KernelAbstractions.jl"} + [extensions] ChainRulesCoreExt = "ChainRulesCore" EnzymeCoreExt = "EnzymeCore" diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index f66a1a8a9b..677f8e8b8a 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -1,6 +1,4 @@ # benchmark suite execution and codespeed submission -using Pkg -Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="reverse") using CUDA diff --git a/test/runtests.jl b/test/runtests.jl index bffaea670c..1f651bc78d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,3 @@ -using Pkg -Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="reverse") - using Distributed using Dates import REPL