From 3652aeb306a66f3971fa98b9eecd64ab17c56aaa Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:10:41 -0300 Subject: [PATCH 01/23] KernelIntrinsics --- Project.toml | 2 +- src/CUDAKernels.jl | 60 +++++++++++++++++++++++++++++++++------------- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index c0fb240200..5dba52ac6f 100644 --- a/Project.toml +++ b/Project.toml @@ -67,7 +67,7 @@ ExprTools = "0.1" GPUArrays = "11.2.4" GPUCompiler = "1.4" GPUToolbox = "0.3, 1" -KernelAbstractions = "0.9.38" +KernelAbstractions = "0.10" LLVM = "9.3.1" LLVMLoopInfo = "1" LazyArtifacts = "1" diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index 5a36ed5eaa..b6f89aed90 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -4,6 +4,7 @@ using ..CUDA using ..CUDA: @device_override, CUSPARSE, default_memory, UnifiedMemory import KernelAbstractions as KA +import KernelAbstractions: KernelIntrinsics as KI import StaticArrays import SparseArrays: AbstractSparseArray @@ -157,37 +158,61 @@ function (obj::KA.Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=n return nothing end + +function KI.KIKernel(::CUDABackend, f, args...; kwargs...) + kern = eval(quote + @cuda launch=false $(kwargs...) $(f)($(args...)) + end) + KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern) +end + +function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + threadsPerThreadgroup = isnothing(workgroupsize) ? 1 : workgroupsize + threadgroupsPerGrid = isnothing(numworkgroups) ? 1 : numworkgroups + + obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid) +end + + +function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int + Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items)) +end +function KI.max_work_group_size(::CUDABackend)::Int + Int(attribute(device(), DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)) +end +function KI.multiprocessor_count(::CUDABackend)::Int + Int(attribute(device(), DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)) +end + ## indexing ## COV_EXCL_START -@device_override @inline function KA.__index_Local_Linear(ctx) - return threadIdx().x +@device_override @inline function KI.get_local_id() + return (; x = Int(threadIdx().x), y = Int(threadIdx().y), z = Int(threadIdx().z)) end -@device_override @inline function KA.__index_Group_Linear(ctx) - return blockIdx().x +@device_override @inline function KI.get_group_id() + return (; x = Int(blockIdx().x), y = Int(blockIdx().y), z = Int(blockIdx().z)) end -@device_override @inline function KA.__index_Global_Linear(ctx) - I = @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x) - # TODO: This is unfortunate, can we get the linear index cheaper - @inbounds LinearIndices(KA.__ndrange(ctx))[I] +@device_override @inline function KI.get_global_id() + return (; x = Int(blockDim().x), y = Int(blockDim().y), z = Int(blockDim().z)) end -@device_override @inline function KA.__index_Local_Cartesian(ctx) - @inbounds KA.workitems(KA.__iterspace(ctx))[threadIdx().x] +@device_override @inline function KI.get_local_size() + return (; x = Int((blockDim().x-1)*blockDim().x + threadIdx().x), y = Int((blockDim().y-1)*blockDim().y + threadIdx().y), z = Int((blockDim().z-1)*blockDim().z + threadIdx().z)) end -@device_override @inline function KA.__index_Group_Cartesian(ctx) - @inbounds 
KA.blocks(KA.__iterspace(ctx))[blockIdx().x] +@device_override @inline function KI.get_num_grouups() + return (; x = Int(gridDim().x), y = Int(gridDim().y), z = Int(gridDim().z)) end -@device_override @inline function KA.__index_Global_Cartesian(ctx) - return @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x) +@device_override @inline function KI.get_global_size() + return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * gridDim().y), z = Int(lockDim().z * gridDim().z)) end -@device_override @inline function KA.__validindex(ctx) +@device_override @inline function KI.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x) return I in KA.__ndrange(ctx) @@ -198,7 +223,8 @@ end ## shared and scratch memory -@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +# @device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims} CuStaticSharedArray(T, Dims) end @@ -208,7 +234,7 @@ end ## synchronization and printing -@device_override @inline function KA.__synchronize() +@device_override @inline function KI.barrier() sync_threads() end From 05bdd5ca3b2a64162829f83f760f4b31bd8e5172 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:51:35 -0300 Subject: [PATCH 02/23] Tweak --- src/CUDAKernels.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index b6f89aed90..7f836d39fd 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -166,11 +166,11 @@ function KI.KIKernel(::CUDABackend, f, args...; kwargs...) KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern) end -function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing) +function (obj::KI.KIKernel{CUDABackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) threadsPerThreadgroup = isnothing(workgroupsize) ? 1 : workgroupsize threadgroupsPerGrid = isnothing(numworkgroups) ? 1 : numworkgroups - obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid) + obj.kern(args...; threads=threadsPerThreadgroup, blocks=threadgroupsPerGrid, kwargs...) 
end From b86ce952e4c2374d1bc10fd8c0f20c59a374c11b Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:55:16 -0300 Subject: [PATCH 03/23] dogfood [only tests] [only benchmarks] --- src/CUDA.jl | 1 + src/accumulate.jl | 26 +++++++++++++------------- src/device/random.jl | 8 +++----- src/indexing.jl | 2 +- src/mapreduce.jl | 40 ++++++++++++++++++++-------------------- 5 files changed, 38 insertions(+), 39 deletions(-) diff --git a/src/CUDA.jl b/src/CUDA.jl index 8a82201a0a..bbe06a0bba 100644 --- a/src/CUDA.jl +++ b/src/CUDA.jl @@ -3,6 +3,7 @@ module CUDA using GPUCompiler using GPUArrays +import KernelAbstractions: KernelIntrinsics as KI using GPUToolbox diff --git a/src/accumulate.jl b/src/accumulate.jl index 1ec21f20ea..051ecc11ef 100644 --- a/src/accumulate.jl +++ b/src/accumulate.jl @@ -15,16 +15,16 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArray, Rdim, Rpre, Rpost, Rother, neutral, init, ::Val{inclusive}=Val(true)) where {T, inclusive} - threads = blockDim().x - thread = threadIdx().x - block = blockIdx().x + threads = KI.get_local_size().x + thread = KI.get_local_id().x + block = KI.get_group_id().x temp = CuDynamicSharedArray(T, (2*threads,)) # iterate the main dimension using threads and the first block dimension - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x # iterate the other dimensions using the remaining block dimensions - j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y + j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y if j > length(Rother) return @@ -47,7 +47,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr offset = 1 d = threads>>1 while d > 0 - sync_threads() + KI.barrier() @inbounds if thread <= d ai = offset * (2*thread-1) bi = offset * (2*thread) @@ -66,7 +66,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr d = 1 while d < threads offset >>= 1 - sync_threads() + KI.barrier() @inbounds if thread <= d ai = offset * (2*thread-1) bi = offset * (2*thread) @@ -78,7 +78,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr d *= 2 end - sync_threads() + KI.barrier() # write results to device memory @inbounds if i <= length(Rdim) @@ -100,14 +100,14 @@ end function aggregate_partial_scan(op::Function, output::AbstractArray, aggregates::AbstractArray, Rdim, Rpre, Rpost, Rother, init) - threads = blockDim().x - thread = threadIdx().x - block = blockIdx().x + threads = KI.get_local_size().x + thread = KI.get_local_id().x + block = KI.get_group_id().x # iterate the main dimension using threads and the first block dimension - i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x + i = (KI.get_group_id().x-1i32) * KI.get_local_size().x + KI.get_local_id().x # iterate the other dimensions using the remaining block dimensions - j = (blockIdx().z-1i32) * gridDim().y + blockIdx().y + j = (KI.get_group_id().z-1i32) * KI.get_num_groups().y + KI.get_group_id().y @inbounds if i <= length(Rdim) && j <= length(Rother) I = Rother[j] diff --git a/src/device/random.jl b/src/device/random.jl index d776bf886d..0cbedef6ff 100644 --- a/src/device/random.jl +++ b/src/device/random.jl @@ -63,8 +63,6 @@ end @inline Philox2x32() = Philox2x32{7}() @inline function Base.getproperty(rng::Philox2x32, field::Symbol) - threadId = threadIdx().x + (threadIdx().y - 1i32) * 
blockDim().x + - (threadIdx().z - 1i32) * blockDim().x * blockDim().y warpId = (threadId - 1i32) >> 0x5 + 1i32 # fld1 if field === :key @@ -72,9 +70,9 @@ end elseif field === :ctr1 @inbounds global_random_counters()[warpId] elseif field === :ctr2 - blockId = blockIdx().x + (blockIdx().y - 1i32) * gridDim().x + - (blockIdx().z - 1i32) * gridDim().x * gridDim().y - globalId = threadId + (blockId - 1i32) * (blockDim().x * blockDim().y * blockDim().z) + globalId = KI.get_global_id().x + + (KI.get_global_id().y - 1i32) * KI.get_global_size().x + + (KI.get_global_id().z - 1i32) * KI.get_global_size().x * KI.get_global_size().y globalId%UInt32 end::UInt32 end diff --git a/src/indexing.jl b/src/indexing.jl index b958dc02ec..f0fab5711c 100644 --- a/src/indexing.jl +++ b/src/indexing.jl @@ -33,7 +33,7 @@ function Base.findall(bools::AnyCuArray{Bool}) if n > 0 ## COV_EXCL_START function kernel(ys::CuDeviceArray, bools, indices) - i = threadIdx().x + (blockIdx().x - 1i32) * blockDim().x + i = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x @inbounds if i <= length(bools) && bools[i] i′ = CartesianIndices(bools)[i] diff --git a/src/mapreduce.jl b/src/mapreduce.jl index d796b5dae1..c7ec3cf21f 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -19,9 +19,9 @@ end @inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T # shared mem for partial sums assume(warpsize() == 32) - shared = CuStaticSharedArray(T, 32) + shared = KI.localmemory(T, 32) - wid, lane = fldmod1(threadIdx().x, warpsize()) + wid, lane = fldmod1(KI.get_local_id().x, warpsize()) # each warp performs partial reduction val = reduce_warp(op, val) @@ -32,10 +32,10 @@ end end # wait for all partial reductions - sync_threads() + KI.barrier() # read from shared memory only if that warp existed - val = if threadIdx().x <= fld1(blockDim().x, warpsize()) + val = if KI.get_local_id().x <= fld1(KI.get_local_size().x, warpsize()) @inbounds shared[lane] else neutral @@ -49,8 +49,8 @@ end return val end @inline function reduce_block(op, val::T, neutral, shuffle::Val{false}) where T - threads = blockDim().x - thread = threadIdx().x + threads = KI.get_local_size().x + thread = KI.get_local_id().x # shared mem for a complete reduction shared = CuDynamicSharedArray(T, (threads,)) @@ -59,7 +59,7 @@ end # perform a reduction d = 1 while d < threads - sync_threads() + KI.barrier() index = 2 * d * (thread-1) + 1 @inbounds if index <= threads other_val = if index + d <= threads @@ -92,10 +92,10 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs # decompose the 1D hardware indices into separate ones for reduction (across threads # and possibly blocks if it doesn't fit) and other elements (remaining blocks) - threadIdx_reduce = threadIdx().x - blockDim_reduce = blockDim().x - blockIdx_reduce, blockIdx_other = fldmod1(blockIdx().x, length(Rother)) - gridDim_reduce = gridDim().x ÷ length(Rother) + threadIdx_reduce = KI.get_local_id().x + blockDim_reduce = KI.get_local_size().x + blockIdx_reduce, blockIdx_other = fldmod1(KI.get_group_id().x, length(Rother)) + gridDim_reduce = KI.get_num_groups().x ÷ length(Rother) # block-based indexing into the values outside of the reduction dimension # (that means we can safely synchronize threads within this block) @@ -134,7 +134,7 @@ function partial_mapreduce_grid(f, op, neutral, Rreduce, Rother, shuffle, R::Abs end function serial_mapreduce_kernel(f, op, neutral, Rreduce, Rother, R, As) - grid_idx = threadIdx().x + (blockIdx().x - 1i32) * 
blockDim().x + grid_idx = KI.get_local_id().x + (KI.get_group_id().x - 1i32) * KI.get_local_size().x @inbounds if grid_idx <= length(Rother) Iother = Rother[grid_idx] @@ -160,14 +160,14 @@ end # factored out for use in tests function serial_mapreduce_threshold(dev) - max_concurrency = attribute(dev, DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) * - attribute(dev, DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT) + max_concurrency = KI.max_work_group_size(CUDABackend()) * KI.multiprocessor_count(CUDABackend()) return max_concurrency end function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, A::Union{AbstractArray,Broadcast.Broadcasted}; init=nothing) where {F, OP, T} + backend = CUDABackend() if !isa(A, Broadcast.Broadcasted) # XXX: Base.axes isn't defined anymore for Broadcasted, breaking this check Base.check_reducedims(R, A) @@ -201,11 +201,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # If `Rother` is large enough, then a naive loop is more efficient than partial reductions. if length(Rother) >= serial_mapreduce_threshold(dev) args = (f, op, init, Rreduce, Rother, R, A) - kernel = @cuda launch=false serial_mapreduce_kernel(args...) + kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...) kernel_config = launch_configuration(kernel.fun) threads = kernel_config.threads blocks = cld(length(Rother), threads) - kernel(args...; threads, blocks) + kernel(args...; workgroupsize=threads, numworkgroups=blocks) return R end @@ -228,9 +228,9 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # we might not be able to launch all those threads to reduce each slice in one go. # that's why each threads also loops across their inputs, processing multiple values # so that we can span the entire reduction dimension using a single thread block. - kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A) + kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A) compute_shmem(threads) = shuffle ? 
0 : threads*sizeof(T) - kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads) + kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads) reduce_threads = compute_threads(kernel_config.threads) reduce_shmem = compute_shmem(reduce_threads) @@ -255,7 +255,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # perform the actual reduction if reduce_blocks == 1 # we can cover the dimensions to reduce using a single block - kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem) + kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; ; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem) else # TODO: provide a version that atomically reduces from different blocks @@ -286,7 +286,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, end partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A; - threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem) + ; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) GPUArrays.mapreducedim!(identity, op, R, partial; init) end From 70944da4801b4742db4c5285a99c9f7c68a1c8d7 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:48:00 -0300 Subject: [PATCH 04/23] CI --- .buildkite/pipeline.yml | 813 ++++++++++++++++---------------- test/base/kernelabstractions.jl | 8 +- 2 files changed, 419 insertions(+), 402 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a21abe8026..ae6aea7a2d 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -7,9 +7,6 @@ steps: plugins: - JuliaCI/julia#v1: version: "{{matrix.julia}}" - - JuliaCI/julia-test#v1: - test_args: "--quickfail" - allow_reresolve: false - JuliaCI/julia-coverage#v1: dirs: - src @@ -20,6 +17,18 @@ steps: cuda: "*" commands: | echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml + + julia -e 'println("--- :julia: Developing KernelAbstractions") + using Pkg + Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' + + julia -e 'println("--- :julia: Instantiating project") + using Pkg + Pkg.develop(; path=pwd())' || exit 3 + + julia -e 'println("+++ :julia: Running tests") + using Pkg + Pkg.test("CUDA"; coverage=true)' if: | build.message =~ /\[only tests\]/ || build.message =~ /\[only julia\]/ || @@ -39,428 +48,434 @@ steps: julia: "nightly" soft_fail: true - # then, test supported CUDA toolkits (installed through the artifact system) - - group: "CUDA" - key: "cuda" - depends_on: "julia" - steps: - - label: "CUDA {{matrix.cuda}}" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--quickfail core base libraries" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only cuda\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip cuda\]/ - timeout_in_minutes: 60 - matrix: - setup: - cuda: - - "13.0" - - "12.9" - - "12.8" - - "12.6" - - "12.5" - - "12.4" - - "12.3" - - "12.2" - - "12.1" - - "12.0" - adjustments: - - with: - cuda: "13.0" - soft_fail: true - commands: | - echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml + # # then, test supported CUDA toolkits (installed through the 
artifact system) + # - group: "CUDA" + # key: "cuda" + # depends_on: "julia" + # steps: + # - label: "CUDA {{matrix.cuda}}" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail core base libraries" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only cuda\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip cuda\]/ + # timeout_in_minutes: 60 + # matrix: + # setup: + # cuda: + # - "13.0" + # - "12.9" + # - "12.8" + # - "12.6" + # - "12.5" + # - "12.4" + # - "12.3" + # - "12.2" + # - "12.1" + # - "12.0" + # adjustments: + # - with: + # cuda: "13.0" + # soft_fail: true + # commands: | + # echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml - - group: ":nesting_dolls: Subpackages" - depends_on: "cuda" - steps: - - label: "{{matrix.package}} on CUDA {{matrix.cuda}}" - matrix: - setup: - cuda: - - "12.0" - - "13.0" - package: - - "cuDNN" - - "cuTENSOR" - - "cuStateVec" - - "cuTensorNet" - adjustments: - - with: - package: "cuStateVec" - cuda: "12.0" - soft_fail: true - - with: - package: "cuTensorNet" - cuda: "12.0" - soft_fail: true - - with: - package: "cuStateVec" - cuda: "13.0" - soft_fail: true - - with: - package: "cuTensorNet" - cuda: "13.0" - soft_fail: true - plugins: - - JuliaCI/julia#v1: - version: "1.10" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only subpackages\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip subpackages\]/ - timeout_in_minutes: 30 - commands: | - julia -e ' - using Pkg + # - group: ":nesting_dolls: Subpackages" + # depends_on: "cuda" + # steps: + # - label: "{{matrix.package}} on CUDA {{matrix.cuda}}" + # matrix: + # setup: + # cuda: + # - "12.0" + # - "13.0" + # package: + # - "cuDNN" + # - "cuTENSOR" + # - "cuStateVec" + # - "cuTensorNet" + # adjustments: + # - with: + # package: "cuStateVec" + # cuda: "12.0" + # soft_fail: true + # - with: + # package: "cuTensorNet" + # cuda: "12.0" + # soft_fail: true + # - with: + # package: "cuStateVec" + # cuda: "13.0" + # soft_fail: true + # - with: + # package: "cuTensorNet" + # cuda: "13.0" + # soft_fail: true + # plugins: + # - JuliaCI/julia#v1: + # version: "1.10" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only subpackages\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip subpackages\]/ + # timeout_in_minutes: 30 + # commands: | + # julia -e ' + # using Pkg - println("--- :julia: Instantiating project") - withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do - Pkg.activate(joinpath("lib", lowercase("{{matrix.package}}"))) - try - Pkg.instantiate() - catch - # if we fail to instantiate, assume that we need newer dependencies - deps = [PackageSpec(path=".")] - if "{{matrix.package}}" == "cuTensorNet" - push!(deps, PackageSpec(path="lib/cutensor")) - end - Pkg.develop(deps) - end + # println("--- :julia: Instantiating project") + # 
withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do + # Pkg.activate(joinpath("lib", lowercase("{{matrix.package}}"))) + # try + # Pkg.instantiate() + # catch + # # if we fail to instantiate, assume that we need newer dependencies + # deps = [PackageSpec(path=".")] + # if "{{matrix.package}}" == "cuTensorNet" + # push!(deps, PackageSpec(path="lib/cutensor")) + # end + # Pkg.develop(deps) + # end - Pkg.add("CUDA_Runtime_jll") - write(joinpath("lib", lowercase("{{matrix.package}}"), "LocalPreferences.toml"), - "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"") - end + # Pkg.add("CUDA_Runtime_jll") + # write(joinpath("lib", lowercase("{{matrix.package}}"), "LocalPreferences.toml"), + # "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"") + # end - println("+++ :julia: Running tests") - Pkg.test(; coverage=true)' + # println("+++ :julia: Running tests") + # Pkg.test(; coverage=true)' - - group: ":telescope: Downstream" - depends_on: "cuda" - steps: - #- label: "NNlib.jl" - # plugins: - # - JuliaCI/julia#v1: - # version: "1.11" - # - JuliaCI/julia-coverage#v1: - # dirs: - # - src - # - lib - # - examples - # command: | - # julia --project -e ' - # using Pkg - # - # cuda = pwd() - # cudnn = joinpath(cuda, "lib", "cudnn") - # devdir = mktempdir() - # nnlib = joinpath(devdir, "NNlib") - # - # println("--- :julia: Installing TestEnv") - # Pkg.activate(; temp=true) - # Pkg.add("TestEnv") - # using TestEnv - # - # println("--- :julia: Installing NNlib") - # withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0, - # "JULIA_PKG_DEVDIR" => devdir) do - # Pkg.develop("NNlib") - # Pkg.activate(nnlib) - # - # try - # Pkg.develop([PackageSpec(path=cuda), PackageSpec(path=cudnn)]) - # TestEnv.activate() - # catch err - # @error "Could not install NNlib" exception=(err,catch_backtrace()) - # exit(3) - # finally - # Pkg.activate(nnlib) - # end - # end - # - # println("+++ :julia: Running tests") - # Pkg.test(; coverage=true)' - # env: - # NNLIB_TEST_CUDA: "true" - # NNLIB_TEST_CPU: "false" - # agents: - # queue: "juliagpu" - # cuda: "*" - # if: | - # build.message =~ /\[only tests\]/ || - # build.message =~ /\[only downstream\]/ || - # build.message !~ /\[only/ && !build.pull_request.draft && - # build.message !~ /\[skip tests\]/ && - # build.message !~ /\[skip downstream\]/ - # timeout_in_minutes: 30 - # soft_fail: - # - exit_status: 3 - - label: "Enzyme.jl" - plugins: - - JuliaCI/julia#v1: - version: "1.10" # XXX: Enzyme.jl is broken on 1.11 - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - command: | - julia -e ' - using Pkg + # - group: ":telescope: Downstream" + # depends_on: "cuda" + # steps: + # #- label: "NNlib.jl" + # # plugins: + # # - JuliaCI/julia#v1: + # # version: "1.11" + # # - JuliaCI/julia-coverage#v1: + # # dirs: + # # - src + # # - lib + # # - examples + # # command: | + # # julia --project -e ' + # # using Pkg + # # + # # cuda = pwd() + # # cudnn = joinpath(cuda, "lib", "cudnn") + # # devdir = mktempdir() + # # nnlib = joinpath(devdir, "NNlib") + # # + # # println("--- :julia: Installing TestEnv") + # # Pkg.activate(; temp=true) + # # Pkg.add("TestEnv") + # # using TestEnv + # # + # # println("--- :julia: Installing NNlib") + # # withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0, + # # "JULIA_PKG_DEVDIR" => devdir) do + # # Pkg.develop("NNlib") + # # Pkg.activate(nnlib) + # # + # # try + # # Pkg.develop([PackageSpec(path=cuda), PackageSpec(path=cudnn)]) + # # TestEnv.activate() + # # catch err + # # @error "Could not install NNlib" exception=(err,catch_backtrace()) + # # exit(3) + # # finally + # 
# Pkg.activate(nnlib) + # # end + # # end + # # + # # println("+++ :julia: Running tests") + # # Pkg.test(; coverage=true)' + # # env: + # # NNLIB_TEST_CUDA: "true" + # # NNLIB_TEST_CPU: "false" + # # agents: + # # queue: "juliagpu" + # # cuda: "*" + # # if: | + # # build.message =~ /\[only tests\]/ || + # # build.message =~ /\[only downstream\]/ || + # # build.message !~ /\[only/ && !build.pull_request.draft && + # # build.message !~ /\[skip tests\]/ && + # # build.message !~ /\[skip downstream\]/ + # # timeout_in_minutes: 30 + # # soft_fail: + # # - exit_status: 3 + # - label: "Enzyme.jl" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.10" # XXX: Enzyme.jl is broken on 1.11 + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # command: | + # julia -e ' + # using Pkg - println("--- :julia: Instantiating project") - withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do - # add Enzyme to the test deps - Pkg.activate("test") - Pkg.add(["Enzyme", "EnzymeCore"]) + # println("--- :julia: Instantiating project") + # withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do + # # add Enzyme to the test deps + # Pkg.activate("test") + # Pkg.add(["Enzyme", "EnzymeCore"]) - # to check compatibility, also add Enzyme to the main environment - # (or Pkg.test, which merges both environments, could fail) - Pkg.activate(".") - # Try to co-develop Enzyme and KA, if that fails, try just to dev Enzyme - try - Pkg.develop([PackageSpec("Enzyme"), PackageSpec("KernelAbstractions")]) - catch err - try - Pkg.develop([PackageSpec("Enzyme")]) - catch err - @error "Could not install Enzyme" exception=(err,catch_backtrace()) - exit(3) - end - end - end + # # to check compatibility, also add Enzyme to the main environment + # # (or Pkg.test, which merges both environments, could fail) + # Pkg.activate(".") + # # Try to co-develop Enzyme and KA, if that fails, try just to dev Enzyme + # try + # Pkg.develop([PackageSpec("Enzyme"), PackageSpec("KernelAbstractions")]) + # catch err + # try + # Pkg.develop([PackageSpec("Enzyme")]) + # catch err + # @error "Could not install Enzyme" exception=(err,catch_backtrace()) + # exit(3) + # end + # end + # end - println("+++ :julia: Running tests") - Pkg.test(; coverage=true, test_args=`extensions/enzyme`)' - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only downstream\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip downstream\]/ - timeout_in_minutes: 60 - soft_fail: true + # println("+++ :julia: Running tests") + # Pkg.test(; coverage=true, test_args=`extensions/enzyme`)' + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only downstream\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip downstream\]/ + # timeout_in_minutes: 60 + # soft_fail: true - - group: ":eyes: Special" - depends_on: "cuda" - steps: - - label: "GPU-less environment" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - - JuliaCI/julia-test#v1: - run_tests: false - command: | - julia --project -e ' - using CUDA - @assert !CUDA.functional() - @assert !isdefined(CUDA, :libcudart) - CUDA.set_runtime_version!(v"11.6")' - julia --project -e ' - using CUDA - @assert !CUDA.functional() - @assert isdefined(CUDA, :libcudart)' - agents: - 
queue: "juliagpu" - intel: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 5 + # - group: ":eyes: Special" + # depends_on: "cuda" + # steps: + # - label: "GPU-less environment" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # - JuliaCI/julia-test#v1: + # run_tests: false + # command: | + # julia --project -e ' + # using CUDA + # @assert !CUDA.functional() + # @assert !isdefined(CUDA, :libcudart) + # CUDA.set_runtime_version!(v"11.6")' + # julia --project -e ' + # using CUDA + # @assert !CUDA.functional() + # @assert isdefined(CUDA, :libcudart)' + # agents: + # queue: "juliagpu" + # intel: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 5 - - label: "Compute sanitizer" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--sanitize core base" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 60 + # - label: "Compute sanitizer" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--sanitize core base" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 60 - - label: "Legacy memory allocator" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--quickfail core base" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - env: - JULIA_CUDA_MEMORY_POOL: 'none' - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 30 + # - label: "Legacy memory allocator" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail core base" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # env: + # JULIA_CUDA_MEMORY_POOL: 'none' + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 30 - - label: "CuArray with {{matrix.memory}} memory" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--quickfail core base libraries" - - 
JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 45 - matrix: - setup: - memory: - - "unified" - - "host" - commands: | - echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml + # - label: "CuArray with {{matrix.memory}} memory" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail core base libraries" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 45 + # matrix: + # setup: + # memory: + # - "unified" + # - "host" + # commands: | + # echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml - - label: "MultiGPU" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - - JuliaCI/julia-test#v1: - test_args: "--gpu=0,1 --quickfail core base libraries" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - multigpu: "*" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 45 + # - label: "MultiGPU" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # - JuliaCI/julia-test#v1: + # test_args: "--gpu=0,1 --quickfail core base libraries" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # multigpu: "*" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 45 - - label: ":older_man: Old dependencies" - plugins: - - JuliaCI/julia#v1: - version: "1.10" # use the oldest supported Julia version (and update below) - - JuliaCI/julia-test#v1: - test_args: "--quickfail core base" - allow_reresolve: false - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - commands: | - git clone https://github.com/StefanKarpinski/Resolver.jl /tmp/Resolver - julia -e 'using Pkg; Pkg.activate("/tmp/Resolver/bin"); Pkg.instantiate()' - julia /tmp/Resolver/bin/resolve.jl . 
--min=@alldeps --julia="1.10" - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only special\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip special\]/ - timeout_in_minutes: 30 + # - label: ":older_man: Old dependencies" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.10" # use the oldest supported Julia version (and update below) + # - JuliaCI/julia-test#v1: + # test_args: "--quickfail core base" + # allow_reresolve: false + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # commands: | + # git clone https://github.com/StefanKarpinski/Resolver.jl /tmp/Resolver + # julia -e 'using Pkg; Pkg.activate("/tmp/Resolver/bin"); Pkg.instantiate()' + # julia /tmp/Resolver/bin/resolve.jl . --min=@alldeps --julia="1.10" + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only special\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip special\]/ + # timeout_in_minutes: 30 - wait: ~ continue_on_failure: true - - label: ":books: Documentation" + # - label: ":books: Documentation" + # plugins: + # - JuliaCI/julia#v1: + # version: "1.11" + # command: | + # julia --project -e ' + # println("--- :julia: Instantiating project") + # using Pkg + # Pkg.instantiate() + # Pkg.activate("docs") + # Pkg.instantiate() + # push!(LOAD_PATH, @__DIR__) + + # println("+++ :julia: Building documentation") + # include("docs/make.jl")' + # agents: + # queue: "juliagpu" + # cuda: "*" + # if: | + # build.message =~ /\[only docs\]/ || + # build.message !~ /\[only/ && !build.pull_request.draft && + # build.message !~ /\[skip docs\]/ + # timeout_in_minutes: 15 + + - label: ":racehorse: Benchmarks" plugins: - JuliaCI/julia#v1: version: "1.11" command: | julia --project -e ' - println("--- :julia: Instantiating project") using Pkg - Pkg.instantiate() - Pkg.activate("docs") - Pkg.instantiate() - push!(LOAD_PATH, @__DIR__) - println("+++ :julia: Building documentation") - include("docs/make.jl")' - agents: - queue: "juliagpu" - cuda: "*" - if: | - build.message =~ /\[only docs\]/ || - build.message !~ /\[only/ && !build.pull_request.draft && - build.message !~ /\[skip docs\]/ - timeout_in_minutes: 15 + println("--- :julia: Developing KernelAbstractions") + Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' - - label: ":racehorse: Benchmarks" - plugins: - - JuliaCI/julia#v1: - version: "1.11" - command: | julia --project=perf -e ' using Pkg diff --git a/test/base/kernelabstractions.jl b/test/base/kernelabstractions.jl index 2cb607ee3e..2f2c4300b5 100644 --- a/test/base/kernelabstractions.jl +++ b/test/base/kernelabstractions.jl @@ -4,7 +4,9 @@ using SparseArrays include(joinpath(dirname(pathof(KernelAbstractions)), "..", "test", "testsuite.jl")) -Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray) +Testsuite.testsuite(()->CUDABackend(false, false), "CUDA", CUDA, CuArray, CuDeviceArray; skip_tests=Set([ + "CPU synchronization", + "fallback test: callable types",])) for (PreferBlocks, AlwaysInline) in Iterators.product((true, false), (true, false)) Testsuite.unittest_testsuite(()->CUDABackend(PreferBlocks, AlwaysInline), "CUDA", CUDA, CuDeviceArray) end @@ -16,7 +18,7 @@ end @testset "CUDA Backend Adapt Tests" begin # CPU → GPU A = sprand(Float32, 10, 10, 0.5) #CSC - A_d = 
adapt(CUDABackend(), A) + A_d = adapt(CUDABackend(), A) @test A_d isa CUSPARSE.CuSparseMatrixCSC @test adapt(CUDABackend(), A_d) |> typeof == typeof(A_d) @@ -24,5 +26,5 @@ end B_d = A |> cu # CuCSC B = adapt(KA.CPU(), A_d) @test B isa SparseMatrixCSC - @test adapt(KA.CPU(), B) |> typeof == typeof(B) + @test adapt(KA.CPU(), B) |> typeof == typeof(B) end From 0d49f177272c546b074ae94b5ea32afcafcef1f9 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:27:10 -0300 Subject: [PATCH 05/23] Apply suggestion from @christiangnrd --- src/mapreduce.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index c7ec3cf21f..b32a87aaff 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -286,7 +286,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, end partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A; - ; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) + workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) GPUArrays.mapreducedim!(identity, op, R, partial; init) end From 762e591e7f17d7d9d66039e70849e546cfab2552 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:27:49 -0300 Subject: [PATCH 06/23] Apply suggestion from @christiangnrd --- src/mapreduce.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index b32a87aaff..8f75fcce59 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -255,7 +255,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # perform the actual reduction if reduce_blocks == 1 # we can cover the dimensions to reduce using a single block - kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; ; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem) + kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem) else # TODO: provide a version that atomically reduces from different blocks From e133fbdd8f65e09e58c23ad7654bc0c9c98a0727 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 12:36:35 -0300 Subject: [PATCH 07/23] Apply suggestion from @christiangnrd --- src/CUDAKernels.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index 7f836d39fd..abc7dbe906 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -204,7 +204,7 @@ end return (; x = Int((blockDim().x-1)*blockDim().x + threadIdx().x), y = Int((blockDim().y-1)*blockDim().y + threadIdx().y), z = Int((blockDim().z-1)*blockDim().z + threadIdx().z)) end -@device_override @inline function KI.get_num_grouups() +@device_override @inline function KI.get_num_groups() return (; x = Int(gridDim().x), y = Int(gridDim().y), z = Int(gridDim().z)) end From f3413babd7b70f9ef2e72c7d8aa1df51b880afa2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:22:49 -0300 Subject: [PATCH 08/23] Fix --- src/CUDAKernels.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index abc7dbe906..c99ca7e723 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -212,7 +212,7 @@ end return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * 
gridDim().y), z = Int(lockDim().z * gridDim().z)) end -@device_override @inline function KI.__validindex(ctx) +@device_override @inline function KA.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), blockIdx().x, threadIdx().x) return I in KA.__ndrange(ctx) From 0474a67eaa0f79184a5d64bc584d936955dbea5a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:22:54 -0300 Subject: [PATCH 09/23] shtgbr --- .buildkite/pipeline.yml | 88 ++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ae6aea7a2d..f7cc0f5ef0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,52 +1,52 @@ steps: # first, test supported Julia versions (using local CUDA as installed on the system) - - group: ":julia: Julia" - key: "julia" - steps: - - label: "Julia {{matrix.julia}}" - plugins: - - JuliaCI/julia#v1: - version: "{{matrix.julia}}" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - commands: | - echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml + # - group: ":julia: Julia" + # key: "julia" + # steps: + # - label: "Julia {{matrix.julia}}" + # plugins: + # - JuliaCI/julia#v1: + # version: "{{matrix.julia}}" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # commands: | + # echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml - julia -e 'println("--- :julia: Developing KernelAbstractions") - using Pkg - Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' + # julia -e 'println("--- :julia: Developing KernelAbstractions") + # using Pkg + # Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' - julia -e 'println("--- :julia: Instantiating project") - using Pkg - Pkg.develop(; path=pwd())' || exit 3 + # julia -e 'println("--- :julia: Instantiating project") + # using Pkg + # Pkg.develop(; path=pwd())' || exit 3 - julia -e 'println("+++ :julia: Running tests") - using Pkg - Pkg.test("CUDA"; coverage=true)' - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only julia\]/ || - build.message !~ /\[only/ && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip julia\]/ - timeout_in_minutes: 90 - matrix: - setup: - julia: - - "1.10" - - "1.11" - - "1.12" - - "nightly" - adjustments: - - with: - julia: "nightly" - soft_fail: true + # julia -e 'println("+++ :julia: Running tests") + # using Pkg + # Pkg.test("CUDA"; coverage=true)' + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only julia\]/ || + # build.message !~ /\[only/ && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip julia\]/ + # timeout_in_minutes: 90 + # matrix: + # setup: + # julia: + # - "1.10" + # - "1.11" + # - "1.12" + # - "nightly" + # adjustments: + # - with: + # julia: "nightly" + # soft_fail: true # # then, test supported CUDA toolkits (installed through the artifact system) # - group: "CUDA" From 0c1ea747e13e6c357299e16627cd9ad662e42e29 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:32:51 -0300 Subject: [PATCH 10/23] Fix --- src/device/random.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/device/random.jl 
b/src/device/random.jl index 0cbedef6ff..7d72d90a1a 100644 --- a/src/device/random.jl +++ b/src/device/random.jl @@ -63,6 +63,8 @@ end @inline Philox2x32() = Philox2x32{7}() @inline function Base.getproperty(rng::Philox2x32, field::Symbol) + threadId = threadIdx().x + (threadIdx().y - 1i32) * blockDim().x + + (threadIdx().z - 1i32) * blockDim().x * blockDim().y warpId = (threadId - 1i32) >> 0x5 + 1i32 # fld1 if field === :key From 14f1c275a4e07122993ecdaf7ac54dbc3c50b0cc Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 13:40:49 -0300 Subject: [PATCH 11/23] another fix --- src/CUDAKernels.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index c99ca7e723..b9e6cc94f8 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -209,7 +209,7 @@ end end @device_override @inline function KI.get_global_size() - return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * gridDim().y), z = Int(lockDim().z * gridDim().z)) + return (; x = Int(blockDim().x * gridDim().x), y = Int(blockDim().y * gridDim().y), z = Int(blockDim().z * gridDim().z)) end @device_override @inline function KA.__validindex(ctx) From 0a3e029fdeebc54020e6cb429b5b4db1d0c7791a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:01:03 -0300 Subject: [PATCH 12/23] fix --- src/CUDAKernels.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index b9e6cc94f8..fb5079373b 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -197,11 +197,11 @@ end end @device_override @inline function KI.get_global_id() - return (; x = Int(blockDim().x), y = Int(blockDim().y), z = Int(blockDim().z)) + return (; x = Int((blockIdx().x-1)*blockDim().x + threadIdx().x), y = Int((blockIdx().y-1)*blockDim().y + threadIdx().y), z = Int((blockIdx().z-1)*blockDim().z + threadIdx().z)) end @device_override @inline function KI.get_local_size() - return (; x = Int((blockDim().x-1)*blockDim().x + threadIdx().x), y = Int((blockDim().y-1)*blockDim().y + threadIdx().y), z = Int((blockDim().z-1)*blockDim().z + threadIdx().z)) + return (; x = Int(blockDim().x), y = Int(blockDim().y), z = Int(blockDim().z)) end @device_override @inline function KI.get_num_groups() From d652d0be6fb8bb612533e9bcd0ff055802b312d5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:18:12 -0300 Subject: [PATCH 13/23] dfgbsg --- src/CUDAKernels.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index fb5079373b..bf4c269023 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -178,10 +178,10 @@ function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDA Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items)) end function KI.max_work_group_size(::CUDABackend)::Int - Int(attribute(device(), DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)) + Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)) end function KI.multiprocessor_count(::CUDABackend)::Int - Int(attribute(device(), DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)) + Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)) end ## indexing From 53dfbf0f0b2bc82fc78bb8ada5fec208f51b610f Mon Sep 17 00:00:00 2001 From: Christian Guinard 
<28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 14:36:23 -0300 Subject: [PATCH 14/23] dgbsg --- src/mapreduce.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 8f75fcce59..f0f096b646 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -265,8 +265,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # NOTE: we can't use the previously-compiled kernel, or its launch configuration, # since the type of `partial` might not match the original output container # (e.g. if that was a view). - partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A) - partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads) + partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A) + partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads) partial_reduce_threads = compute_threads(partial_kernel_config.threads) partial_reduce_shmem = compute_shmem(partial_reduce_threads) partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks From 08baa0f28661adf8577d85105efa9aed325ea82c Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:34:36 -0300 Subject: [PATCH 15/23] Finally? --- src/mapreduce.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index f0f096b646..3af00be038 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -202,7 +202,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, if length(Rother) >= serial_mapreduce_threshold(dev) args = (f, op, init, Rreduce, Rother, R, A) kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...) - kernel_config = launch_configuration(kernel.fun) + kernel_config = launch_configuration(kernel.kern.fun) threads = kernel_config.threads blocks = cld(length(Rother), threads) kernel(args...; workgroupsize=threads, numworkgroups=blocks) @@ -255,7 +255,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # perform the actual reduction if reduce_blocks == 1 # we can cover the dimensions to reduce using a single block - kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem) + kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem) else # TODO: provide a version that atomically reduces from different blocks From fed145c9ed694ee105b03a0c8132255d775be321 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 15:59:15 -0300 Subject: [PATCH 16/23] Revert "shtgbr" This reverts commit 3fba9403abf18d842105f76124e511fad0fe4231. 
--- .buildkite/pipeline.yml | 88 ++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index f7cc0f5ef0..ae6aea7a2d 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,52 +1,52 @@ steps: # first, test supported Julia versions (using local CUDA as installed on the system) - # - group: ":julia: Julia" - # key: "julia" - # steps: - # - label: "Julia {{matrix.julia}}" - # plugins: - # - JuliaCI/julia#v1: - # version: "{{matrix.julia}}" - # - JuliaCI/julia-coverage#v1: - # dirs: - # - src - # - lib - # - examples - # agents: - # queue: "juliagpu" - # cuda: "*" - # commands: | - # echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml + - group: ":julia: Julia" + key: "julia" + steps: + - label: "Julia {{matrix.julia}}" + plugins: + - JuliaCI/julia#v1: + version: "{{matrix.julia}}" + - JuliaCI/julia-coverage#v1: + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + commands: | + echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml - # julia -e 'println("--- :julia: Developing KernelAbstractions") - # using Pkg - # Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' + julia -e 'println("--- :julia: Developing KernelAbstractions") + using Pkg + Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' - # julia -e 'println("--- :julia: Instantiating project") - # using Pkg - # Pkg.develop(; path=pwd())' || exit 3 + julia -e 'println("--- :julia: Instantiating project") + using Pkg + Pkg.develop(; path=pwd())' || exit 3 - # julia -e 'println("+++ :julia: Running tests") - # using Pkg - # Pkg.test("CUDA"; coverage=true)' - # if: | - # build.message =~ /\[only tests\]/ || - # build.message =~ /\[only julia\]/ || - # build.message !~ /\[only/ && - # build.message !~ /\[skip tests\]/ && - # build.message !~ /\[skip julia\]/ - # timeout_in_minutes: 90 - # matrix: - # setup: - # julia: - # - "1.10" - # - "1.11" - # - "1.12" - # - "nightly" - # adjustments: - # - with: - # julia: "nightly" - # soft_fail: true + julia -e 'println("+++ :julia: Running tests") + using Pkg + Pkg.test("CUDA"; coverage=true)' + if: | + build.message =~ /\[only tests\]/ || + build.message =~ /\[only julia\]/ || + build.message !~ /\[only/ && + build.message !~ /\[skip tests\]/ && + build.message !~ /\[skip julia\]/ + timeout_in_minutes: 90 + matrix: + setup: + julia: + - "1.10" + - "1.11" + - "1.12" + - "nightly" + adjustments: + - with: + julia: "nightly" + soft_fail: true # # then, test supported CUDA toolkits (installed through the artifact system) # - group: "CUDA" From 46cbc05c0c3736b5f9c5d1013b33a3d6dc279256 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:21:41 -0300 Subject: [PATCH 17/23] Reapply "shtgbr" This reverts commit fc12cd16974cff27fdc4cd53aa335ed64052a2b5. 
--- .buildkite/pipeline.yml | 88 ++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index ae6aea7a2d..f7cc0f5ef0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1,52 +1,52 @@ steps: # first, test supported Julia versions (using local CUDA as installed on the system) - - group: ":julia: Julia" - key: "julia" - steps: - - label: "Julia {{matrix.julia}}" - plugins: - - JuliaCI/julia#v1: - version: "{{matrix.julia}}" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - commands: | - echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml + # - group: ":julia: Julia" + # key: "julia" + # steps: + # - label: "Julia {{matrix.julia}}" + # plugins: + # - JuliaCI/julia#v1: + # version: "{{matrix.julia}}" + # - JuliaCI/julia-coverage#v1: + # dirs: + # - src + # - lib + # - examples + # agents: + # queue: "juliagpu" + # cuda: "*" + # commands: | + # echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml - julia -e 'println("--- :julia: Developing KernelAbstractions") - using Pkg - Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' + # julia -e 'println("--- :julia: Developing KernelAbstractions") + # using Pkg + # Pkg.add(url="https://github.com/christiangnrd/KernelAbstractions.jl", rev="intrinsics")' - julia -e 'println("--- :julia: Instantiating project") - using Pkg - Pkg.develop(; path=pwd())' || exit 3 + # julia -e 'println("--- :julia: Instantiating project") + # using Pkg + # Pkg.develop(; path=pwd())' || exit 3 - julia -e 'println("+++ :julia: Running tests") - using Pkg - Pkg.test("CUDA"; coverage=true)' - if: | - build.message =~ /\[only tests\]/ || - build.message =~ /\[only julia\]/ || - build.message !~ /\[only/ && - build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip julia\]/ - timeout_in_minutes: 90 - matrix: - setup: - julia: - - "1.10" - - "1.11" - - "1.12" - - "nightly" - adjustments: - - with: - julia: "nightly" - soft_fail: true + # julia -e 'println("+++ :julia: Running tests") + # using Pkg + # Pkg.test("CUDA"; coverage=true)' + # if: | + # build.message =~ /\[only tests\]/ || + # build.message =~ /\[only julia\]/ || + # build.message !~ /\[only/ && + # build.message !~ /\[skip tests\]/ && + # build.message !~ /\[skip julia\]/ + # timeout_in_minutes: 90 + # matrix: + # setup: + # julia: + # - "1.10" + # - "1.11" + # - "1.12" + # - "nightly" + # adjustments: + # - with: + # julia: "nightly" + # soft_fail: true # # then, test supported CUDA toolkits (installed through the artifact system) # - group: "CUDA" From d341f2d399a2f6a2a92d08a5870fa167d74edd15 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:26:59 -0300 Subject: [PATCH 18/23] Revert KIKernel --- src/mapreduce.jl | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/mapreduce.jl b/src/mapreduce.jl index 3af00be038..97a4176b41 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -201,11 +201,14 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # If `Rother` is large enough, then a naive loop is more efficient than partial reductions. if length(Rother) >= serial_mapreduce_threshold(dev) args = (f, op, init, Rreduce, Rother, R, A) - kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...) 
- kernel_config = launch_configuration(kernel.kern.fun) + # kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...) + kernel = @cuda launch=false serial_mapreduce_kernel(args...) + # kernel_config = launch_configuration(kernel.kern.fun) + kernel_config = launch_configuration(kernel.fun) threads = kernel_config.threads blocks = cld(length(Rother), threads) - kernel(args...; workgroupsize=threads, numworkgroups=blocks) + # kernel(args...; workgroupsize=threads, numworkgroups=blocks) + kernel(args...; threads, blocks) return R end @@ -228,9 +231,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # we might not be able to launch all those threads to reduce each slice in one go. # that's why each threads also loops across their inputs, processing multiple values # so that we can span the entire reduction dimension using a single thread block. - kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A) + # kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A) + kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A) compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T) - kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads) + # kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads) + kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads) reduce_threads = compute_threads(kernel_config.threads) reduce_shmem = compute_shmem(reduce_threads) @@ -255,7 +260,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # perform the actual reduction if reduce_blocks == 1 # we can cover the dimensions to reduce using a single block - kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem) + # kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem) + kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem) else # TODO: provide a version that atomically reduces from different blocks @@ -265,8 +271,10 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, # NOTE: we can't use the previously-compiled kernel, or its launch configuration, # since the type of `partial` might not match the original output container # (e.g. if that was a view). 
- partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A) - partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads) + # partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A) + partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A) + # partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads) + partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads) partial_reduce_threads = compute_threads(partial_kernel_config.threads) partial_reduce_shmem = compute_shmem(partial_reduce_threads) partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks @@ -286,7 +294,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T}, end partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A; - workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) + threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem) + # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem) GPUArrays.mapreducedim!(identity, op, R, partial; init) end From 6a2c51ba94c06cb95af7af2b5eecc7b3a62c5b3f Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:26:15 -0400 Subject: [PATCH 19/23] fix --- src/CUDAKernels.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index bf4c269023..0ef0a7d089 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -175,7 +175,9 @@ end function KI.kernel_max_work_group_size(::CUDABackend, kikern::KI.KIKernel{<:CUDABackend}; max_work_items::Int=typemax(Int))::Int - Int(min(kikern.kern.pipeline.maxTotalThreadsPerThreadgroup, max_work_items)) + kernel_config = launch_configuration(kikern.kern.fun) + + Int(min(kernel_config.threads, max_work_items)) end function KI.max_work_group_size(::CUDABackend)::Int Int(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)) From b63f875f099c811e752e5a1683dac615e8e94878 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 4 Nov 2025 13:26:20 -0400 Subject: [PATCH 20/23] New --- src/CUDAKernels.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl index 0ef0a7d089..995728201d 100644 --- a/src/CUDAKernels.jl +++ b/src/CUDAKernels.jl @@ -1,7 +1,7 @@ module CUDAKernels using ..CUDA -using ..CUDA: @device_override, CUSPARSE, default_memory, UnifiedMemory +using ..CUDA: @device_override, CUSPARSE, default_memory, UnifiedMemory, cufunction, cudaconvert import KernelAbstractions as KA import KernelAbstractions: KernelIntrinsics as KI @@ -158,11 +158,10 @@ function (obj::KA.Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=n return nothing end +KI.kiconvert(::CUDABackend, arg) = cudaconvert(arg) -function KI.KIKernel(::CUDABackend, f, args...; kwargs...) - kern = eval(quote - @cuda launch=false $(kwargs...) $(f)($(args...)) - end) +function KI.kifunction(::CUDABackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + kern = cufunction(f, tt; name, kwargs...) 
KI.KIKernel{CUDABackend, typeof(kern)}(CUDABackend(), kern) end From f61fd57e6b41b30f01f6234a7d2f156f2d470d3d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 4 Aug 2025 16:54:37 -0300 Subject: [PATCH 21/23] Test GPUArrays `reverse` [only julia] [only benchmarks] --- perf/Project.toml | 1 + perf/runbenchmarks.jl | 2 + src/reverse.jl | 280 +++++++++++++++++++++--------------------- test/base/array.jl | 88 ++++++------- test/runtests.jl | 3 + 5 files changed, 190 insertions(+), 184 deletions(-) diff --git a/perf/Project.toml b/perf/Project.toml index 8314c7b285..9b028ce66a 100644 --- a/perf/Project.toml +++ b/perf/Project.toml @@ -2,5 +2,6 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 72d8b1e76a..21e9a96aac 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -1,4 +1,6 @@ # benchmark suite execution and codespeed submission +using Pkg +Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="reverse") using CUDA diff --git a/src/reverse.jl b/src/reverse.jl index d0c73da2ec..94914941f5 100644 --- a/src/reverse.jl +++ b/src/reverse.jl @@ -1,155 +1,155 @@ # reversing -# the kernel works by treating the array as 1d. after reversing by dimension x an element at -# pos [i1, i2, i3, ... , i{x}, ..., i{n}] will be at -# pos [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}] where d{x} is the size of dimension x - -# out-of-place version, copying a single value per thread from input to output -function _reverse(input::AnyCuArray{T, N}, output::AnyCuArray{T, N}; - dims=1:ndims(input)) where {T, N} - @assert size(input) == size(output) - rev_dims = ntuple((d)-> d in dims && size(input, d) > 1, N) - ref = size(input) .+ 1 - # converts an ND-index in the data array to the linear index - lin_idx = LinearIndices(input) - # converts a linear index in a reduced array to an ND-index, but using the reduced size - nd_idx = CartesianIndices(input) - - ## COV_EXCL_START - function kernel(input::AbstractArray{T, N}, output::AbstractArray{T, N}) where {T, N} - offset_in = blockDim().x * (blockIdx().x - 1i32) - index_in = offset_in + threadIdx().x - - @inbounds if index_in <= length(input) - idx = Tuple(nd_idx[index_in]) - idx = ifelse.(rev_dims, ref .- idx, idx) - index_out = lin_idx[idx...] - output[index_out] = input[index_in] - end - - return - end - ## COV_EXCL_STOP - - nthreads = 256 - nblocks = cld(length(input), nthreads) - - @cuda threads=nthreads blocks=nblocks kernel(input, output) -end - -# in-place version, swapping elements on half the number of threads -function _reverse!(data::AnyCuArray{T, N}; dims=1:ndims(data)) where {T, N} - rev_dims = ntuple((d)-> d in dims && size(data, d) > 1, N) - half_dim = findlast(rev_dims) - if isnothing(half_dim) - # no reverse operation needed at all in this case. 
- return - end - ref = size(data) .+ 1 - # converts an ND-index in the data array to the linear index - lin_idx = LinearIndices(data) - reduced_size = ntuple((d)->ifelse(d==half_dim, cld(size(data,d),2), size(data,d)), N) - reduced_length = prod(reduced_size) - # converts a linear index in a reduced array to an ND-index, but using the reduced size - nd_idx = CartesianIndices(reduced_size) - - ## COV_EXCL_START - function kernel(data::AbstractArray{T, N}) where {T, N} - offset_in = blockDim().x * (blockIdx().x - 1i32) - - index_in = offset_in + threadIdx().x - - @inbounds if index_in <= reduced_length - idx = Tuple(nd_idx[index_in]) - index_in = lin_idx[idx...] - idx = ifelse.(rev_dims, ref .- idx, idx) - index_out = lin_idx[idx...] - - if index_in < index_out - temp = data[index_out] - data[index_out] = data[index_in] - data[index_in] = temp - end - end - - return - end - ## COV_EXCL_STOP - - # NOTE: we launch slightly more than half the number of elements in the array as threads. - # The last non-singleton dimension along which to reverse is used to define how the array is split. - # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by - # ignoring the threads that cross the mid-point - - nthreads = 256 - nblocks = cld(prod(reduced_size), nthreads) - - @cuda threads=nthreads blocks=nblocks kernel(data) -end +# # the kernel works by treating the array as 1d. after reversing by dimension x an element at +# # pos [i1, i2, i3, ... , i{x}, ..., i{n}] will be at +# # pos [i1, i2, i3, ... , d{x} - i{x} + 1, ..., i{n}] where d{x} is the size of dimension x + +# # out-of-place version, copying a single value per thread from input to output +# function _reverse(input::AnyCuArray{T, N}, output::AnyCuArray{T, N}; +# dims=1:ndims(input)) where {T, N} +# @assert size(input) == size(output) +# rev_dims = ntuple((d)-> d in dims && size(input, d) > 1, N) +# ref = size(input) .+ 1 +# # converts an ND-index in the data array to the linear index +# lin_idx = LinearIndices(input) +# # converts a linear index in a reduced array to an ND-index, but using the reduced size +# nd_idx = CartesianIndices(input) + +# ## COV_EXCL_START +# function kernel(input::AbstractArray{T, N}, output::AbstractArray{T, N}) where {T, N} +# offset_in = blockDim().x * (blockIdx().x - 1i32) +# index_in = offset_in + threadIdx().x + +# @inbounds if index_in <= length(input) +# idx = Tuple(nd_idx[index_in]) +# idx = ifelse.(rev_dims, ref .- idx, idx) +# index_out = lin_idx[idx...] +# output[index_out] = input[index_in] +# end + +# return +# end +# ## COV_EXCL_STOP + +# nthreads = 256 +# nblocks = cld(length(input), nthreads) + +# @cuda threads=nthreads blocks=nblocks kernel(input, output) +# end + +# # in-place version, swapping elements on half the number of threads +# function _reverse!(data::AnyCuArray{T, N}; dims=1:ndims(data)) where {T, N} +# rev_dims = ntuple((d)-> d in dims && size(data, d) > 1, N) +# half_dim = findlast(rev_dims) +# if isnothing(half_dim) +# # no reverse operation needed at all in this case. 
+# return +# end +# ref = size(data) .+ 1 +# # converts an ND-index in the data array to the linear index +# lin_idx = LinearIndices(data) +# reduced_size = ntuple((d)->ifelse(d==half_dim, cld(size(data,d),2), size(data,d)), N) +# reduced_length = prod(reduced_size) +# # converts a linear index in a reduced array to an ND-index, but using the reduced size +# nd_idx = CartesianIndices(reduced_size) + +# ## COV_EXCL_START +# function kernel(data::AbstractArray{T, N}) where {T, N} +# offset_in = blockDim().x * (blockIdx().x - 1i32) + +# index_in = offset_in + threadIdx().x + +# @inbounds if index_in <= reduced_length +# idx = Tuple(nd_idx[index_in]) +# index_in = lin_idx[idx...] +# idx = ifelse.(rev_dims, ref .- idx, idx) +# index_out = lin_idx[idx...] + +# if index_in < index_out +# temp = data[index_out] +# data[index_out] = data[index_in] +# data[index_in] = temp +# end +# end + +# return +# end +# ## COV_EXCL_STOP + +# # NOTE: we launch slightly more than half the number of elements in the array as threads. +# # The last non-singleton dimension along which to reverse is used to define how the array is split. +# # Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by +# # ignoring the threads that cross the mid-point + +# nthreads = 256 +# nblocks = cld(prod(reduced_size), nthreads) + +# @cuda threads=nthreads blocks=nblocks kernel(data) +# end # n-dimensional API -function Base.reverse!(data::AnyCuArray{T, N}; dims=:) where {T, N} - if isa(dims, Colon) - dims = 1:ndims(data) - end - if !applicable(iterate, dims) - throw(ArgumentError("dimension $dims is not an iterable")) - end - if !all(1 .≤ dims .≤ ndims(data)) - throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))")) - end - - _reverse!(data; dims=dims) - - return data -end - -# out-of-place -function Base.reverse(input::AnyCuArray{T, N}; dims=:) where {T, N} - if isa(dims, Colon) - dims = 1:ndims(input) - end - if !applicable(iterate, dims) - throw(ArgumentError("dimension $dims is not an iterable")) - end - if !all(1 .≤ dims .≤ ndims(input)) - throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))")) - end - - if all(size(input)[[dims...]].==1) - # no reverse operation needed at all in this case. - return copy(input) - else - output = similar(input) - _reverse(input, output; dims=dims) - return output - end -end +# function Base.reverse!(data::AnyCuArray{T, N}; dims=:) where {T, N} +# if isa(dims, Colon) +# dims = 1:ndims(data) +# end +# if !applicable(iterate, dims) +# throw(ArgumentError("dimension $dims is not an iterable")) +# end +# if !all(1 .≤ dims .≤ ndims(data)) +# throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))")) +# end + +# _reverse!(data; dims=dims) + +# return data +# end + +# # out-of-place +# function Base.reverse(input::AnyCuArray{T, N}; dims=:) where {T, N} +# if isa(dims, Colon) +# dims = 1:ndims(input) +# end +# if !applicable(iterate, dims) +# throw(ArgumentError("dimension $dims is not an iterable")) +# end +# if !all(1 .≤ dims .≤ ndims(input)) +# throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))")) +# end + +# if all(size(input)[[dims...]].==1) +# # no reverse operation needed at all in this case. 
+# return copy(input) +# else +# output = similar(input) +# _reverse(input, output; dims=dims) +# return output +# end +# end # 1-dimensional API -# in-place -Base.@propagate_inbounds function Base.reverse!(data::AnyCuVector{T}, start::Integer, - stop::Integer=length(data)) where {T} - _reverse!(view(data, start:stop)) - return data -end +# # in-place +# Base.@propagate_inbounds function Base.reverse!(data::AnyCuVector{T}, start::Integer, +# stop::Integer=length(data)) where {T} +# _reverse!(view(data, start:stop)) +# return data +# end -Base.reverse!(data::AnyCuVector{T}) where {T} = @inbounds reverse!(data, 1, length(data)) +# Base.reverse!(data::AnyCuVector{T}) where {T} = @inbounds reverse!(data, 1, length(data)) -# out-of-place -Base.@propagate_inbounds function Base.reverse(input::AnyCuVector{T}, start::Integer, - stop::Integer=length(input)) where {T} - output = similar(input) +# # out-of-place +# Base.@propagate_inbounds function Base.reverse(input::AnyCuVector{T}, start::Integer, +# stop::Integer=length(input)) where {T} +# output = similar(input) - start > 1 && copyto!(output, 1, input, 1, start-1) - _reverse(view(input, start:stop), view(output, start:stop)) - stop < length(input) && copyto!(output, stop+1, input, stop+1) +# start > 1 && copyto!(output, 1, input, 1, start-1) +# _reverse(view(input, start:stop), view(output, start:stop)) +# stop < length(input) && copyto!(output, stop+1, input, stop+1) - return output -end +# return output +# end -Base.reverse(data::AnyCuVector{T}) where {T} = @inbounds reverse(data, 1, length(data)) +# Base.reverse(data::AnyCuVector{T}) where {T} = @inbounds reverse(data, 1, length(data)) diff --git a/test/base/array.jl b/test/base/array.jl index 4c0ebca8da..a42963f634 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -478,50 +478,50 @@ end @test Array(x) == zeros(4) end -@testset "reverse" begin - # 1-d out-of-place - @test testf(x->reverse(x), rand(1000)) - @test testf(x->reverse(x, 10), rand(1000)) - @test testf(x->reverse(x, 10, 90), rand(1000)) - - # 1-d in-place - @test testf(x->reverse!(x), rand(1000)) - @test testf(x->reverse!(x, 10), rand(1000)) - @test testf(x->reverse!(x, 10, 90), rand(1000)) - - # n-d out-of-place - for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]), - dim in 1:length(shape) - @test testf(x->reverse(x; dims=dim), rand(shape...)) - - cpu = rand(shape...) - gpu = CuArray(cpu) - reverse!(gpu; dims=dim) - @test Array(gpu) == reverse(cpu; dims=dim) - end - - # supports multidimensional reverse - for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]), - dim in ((1,2),(2,3),(1,3),:) - @test testf(x->reverse(x; dims=dim), rand(shape...)) - - cpu = rand(shape...) 
- gpu = CuArray(cpu) - reverse!(gpu; dims=dim) - @test Array(gpu) == reverse(cpu; dims=dim) - end - - # wrapped array - @test testf(x->reverse(x), reshape(rand(2,2), 4)) - - # error throwing - cpu = rand(1,2,3,4) - gpu = CuArray(cpu) - @test_throws ArgumentError reverse!(gpu, dims=5) - @test_throws ArgumentError reverse!(gpu, dims=0) - @test_throws ArgumentError reverse(gpu, dims=5) - @test_throws ArgumentError reverse(gpu, dims=0) -end +# @testset "reverse" begin +# # 1-d out-of-place +# @test testf(x->reverse(x), rand(1000)) +# @test testf(x->reverse(x, 10), rand(1000)) +# @test testf(x->reverse(x, 10, 90), rand(1000)) + +# # 1-d in-place +# @test testf(x->reverse!(x), rand(1000)) +# @test testf(x->reverse!(x, 10), rand(1000)) +# @test testf(x->reverse!(x, 10, 90), rand(1000)) + +# # n-d out-of-place +# for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]), +# dim in 1:length(shape) +# @test testf(x->reverse(x; dims=dim), rand(shape...)) + +# cpu = rand(shape...) +# gpu = CuArray(cpu) +# reverse!(gpu; dims=dim) +# @test Array(gpu) == reverse(cpu; dims=dim) +# end + +# # supports multidimensional reverse +# for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]), +# dim in ((1,2),(2,3),(1,3),:) +# @test testf(x->reverse(x; dims=dim), rand(shape...)) + +# cpu = rand(shape...) +# gpu = CuArray(cpu) +# reverse!(gpu; dims=dim) +# @test Array(gpu) == reverse(cpu; dims=dim) +# end + +# # wrapped array +# @test testf(x->reverse(x), reshape(rand(2,2), 4)) + +# # error throwing +# cpu = rand(1,2,3,4) +# gpu = CuArray(cpu) +# @test_throws ArgumentError reverse!(gpu, dims=5) +# @test_throws ArgumentError reverse!(gpu, dims=0) +# @test_throws ArgumentError reverse(gpu, dims=5) +# @test_throws ArgumentError reverse(gpu, dims=0) +# end @testset "findall" begin # 1D diff --git a/test/runtests.jl b/test/runtests.jl index 1f651bc78d..bffaea670c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,3 +1,6 @@ +using Pkg +Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="reverse") + using Distributed using Dates import REPL From e2f192a55b811b51cc2517b039c3d86d4931cfcb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 5 Aug 2025 11:19:24 -0300 Subject: [PATCH 22/23] temp --- perf/array.jl | 230 +++++++++++++++++++++--------------------- perf/runbenchmarks.jl | 26 ++--- 2 files changed, 128 insertions(+), 128 deletions(-) diff --git a/perf/array.jl b/perf/array.jl index 30348a5120..d2df04f371 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -19,37 +19,37 @@ gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) gpu_mat_bools = CuArray(rand(rng, Bool, m, n)) gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) -group["construct"] = @benchmarkable CuArray{Int}(undef, 1) +# group["construct"] = @benchmarkable CuArray{Int}(undef, 1) -group["copy"] = @async_benchmarkable copy($gpu_mat) +# group["copy"] = @async_benchmarkable copy($gpu_mat) -gpu_mat2 = copy(gpu_mat) -let group = addgroup!(group, "copyto!") - group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat) - group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat) - group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat) -end +# gpu_mat2 = copy(gpu_mat) +# let group = addgroup!(group, "copyto!") +# group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat) +# group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat) +# group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat) +# end -let 
group = addgroup!(group, "iteration") - group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10] +# let group = addgroup!(group, "iteration") +# group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10] - group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] +# group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] - let group = addgroup!(group, "findall") - group["bool"] = @benchmarkable findall($gpu_vec_bools) - group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) - end +# let group = addgroup!(group, "findall") +# group["bool"] = @benchmarkable findall($gpu_vec_bools) +# group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) +# end - let group = addgroup!(group, "findfirst") - group["bool"] = @benchmarkable findfirst($gpu_vec_bools) - group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) - end +# let group = addgroup!(group, "findfirst") +# group["bool"] = @benchmarkable findfirst($gpu_vec_bools) +# group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) +# end - let group = addgroup!(group, "findmin") # findmax - group["1d"] = @async_benchmarkable findmin($gpu_vec) - group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1) - end -end +# let group = addgroup!(group, "findmin") # findmax +# group["1d"] = @async_benchmarkable findmin($gpu_vec) +# group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1) +# end +# end let group = addgroup!(group, "reverse") group["1d"] = @async_benchmarkable reverse($gpu_vec) @@ -62,94 +62,94 @@ let group = addgroup!(group, "reverse") group["2dL_inplace"] = @async_benchmarkable reverse!($gpu_mat_long; dims=2) end -group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 - -# no need to test inplace version, which performs the same operation (but with an alloc) -let group = addgroup!(group, "accumulate") - let group = addgroup!(group, "Float32") - group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec) - group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1) - group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2) - - group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1) - group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2) - end - let group = addgroup!(group, "Int64") - group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints) - group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1) - group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2) - - group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1) - group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2) - end -end - -let group = addgroup!(group, "reductions") - let group = addgroup!(group, "reduce") - let group = addgroup!(group, "Float32") - group["1d"] = @async_benchmarkable reduce(+, $gpu_vec) - group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1) - group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2) - group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1) - group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2) - end - let group = addgroup!(group, "Int64") - group["1d"] = @async_benchmarkable reduce(+, $gpu_vec_ints) - group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1) - group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2) - group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1) - group["dims=2L"] = 
@async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2) - end - end - - let group = addgroup!(group, "mapreduce") - let group = addgroup!(group, "Float32") - group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec) - group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1) - group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2) - group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1) - group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2) - end - let group = addgroup!(group, "Int64") - group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints) - group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1) - group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2) - group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1) - group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2) - end - end - - # used by sum, prod, minimum, maximum, all, any, count -end - -let group = addgroup!(group, "random") - let group = addgroup!(group, "rand") - group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n) - group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n) - end - - let group = addgroup!(group, "rand!") - group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec) - group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints) - end - - let group = addgroup!(group, "randn") - group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n) - end - - let group = addgroup!(group, "randn!") - group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec) - end -end - -let group = addgroup!(group, "sorting") - group["1d"] = @async_benchmarkable sort($gpu_vec) - group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) - group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) -end - -let group = addgroup!(group, "permutedims") - group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) - group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2)) - group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3)) -end +# group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 + +# # no need to test inplace version, which performs the same operation (but with an alloc) +# let group = addgroup!(group, "accumulate") +# let group = addgroup!(group, "Float32") +# group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec) +# group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1) +# group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=2) + +# group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=1) +# group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long; dims=2) +# end +# let group = addgroup!(group, "Int64") +# group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec_ints) +# group["dims=1"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=1) +# group["dims=2"] = @async_benchmarkable accumulate(+, $gpu_mat_ints; dims=2) + +# group["dims=1L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=1) +# group["dims=2L"] = @async_benchmarkable accumulate(+, $gpu_mat_long_ints; dims=2) +# end +# end + +# let group = addgroup!(group, "reductions") +# let group = addgroup!(group, "reduce") +# let group = addgroup!(group, "Float32") +# group["1d"] = @async_benchmarkable reduce(+, $gpu_vec) +# group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1) +# 
group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat; dims=2) +# group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=1) +# group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long; dims=2) +# end +# let group = addgroup!(group, "Int64") +# group["1d"] = @async_benchmarkable reduce(+, $gpu_vec_ints) +# group["dims=1"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=1) +# group["dims=2"] = @async_benchmarkable reduce(+, $gpu_mat_ints; dims=2) +# group["dims=1L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=1) +# group["dims=2L"] = @async_benchmarkable reduce(+, $gpu_mat_long_ints; dims=2) +# end +# end + +# let group = addgroup!(group, "mapreduce") +# let group = addgroup!(group, "Float32") +# group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec) +# group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1) +# group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=2) +# group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=1) +# group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long; dims=2) +# end +# let group = addgroup!(group, "Int64") +# group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec_ints) +# group["dims=1"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=1) +# group["dims=2"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_ints; dims=2) +# group["dims=1L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=1) +# group["dims=2L"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat_long_ints; dims=2) +# end +# end + +# # used by sum, prod, minimum, maximum, all, any, count +# end + +# let group = addgroup!(group, "random") +# let group = addgroup!(group, "rand") +# group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n) +# group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n) +# end + +# let group = addgroup!(group, "rand!") +# group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec) +# group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints) +# end + +# let group = addgroup!(group, "randn") +# group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n) +# end + +# let group = addgroup!(group, "randn!") +# group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec) +# end +# end + +# let group = addgroup!(group, "sorting") +# group["1d"] = @async_benchmarkable sort($gpu_vec) +# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) +# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) +# end + +# let group = addgroup!(group, "permutedims") +# group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) +# group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2)) +# group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3)) +# end diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index 21e9a96aac..f66a1a8a9b 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -18,13 +18,13 @@ end # before anything else, run latency benchmarks. these spawn subprocesses, so we don't want # to do so after regular benchmarks have caused the memory allocator to reserve memory. 
-@info "Running latency benchmarks" -latency_results = include("latency.jl") +# @info "Running latency benchmarks" +# latency_results = include("latency.jl") SUITE = BenchmarkGroup() -include("cuda.jl") -include("kernel.jl") +# include("cuda.jl") +# include("kernel.jl") include("array.jl") @info "Preparing main benchmarks" @@ -36,20 +36,20 @@ GC.gc(true) CUDA.reclaim() # benchmark groups that aren't part of the suite -addgroup!(SUITE, "integration") +# addgroup!(SUITE, "integration") @info "Running main benchmarks" results = run(SUITE, verbose=true) # integration tests (that do nasty things, so need to be run last) -@info "Running integration benchmarks" -integration_results = BenchmarkGroup() -integration_results["volumerhs"] = include("volumerhs.jl") -integration_results["byval"] = include("byval.jl") -integration_results["cudadevrt"] = include("cudadevrt.jl") - -results["latency"] = latency_results -results["integration"] = integration_results +# @info "Running integration benchmarks" +# integration_results = BenchmarkGroup() +# integration_results["volumerhs"] = include("volumerhs.jl") +# integration_results["byval"] = include("byval.jl") +# integration_results["cudadevrt"] = include("cudadevrt.jl") + +# results["latency"] = latency_results +# results["integration"] = integration_results # write out the results result_file = length(ARGS) >= 1 ? ARGS[1] : "benchmarkresults.json" From 69871e593e6e1b878ac4fde022481e2959cedbc2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:05:40 -0400 Subject: [PATCH 23/23] trhws --- Project.toml | 4 ++++ perf/runbenchmarks.jl | 2 -- test/runtests.jl | 3 --- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index 5dba52ac6f..4c7d6e5bce 100644 --- a/Project.toml +++ b/Project.toml @@ -44,6 +44,10 @@ EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" SparseMatricesCSR = "a0a7dd2c-ebf4-11e9-1f05-cf50bc540ca1" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +[sources] +GPUArrays = {rev = "reverse", url = "https://github.com/christiangnrd/GPUArrays.jl"} +KernelAbstractions = {rev = "intrinsics", url = "https://github.com/christiangnrd/KernelAbstractions.jl"} + [extensions] ChainRulesCoreExt = "ChainRulesCore" EnzymeCoreExt = "EnzymeCore" diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index f66a1a8a9b..677f8e8b8a 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -1,6 +1,4 @@ # benchmark suite execution and codespeed submission -using Pkg -Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="reverse") using CUDA diff --git a/test/runtests.jl b/test/runtests.jl index bffaea670c..1f651bc78d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,3 @@ -using Pkg -Pkg.add(url="https://github.com/christiangnrd/GPUArrays.jl", rev="reverse") - using Distributed using Dates import REPL