@@ -201,11 +201,14 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
     if length(Rother) >= serial_mapreduce_threshold(dev)
         args = (f, op, init, Rreduce, Rother, R, A)
-        kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
-        kernel_config = launch_configuration(kernel.kern.fun)
+        # kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
+        kernel = @cuda launch=false serial_mapreduce_kernel(args...)
+        # kernel_config = launch_configuration(kernel.kern.fun)
+        kernel_config = launch_configuration(kernel.fun)
         threads = kernel_config.threads
         blocks = cld(length(Rother), threads)
-        kernel(args...; workgroupsize=threads, numworkgroups=blocks)
+        # kernel(args...; workgroupsize=threads, numworkgroups=blocks)
+        kernel(args...; threads, blocks)
         return R
     end

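For context, the native CUDA.jl pattern this hunk switches to — compile with `@cuda launch=false`, query `launch_configuration` on the resulting `kernel.fun`, then launch with plain `threads`/`blocks` keywords — looks like the following minimal standalone sketch. The `vadd_kernel` is a made-up example, not part of this PR:

```julia
using CUDA

# hypothetical elementwise kernel, just to demonstrate the launch pattern
function vadd_kernel(c, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[i]
    end
    return
end

a = CUDA.rand(Float32, 1_000_000)
b = CUDA.rand(Float32, 1_000_000)
c = similar(a)

# compile without launching, then ask the occupancy API for a good configuration
kernel = @cuda launch=false vadd_kernel(c, a, b)
config = launch_configuration(kernel.fun)
threads = min(length(c), config.threads)
blocks = cld(length(c), threads)

# launch the pre-compiled kernel with the computed configuration
kernel(c, a, b; threads, blocks)
```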
@@ -228,9 +231,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
-    kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
+    # kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
+    kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A)
     compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
-    kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+    # kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+    kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads)
     reduce_threads = compute_threads(kernel_config.threads)
     reduce_shmem = compute_shmem(reduce_threads)

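The `shmem=compute_shmem∘compute_threads` keyword above passes a callable, so the occupancy query can size dynamic shared memory as a function of the candidate block size. A minimal standalone sketch of that pattern, assuming a hypothetical `block_sum_kernel` rather than the PR's `partial_mapreduce_grid`:

```julia
using CUDA

# hypothetical single-block sum backed by dynamic shared memory
function block_sum_kernel(out, x)
    tid = threadIdx().x
    n = blockDim().x
    buf = CuDynamicSharedArray(Float32, n)
    # each thread accumulates a strided slice of the input
    acc = 0f0
    i = tid
    while i <= length(x)
        @inbounds acc += x[i]
        i += n
    end
    @inbounds buf[tid] = acc
    sync_threads()
    # tree reduction in shared memory (n assumed to be a power of two)
    s = n ÷ 2
    while s > 0
        if tid <= s
            @inbounds buf[tid] += buf[tid + s]
        end
        sync_threads()
        s ÷= 2
    end
    tid == 1 && (@inbounds out[1] = buf[1])
    return
end

x = CUDA.rand(Float32, 10_000)
out = CUDA.zeros(Float32, 1)

kernel = @cuda launch=false block_sum_kernel(out, x)
compute_shmem(threads) = threads * sizeof(Float32)
# passing a callable lets the occupancy query account for per-thread shared memory
config = launch_configuration(kernel.fun; shmem=compute_shmem)
threads = prevpow(2, config.threads)
kernel(out, x; threads, blocks=1, shmem=compute_shmem(threads))
```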
@@ -255,7 +260,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # perform the actual reduction
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
-        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem)
+        # kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem)
+        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
     else
         # TODO: provide a version that atomically reduces from different blocks

@@ -265,8 +271,10 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         # NOTE: we can't use the previously-compiled kernel, or its launch configuration,
         # since the type of `partial` might not match the original output container
         # (e.g. if that was a view).
-        partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
-        partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+        # partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
+        partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
+        # partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+        partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
         partial_reduce_threads = compute_threads(partial_kernel_config.threads)
         partial_reduce_shmem = compute_shmem(partial_reduce_threads)
         partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks
@@ -286,7 +294,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end

         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                       workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
+                       # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)

         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
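Regarding the NOTE about recompiling for `partial`: a kernel object produced by `@cuda launch=false` is specialized on the exact argument types it was compiled with, which is why a differently-typed output container (such as a view) gets its own kernel object and launch configuration. A minimal illustration, using a hypothetical `copy_kernel` rather than anything from this PR:

```julia
using CUDA

# hypothetical copy kernel to illustrate type specialization
function copy_kernel(dst, src)
    i = threadIdx().x
    @inbounds dst[i] = src[i]
    return
end

a = CUDA.zeros(Float32, 4)
b = CUDA.rand(Float32, 4)

# the compiled kernel object is specialized on the exact argument types...
k = @cuda launch=false copy_kernel(a, b)
k(a, b; threads=4)

# ...so a view (a SubArray, i.e. a different type) gets its own compiled kernel
v = view(CUDA.zeros(Float32, 8), 1:4)
k_view = @cuda launch=false copy_kernel(v, b)
k_view(v, b; threads=4)
```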