@@ -201,11 +201,14 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
     if length(Rother) >= serial_mapreduce_threshold(dev)
         args = (f, op, init, Rreduce, Rother, R, A)
-        kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
-        kernel_config = launch_configuration(kernel.kern.fun)
+        # kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
+        kernel = @cuda launch=false serial_mapreduce_kernel(args...)
+        # kernel_config = launch_configuration(kernel.kern.fun)
+        kernel_config = launch_configuration(kernel.fun)
         threads = kernel_config.threads
         blocks = cld(length(Rother), threads)
-        kernel(args...; workgroupsize=threads, numworkgroups=blocks)
+        # kernel(args...; workgroupsize=threads, numworkgroups=blocks)
+        kernel(args...; threads, blocks)
         return R
     end

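For context, the native CUDA.jl pattern this hunk switches to — compile with `@cuda launch=false`, query `launch_configuration` on the resulting `kernel.fun`, then launch with plain `threads`/`blocks` keywords — looks like the following minimal standalone sketch. The `vadd_kernel` is a made-up example, not part of this PR:

```julia
using CUDA

# hypothetical elementwise kernel, just to demonstrate the launch pattern
function vadd_kernel(c, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[i]
    end
    return
end

a = CUDA.rand(Float32, 1_000_000)
b = CUDA.rand(Float32, 1_000_000)
c = similar(a)

# compile without launching, then ask the occupancy API for a good configuration
kernel = @cuda launch=false vadd_kernel(c, a, b)
config = launch_configuration(kernel.fun)
threads = min(length(c), config.threads)
blocks = cld(length(c), threads)

# launch the pre-compiled kernel with the computed configuration
kernel(c, a, b; threads, blocks)
```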
@@ -228,9 +231,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
-    kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
+    # kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
+    kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A)
     compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
-    kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+    # kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+    kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads)
     reduce_threads = compute_threads(kernel_config.threads)
     reduce_shmem = compute_shmem(reduce_threads)

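The `shmem=compute_shmem∘compute_threads` keyword above passes a callable, so the occupancy query can size dynamic shared memory as a function of the candidate block size. A minimal standalone sketch of that pattern, assuming a hypothetical `block_sum_kernel` rather than the PR's `partial_mapreduce_grid`:

```julia
using CUDA

# hypothetical single-block sum backed by dynamic shared memory
function block_sum_kernel(out, x)
    tid = threadIdx().x
    n = blockDim().x
    buf = CuDynamicSharedArray(Float32, n)
    # each thread accumulates a strided slice of the input
    acc = 0f0
    i = tid
    while i <= length(x)
        @inbounds acc += x[i]
        i += n
    end
    @inbounds buf[tid] = acc
    sync_threads()
    # tree reduction in shared memory (n assumed to be a power of two)
    s = n ÷ 2
    while s > 0
        if tid <= s
            @inbounds buf[tid] += buf[tid + s]
        end
        sync_threads()
        s ÷= 2
    end
    tid == 1 && (@inbounds out[1] = buf[1])
    return
end

x = CUDA.rand(Float32, 10_000)
out = CUDA.zeros(Float32, 1)

kernel = @cuda launch=false block_sum_kernel(out, x)
compute_shmem(threads) = threads * sizeof(Float32)
# passing a callable lets the occupancy query account for per-thread shared memory
config = launch_configuration(kernel.fun; shmem=compute_shmem)
threads = prevpow(2, config.threads)
kernel(out, x; threads, blocks=1, shmem=compute_shmem(threads))
```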
@@ -255,7 +260,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # perform the actual reduction
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
-        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem)
+        # kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem)
+        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
     else
         # TODO: provide a version that atomically reduces from different blocks

@@ -265,8 +271,10 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         # NOTE: we can't use the previously-compiled kernel, or its launch configuration,
         # since the type of `partial` might not match the original output container
         # (e.g. if that was a view).
-        partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
-        partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+        # partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
+        partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
+        # partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+        partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
         partial_reduce_threads = compute_threads(partial_kernel_config.threads)
         partial_reduce_shmem = compute_shmem(partial_reduce_threads)
         partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks
@@ -286,7 +294,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end

         partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                       workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
+                       # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)

         GPUArrays.mapreducedim!(identity, op, R, partial; init)
     end
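Regarding the NOTE about recompiling for `partial`: a kernel object produced by `@cuda launch=false` is specialized on the exact argument types it was compiled with, which is why a differently-typed output container (such as a view) gets its own kernel object and launch configuration. A minimal illustration, using a hypothetical `copy_kernel` rather than anything from this PR:

```julia
using CUDA

# hypothetical copy kernel to illustrate type specialization
function copy_kernel(dst, src)
    i = threadIdx().x
    @inbounds dst[i] = src[i]
    return
end

a = CUDA.zeros(Float32, 4)
b = CUDA.rand(Float32, 4)

# the compiled kernel object is specialized on the exact argument types...
k = @cuda launch=false copy_kernel(a, b)
k(a, b; threads=4)

# ...so a view (a SubArray, i.e. a different type) gets its own compiled kernel
v = view(CUDA.zeros(Float32, 8), 1:4)
k_view = @cuda launch=false copy_kernel(v, b)
k_view(v, b; threads=4)
```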