Commit d341f2d

Revert KIKernel
1 parent: 46cbc05

File tree: 1 file changed (+18 −9)


src/mapreduce.jl

Lines changed: 18 additions & 9 deletions
@@ -201,11 +201,14 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # If `Rother` is large enough, then a naive loop is more efficient than partial reductions.
     if length(Rother) >= serial_mapreduce_threshold(dev)
         args = (f, op, init, Rreduce, Rother, R, A)
-        kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
-        kernel_config = launch_configuration(kernel.kern.fun)
+        # kernel = KI.KIKernel(backend, serial_mapreduce_kernel, args...)
+        kernel = @cuda launch=false serial_mapreduce_kernel(args...)
+        # kernel_config = launch_configuration(kernel.kern.fun)
+        kernel_config = launch_configuration(kernel.fun)
         threads = kernel_config.threads
         blocks = cld(length(Rother), threads)
-        kernel(args...; workgroupsize=threads, numworkgroups=blocks)
+        # kernel(args...; workgroupsize=threads, numworkgroups=blocks)
+        kernel(args...; threads, blocks)
         return R
     end
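
For context (not part of the commit): this hunk reverts the serial path from the `KI.KIKernel` wrapper to CUDA.jl's native launch pattern, where `@cuda launch=false` compiles a kernel without running it, `launch_configuration` asks the occupancy API for a suggested thread count, and the kernel object is then called with `threads`/`blocks` keywords. A minimal sketch of that pattern, using a hypothetical `demo_kernel` in place of `serial_mapreduce_kernel`:

using CUDA

# Hypothetical stand-in kernel: each thread writes its global linear index.
function demo_kernel(out)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(out)
        @inbounds out[i] = i
    end
    return
end

out = CuArray{Int32}(undef, 10_000)
kernel = @cuda launch=false demo_kernel(out)   # compile only, don't launch
config = launch_configuration(kernel.fun)      # occupancy-based suggestion
threads = min(length(out), config.threads)
blocks = cld(length(out), threads)
kernel(out; threads, blocks)                   # launch with the chosen shape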

@@ -228,9 +231,11 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # we might not be able to launch all those threads to reduce each slice in one go.
     # that's why each thread also loops across its inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single thread block.
-    kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
+    # kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), R, A)
+    kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), R, A)
     compute_shmem(threads) = shuffle ? 0 : threads*sizeof(T)
-    kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+    # kernel_config = launch_configuration(kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+    kernel_config = launch_configuration(kernel.fun; shmem=compute_shmem∘compute_threads)
     reduce_threads = compute_threads(kernel_config.threads)
     reduce_shmem = compute_shmem(reduce_threads)
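
Again for context: `launch_configuration` accepts `shmem` either as a fixed byte count or as a function mapping a candidate thread count to the dynamic shared memory it would need, which the occupancy calculator invokes while searching for a block size. The composition `compute_shmem ∘ compute_threads` therefore first applies the kernel's own thread-count rounding (`compute_threads` is used here but defined elsewhere in mapreduce.jl, outside this diff) and only then sizes the shared memory. A sketch with stand-in definitions, reusing `kernel` from the sketch above:

# Stand-ins for illustration only; the real helpers live in mapreduce.jl.
compute_threads(threads) = prevpow(2, threads)          # e.g. round down to a power of two
compute_shmem(threads)   = threads * sizeof(Float32)    # one shared-memory slot per thread

config  = launch_configuration(kernel.fun; shmem = compute_shmem ∘ compute_threads)
threads = compute_threads(config.threads)
shmem   = compute_shmem(threads)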

@@ -255,7 +260,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
     # perform the actual reduction
     if reduce_blocks == 1
         # we can cover the dimensions to reduce using a single block
-        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem)
+        # kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; workgroupsize=threads, numworkgroups=blocks, shmem)
+        kernel(f, op, init, Rreduce, Rother, Val(shuffle), R, A; threads, blocks, shmem)
     else
         # TODO: provide a version that atomically reduces from different blocks

@@ -265,8 +271,10 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         # NOTE: we can't use the previously-compiled kernel, or its launch configuration,
         # since the type of `partial` might not match the original output container
         # (e.g. if that was a view).
-        partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
-        partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+        # partial_kernel = KI.KIKernel(backend, partial_mapreduce_grid, f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
+        partial_kernel = @cuda launch=false partial_mapreduce_grid(f, op, init, Rreduce, Rother, Val(shuffle), partial, A)
+        # partial_kernel_config = launch_configuration(partial_kernel.kern.fun; shmem=compute_shmem∘compute_threads)
+        partial_kernel_config = launch_configuration(partial_kernel.fun; shmem=compute_shmem∘compute_threads)
         partial_reduce_threads = compute_threads(partial_kernel_config.threads)
         partial_reduce_shmem = compute_shmem(partial_reduce_threads)
         partial_reduce_blocks = if other_blocks >= partial_kernel_config.blocks
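
The NOTE above is why this hunk compiles a second kernel rather than reusing `kernel`: `@cuda launch=false` specializes the compiled code on its argument types, and `partial` (a freshly allocated array) generally has a different type from `R`, which may be a view. A sketch of that behaviour, reusing the hypothetical `demo_kernel` from above:

R = CUDA.zeros(Float32, 10)
V = view(R, 1:5)
k1 = @cuda launch=false demo_kernel(R)   # specialized for the CuArray type
k2 = @cuda launch=false demo_kernel(V)   # separate specialization for the SubArray
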
@@ -286,7 +294,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyCuArray{T},
         end

        partial_kernel(f, op, init, Rreduce, Rother, Val(shuffle), partial, A;
-                       workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)
+                       threads=partial_threads, blocks=partial_blocks, shmem=partial_shmem)
+        # workgroupsize=partial_threads, numworkgroups=partial_blocks, shmem=partial_shmem)

        GPUArrays.mapreducedim!(identity, op, R, partial; init)
    end
