
Commit 857669f: "KA 0.10"
Parent: a90ed05

File tree: 12 files changed (+171, -172 lines)


Project.toml
Lines changed: 2 additions & 2 deletions

@@ -1,7 +1,7 @@
 name = "AcceleratedKernels"
 uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
-version = "0.4.3"
+version = "0.5"

 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
@@ -19,7 +19,7 @@ AcceleratedKernelsoneAPIExt = "oneAPI"
 [compat]
 ArgCheck = "2"
 GPUArraysCore = "0.2.0"
-KernelAbstractions = "0.9.34"
+KernelAbstractions = "0.10"
 Markdown = "1"
 UnsafeAtomics = "0.3.0"
 julia = "1.10"
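
This is a breaking bump on both ends: the package version jumps from 0.4.3 to 0.5, and the [compat] entry now requires KernelAbstractions 0.10 instead of 0.9.34. Downstream projects pinned to AcceleratedKernels 0.4 will not pick up this release until they widen their own compat bound; below is a minimal sketch of doing that with Pkg (assuming Julia 1.8+ for `Pkg.compat`; the exact bound is the downstream package's call):

    using Pkg
    # In the downstream project's environment: allow both the 0.4 series and
    # the new 0.5 series of AcceleratedKernels in its [compat] section.
    Pkg.compat("AcceleratedKernels", "0.4, 0.5")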

src/AcceleratedKernels.jl
Lines changed: 0 additions & 1 deletion

@@ -14,7 +14,6 @@ module AcceleratedKernels
 using ArgCheck: @argcheck
 using GPUArraysCore: AnyGPUArray, @allowscalar
 using KernelAbstractions
-using KernelAbstractions: @context
 import UnsafeAtomics


src/accumulate/accumulate_1d_gpu.jl
Lines changed: 37 additions & 40 deletions

@@ -12,27 +12,28 @@ const ACC_FLAG_P::UInt8 = 1 # Only current block's prefix available
 end


-@kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_block!(
+function _accumulate_block!(
 op, v, init, neutral,
 inclusive,
 flags, prefixes, # one per block
-)
+::Val{block_size}
+) where block_size
+@inbounds begin
 # NOTE: shmem_size MUST be greater than 2 * block_size
 # NOTE: block_size MUST be a power of 2
 len = length(v)
-@uniform block_size = @groupsize()[1]
-temp = @localmem eltype(v) (0x2 * block_size + conflict_free_offset(0x2 * block_size),)
+temp = KI.localmemory(eltype(v), 0x2 * block_size + conflict_free_offset(0x2 * block_size))

 # NOTE: for many index calculations in this library, computation using zero-indexing leads to
 # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
 # indexing). Internal calculations will be done using zero indexing except when actually
 # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.

 # Group (block) and local (thread) indices
-iblock = @index(Group, Linear) - 0x1
-ithread = @index(Local, Linear) - 0x1
+iblock = KI.get_group_id().x - 0x1
+ithread = KI.get_local_id().x - 0x1

-num_blocks = @ndrange()[1] ÷ block_size
+num_blocks = KI.get_num_groups().x
 block_offset = iblock * block_size * 0x2 # Processing two elements per thread

 # Copy two elements from the main array; offset indices to avoid bank conflicts
@@ -59,7 +60,7 @@ end
 next_pow2 = block_size * 0x2
 d = next_pow2 >> 0x1
 while d > 0x0 # TODO: unroll this like in reduce.jl ?
-@synchronize()
+KI.barrier()

 if ithread < d
 _ai = offset * (0x2 * ithread + 0x1) - 0x1
@@ -84,7 +85,7 @@ end
 d = typeof(ithread)(1)
 while d < next_pow2
 offset = offset >> 0x1
-@synchronize()
+KI.barrier()

 if ithread < d
 _ai = offset * (0x2 * ithread + 0x1) - 0x1
@@ -103,10 +104,10 @@ end
 # Later blocks should always be inclusively-scanned
 if inclusive || iblock != 0x0
 # To compute an inclusive scan, shift elements left...
-@synchronize()
+KI.barrier()
 t1 = temp[ai + bank_offset_a + 0x1]
 t2 = temp[bi + bank_offset_b + 0x1]
-@synchronize()
+KI.barrier()

 if ai > 0x0
 temp[ai - 0x1 + conflict_free_offset(ai - 0x1) + 0x1] = t1
@@ -123,7 +124,7 @@ end
 end
 end

-@synchronize()
+KI.barrier()

 # Write this block's final prefix to global array and set flag to "block prefix computed"
 if bi == 0x2 * block_size - 0x1
@@ -145,24 +146,25 @@ end
 if block_offset + bi < len
 v[block_offset + bi + 0x1] = temp[bi + bank_offset_b + 0x1]
 end
+end
+nothing
 end


-@kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous!(
-op, v, flags, @Const(prefixes),
-)
-
+function _accumulate_previous!(
+op, v, flags, prefixes, ::Val{block_size}
+) where block_size
+@inbounds begin
 len = length(v)
-block_size = @groupsize()[1]

 # NOTE: for many index calculations in this library, computation using zero-indexing leads to
 # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
 # indexing). Internal calculations will be done using zero indexing except when actually
 # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.

 # Group (block) and local (thread) indices
-iblock = @index(Group, Linear) - 0x1 + 0x1 # Skipping first block
-ithread = @index(Local, Linear) - 0x1
+iblock = KI.get_group_id().x - 0x1 + 0x1 # Skipping first block
+ithread = KI.get_local_id().x - 0x1
 block_offset = iblock * block_size * 0x2 # Processing two elements per thread

 # Each block looks back to find running prefix sum
@@ -197,7 +199,7 @@ end
 # There are two synchronization concerns here:
 # 1. Withing a group we want to ensure that all writed to `v` have occured before setting the flag.
 # 2. Between groups we need to use a fence and atomic load/store to ensure that memory operations are not re-ordered
-@synchronize() # within-block
+KI.barrier() # within-block
 # Note: This fence is needed to ensure that the flag is not set before copying into v.
 # See https://doc.rust-lang.org/std/sync/atomic/fn.fence.html
 # for more details.
@@ -206,24 +208,26 @@ end
 if ithread == 0x0
 UnsafeAtomics.store!(pointer(flags, iblock + 0x1), convert(eltype(flags), ACC_FLAG_A), UnsafeAtomics.monotonic)
 end
+end
+nothing
 end


-@kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous_coupled_preblocks!(
-op, v, prefixes,
-)
+function _accumulate_previous_coupled_preblocks!(
+op, v, prefixes, ::Val{block_size}
+) where block_size
+@inbounds begin
 # No decoupled lookback
 len = length(v)
-block_size = @groupsize()[1]

 # NOTE: for many index calculations in this library, computation using zero-indexing leads to
 # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
 # indexing). Internal calculations will be done using zero indexing except when actually
 # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.

 # Group (block) and local (thread) indices
-iblock = @index(Group, Linear) - 0x1 + 0x1 # Skipping first block
-ithread = @index(Local, Linear) - 0x1
+iblock = KI.get_group_id().x - 0x1 + 0x1 # Skipping first block
+ithread = KI.get_local_id().x - 0x1
 block_offset = iblock * block_size * 0x2 # Processing two elements per thread

 # Each block looks back to find running prefix sum
@@ -250,6 +254,8 @@ end
 if block_offset + bi < len
 v[block_offset + bi + 0x1] = op(running_prefix, v[block_offset + bi + 0x1])
 end
+end
+nothing
 end


@@ -298,14 +304,10 @@ function accumulate_1d_gpu!(
 flags = temp_flags
 end

-kernel1! = _accumulate_block!(backend, block_size)
-kernel1!(op, v, init, neutral, inclusive, flags, prefixes,
-ndrange=num_blocks * block_size)
+KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks _accumulate_block!(op, v, init, neutral, inclusive, flags, prefixes, Val(block_size))

 if num_blocks > 1
-kernel2! = _accumulate_previous!(backend, block_size)
-kernel2!(op, v, flags, prefixes,
-ndrange=(num_blocks - 1) * block_size)
+KI.@kernel backend workgroupsize=block_size numworkgroups=(num_blocks-1) _accumulate_previous!(op, v, flags, prefixes, Val(block_size))
 end

 return v
@@ -349,22 +351,17 @@ function accumulate_1d_gpu!(
 prefixes = temp
 end

-kernel1! = _accumulate_block!(backend, block_size)
-kernel1!(op, v, init, neutral, inclusive, nothing, prefixes,
-ndrange=num_blocks * block_size)
+KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks _accumulate_block!(op, v, init, neutral, inclusive, nothing, prefixes, Val(block_size))

 if num_blocks > 1

 # Accumulate prefixes of all blocks; use neutral as init here to not reinclude init
 num_blocks_prefixes = (length(prefixes) + elems_per_block - 1) ÷ elems_per_block
-kernel1!(op, prefixes, neutral, neutral, true, nothing, nothing,
-ndrange=num_blocks_prefixes * block_size)
+KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks_prefixes _accumulate_block!(op, prefixes, neutral, neutral, true, nothing, nothing, Val(block_size))

 # Prefixes are pre-accumulated (completely accumulated if num_blocks_prefixes == 1, or
 # partially, which we will account for in the coupled lookback)
-kernel2! = _accumulate_previous_coupled_preblocks!(backend, block_size)
-kernel2!(op, v, prefixes,
-ndrange=(num_blocks - 1) * block_size)
+KI.@kernel backend workgroupsize=block_size numworkgroups=(num_blocks-1) _accumulate_previous_coupled_preblocks!(op, v, prefixes, Val(block_size))
 end

 return v
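
The same migration is applied to every kernel in this file: the `@kernel cpu=false inbounds=true unsafe_indices=true` macro definitions become plain functions taking the block size as a `Val`, group/thread indices come from `KI.get_group_id()` / `KI.get_local_id()` / `KI.get_num_groups()` instead of `@index` and `@ndrange`, shared memory comes from `KI.localmemory` instead of `@localmem`, `@synchronize()` becomes `KI.barrier()`, and launches go through `KI.@kernel backend workgroupsize=... numworkgroups=...` instead of instantiating the kernel with `(backend, block_size)` and passing `ndrange`. Below is a minimal before/after sketch of that pattern using a throwaway fill kernel; the old half follows the KernelAbstractions 0.9 API, while the new half simply mirrors the calls visible in this diff (how `KI` is bound, and the exact KA 0.10 launch semantics, are not shown in this excerpt, so treat that half as an assumption drawn from the commit itself):

    using KernelAbstractions

    # --- Old style (KernelAbstractions 0.9), as removed by this commit ---
    @kernel cpu=false inbounds=true unsafe_indices=true function _fill_old!(v, x)
        iblock = @index(Group, Linear) - 0x1
        ithread = @index(Local, Linear) - 0x1
        block_size = @groupsize()[1]
        i = ithread + iblock * block_size
        if i < length(v)
            v[i + 0x1] = x
        end
    end

    function fill_old!(v, x, backend; block_size=256)
        num_blocks = cld(length(v), block_size)
        kernel! = _fill_old!(backend, block_size)
        kernel!(v, x, ndrange=num_blocks * block_size)
        v
    end

    # --- New style, mirroring this commit; `KI` is assumed to be bound
    # elsewhere in the changeset (not shown in this excerpt) ---
    function _fill_new!(v, x, ::Val{block_size}) where block_size
        @inbounds begin
            iblock = KI.get_group_id().x - 0x1
            ithread = KI.get_local_id().x - 0x1
            i = ithread + iblock * block_size
            if i < length(v)
                v[i + 0x1] = x
            end
        end
        nothing
    end

    function fill_new!(v, x, backend; block_size=256)
        num_blocks = cld(length(v), block_size)
        KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks _fill_new!(v, x, Val(block_size))
        v
    end

The `_fill_old!`, `fill_old!`, `_fill_new!` and `fill_new!` names are hypothetical examples, not functions from the package.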

src/accumulate/accumulate_nd.jl
Lines changed: 29 additions & 29 deletions

@@ -51,18 +51,14 @@ function accumulate_nd!(
 if length_outer >= length_dims
 # One thread per outer dimension
 blocks = (length_outer + block_size - 1) ÷ block_size
-kernel1! = _accumulate_nd_by_thread!(backend, block_size)
-kernel1!(
-v, op, init, dims, inclusive,
-ndrange=(block_size * blocks,),
+KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_thread!(
+v, op, init, dims, inclusive, Val(block_size)
 )
 else
 # One block per outer dimension
 blocks = length_outer
-kernel2! = _accumulate_nd_by_block!(backend, block_size)
-kernel2!(
-v, op, init, neutral, dims, inclusive,
-ndrange=(block_size, blocks),
+KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_block!(
+v, op, init, neutral, dims, inclusive, Val(block_size)
 )
 end
 end
@@ -121,9 +117,11 @@ function _accumulate_nd_cpu_sections!(
 end


-@kernel inbounds=true cpu=false unsafe_indices=true function _accumulate_nd_by_thread!(
+function _accumulate_nd_by_thread!(
 v, op, init, dims, inclusive,
-)
+::Val{block_size}
+) where block_size
+@inbounds begin
 # One thread per outer dimension element, when there are more outer elements than in the
 # reduced dim e.g. accumulate(+, rand(3, 1000), dims=1) => only 3 elements in the accumulated
 # dim
@@ -135,16 +133,14 @@ end
 length_dims = vsizes[dims]
 length_outer = length(v) ÷ length_dims

-block_size = @groupsize()[1]
-
 # NOTE: for many index calculations in this library, computation using zero-indexing leads to
 # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
 # indexing). Internal calculations will be done using zero indexing except when actually
 # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.

 # Group (block) and local (thread) indices
-iblock = @index(Group, Linear) - 0x1
-ithread = @index(Local, Linear) - 0x1
+iblock = KI.get_group_id().x - 0x1
+ithread = KI.get_local_id().x - 0x1

 # Each thread handles one outer element
 tid = ithread + iblock * block_size
@@ -178,12 +174,16 @@ end
 end
 end
 end
+end
+nothing
 end


-@kernel inbounds=true cpu=false unsafe_indices=true function _accumulate_nd_by_block!(
+function _accumulate_nd_by_block!(
 v, op, init, neutral, dims, inclusive,
-)
+::Val{block_size}
+) where block_size
+@inbounds begin
 # NOTE: shmem_size MUST be greater than 2 * block_size
 # NOTE: block_size MUST be a power of 2

@@ -198,19 +198,17 @@ end
 length_dims = vsizes[dims]
 length_outer = length(v) ÷ length_dims

-@uniform block_size = @groupsize()[1]
-
-temp = @localmem eltype(v) (0x2 * block_size + conflict_free_offset(0x2 * block_size),)
-running_prefix = @localmem eltype(v) (1,)
+temp = KI.localmemory(eltype(v), 0x2 * block_size + conflict_free_offset(0x2 * block_size))
+running_prefix = KI.localmemory(eltype(v), 1)

 # NOTE: for many index calculations in this library, computation using zero-indexing leads to
 # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
 # indexing). Internal calculations will be done using zero indexing except when actually
 # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.

 # Group (block) and local (thread) indices
-iblock = @index(Group, Linear) - 0x1
-ithread = @index(Local, Linear) - 0x1
+iblock = KI.get_group_id().x - 0x1
+ithread = KI.get_local_id().x - 0x1

 # Each block handles one outer element; guaranteed to have exact number of blocks, so no need
 # for `if iblock < length_outer`
@@ -234,7 +232,7 @@ end
 if ithread == 0x0
 running_prefix[0x1] = neutral
 end
-@synchronize()
+KI.barrier()

 while ichunk < num_chunks
 block_offset = ichunk * block_size * 0x2 # Processing two elements per thread
@@ -271,7 +269,7 @@ end
 next_pow2 = block_size * 0x2
 d = next_pow2 >> 0x1
 while d > 0x0 # TODO: unroll this like in reduce.jl ?
-@synchronize()
+KI.barrier()

 if ithread < d
 _ai = offset * (0x2 * ithread + 0x1) - 0x1
@@ -296,7 +294,7 @@ end
 d = typeof(ithread)(1)
 while d < next_pow2
 offset = offset >> 0x1
-@synchronize()
+KI.barrier()

 if ithread < d
 _ai = offset * (0x2 * ithread + 0x1) - 0x1
@@ -315,10 +313,10 @@ end
 # Later blocks should always be inclusively-scanned
 if inclusive || ichunk != 0x0
 # To compute an inclusive scan, shift elements left...
-@synchronize()
+KI.barrier()
 t1 = temp[ai + bank_offset_a + 0x1]
 t2 = temp[bi + bank_offset_b + 0x1]
-@synchronize()
+KI.barrier()

 if ai > 0x0
 temp[ai - 0x1 + conflict_free_offset(ai - 0x1) + 0x1] = t1
@@ -344,7 +342,7 @@ end
 end

 _running_prefix = running_prefix[0x1]
-@synchronize()
+KI.barrier()

 if block_offset + ai < length_dims
 total = op(_running_prefix, temp[ai + bank_offset_a + 0x1])
@@ -367,8 +365,10 @@ end
 if bi == 0x2 * block_size - 0x1
 running_prefix[0x1] = total
 end
-@synchronize()
+KI.barrier()

 ichunk += 0x1
 end
+end
+nothing
 end
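
Both scan kernels here size their shared memory as `2 * block_size + conflict_free_offset(2 * block_size)` and pad every shared-memory index with `conflict_free_offset`, the standard bank-conflict-avoidance trick used in work-efficient (Blelloch-style) GPU scans. The helper's definition is not part of this diff; the sketch below only illustrates the padding scheme those kernels rely on, with a hypothetical name and constants that may differ from AcceleratedKernels' actual implementation:

    # Hypothetical padding helper: one extra shared-memory slot roughly every
    # 32 elements, so the strided accesses of the up-sweep / down-sweep phases
    # land in different banks.
    const LOG_NUM_BANKS = 5                # 32 banks on most GPUs (assumption)
    conflict_free_offset_sketch(n) = n >> LOG_NUM_BANKS

    # Buffer sizing mirroring the kernels above: 2 * block_size payload slots
    # plus padding, indexed as temp[ai + conflict_free_offset_sketch(ai) + 1].
    block_size = 256
    shmem_len = 2 * block_size + conflict_free_offset_sketch(2 * block_size)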
