diff --git a/examples/histogram.jl b/examples/histogram.jl
index 9edc0293..29d3bf57 100644
--- a/examples/histogram.jl
+++ b/examples/histogram.jl
@@ -12,16 +12,15 @@ function create_histogram(input)
     return histogram_output
 end
 
-# This a 1D histogram kernel where the histogramming happens on shmem
-@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input)
-    gid = @index(Group, Linear)
-    lid = @index(Local, Linear)
+# This is a 1D histogram kernel where the histogramming happens on static shmem
+function histogram_kernel!(histogram_output, input, ::Val{gs}) where {gs}
+    gid = KI.get_group_id().x
+    lid = KI.get_local_id().x
 
-    @uniform gs = prod(@groupsize())
     tid = (gid - 1) * gs + lid
-    @uniform N = length(histogram_output)
+    N = length(histogram_output)
 
-    shared_histogram = @localmem eltype(input) (gs)
+    shared_histogram = KI.localmemory(eltype(input), gs)
 
     # This will go through all input elements and assign them to a location in
     # shmem. Note that if there is not enough shem, we create different shmem
@@ -32,7 +31,7 @@ end
         # Setting shared_histogram to 0
         @inbounds shared_histogram[lid] = 0
-        @synchronize()
+        KI.barrier()
 
         max_element = min_element + gs
         if max_element > N
@@ -46,21 +45,20 @@ end
             @atomic shared_histogram[bin] += 1
         end
 
-        @synchronize()
+        KI.barrier()
 
         if ((lid + min_element - 1) <= N)
             @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
         end
 
     end
-
+    return
 end
 
 function histogram!(histogram_output, input, groupsize = 256)
     backend = get_backend(histogram_output)
     # Need static block size
-    kernel! = histogram_kernel!(backend, (groupsize,))
-    kernel!(histogram_output, input, ndrange = size(input))
+    KI.@kernel backend workgroupsize = groupsize numworkgroups = cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize))
    return
 end
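The host-side entry point behaves as before despite the new launch path; a minimal usage sketch, assuming an integer-binned input as elsewhere in this example file and a functional default backend (the driver code below is illustrative, not part of the diff):

```julia
# Bins are 1:128, so the histogram has 128 entries and their sum
# must equal the number of input elements.
input = rand(1:128, 10_000)
hist = create_histogram(input)
@assert sum(hist) == length(input)
```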
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index ac56edb5..905b3a05 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -9,70 +9,68 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl"))
 # # Metal sometimes supports fewer.
 const TILE_DIM = 16
 
-@kernel unsafe_indices = true function coalesced_matmul_kernel!(
-        output, @Const(input1), @Const(input2), N, R, M,
-        ::Val{BANK} = Val(1),
-    ) where {BANK}
-    gi, gj = @index(Group, NTuple)
-    i, j = @index(Local, NTuple)
-
-    TILE_DIM = @uniform @groupsize()[1]
+function coalesced_matmul_kernel!(
+        output, input1, input2, N, R, M,
+        ::Val{TDIM}, ::Val{BANK} = Val(1)
+    ) where {TDIM, BANK}
+    gi, gj, _ = KI.get_group_id()
+    i, j, _ = KI.get_local_id()
 
     # +1 to avoid bank conflicts on shared memory
-    tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
-    tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
+    tile1 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
+    tile2 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
 
-    # private variable for tile output
-    outval = @private eltype(output) 1
-    @inbounds outval[1] = -zero(eltype(output))
+    # variable for tile output
+    outval = -zero(eltype(output))
 
-    @uniform N = size(output, 1)
+    N = size(output, 1)
     # number of tiles depends on inner dimension
-    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
+    NUM_TILES = div(R + TDIM - 1, TDIM)
 
     # loop over all tiles needed for this calculation
    for t in 0:(NUM_TILES - 1)
         # Can't use @index(Global), because we use a smaller ndrange
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
         # load inputs into tiles, with bounds checking for non-square matrices
-        if I <= N && t * TILE_DIM + j <= R
-            @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j]
+        if I <= N && t * TDIM + j <= R
+            @inbounds tile1[i, j] = input1[I, t * TDIM + j]
         else
             @inbounds tile1[i, j] = 0.0
         end
-        if t * TILE_DIM + i <= R && J <= M
-            @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J]
+        if t * TDIM + i <= R && J <= M
+            @inbounds tile2[i, j] = input2[t * TDIM + i, J]
         else
             @inbounds tile2[i, j] = 0.0
         end
 
         # wait for all tiles to be loaded
-        @synchronize
+        KI.barrier()
 
         # get global values again
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
         # calculate value of spot in output, use temporary value to allow for vectorization
         out = zero(eltype(output))
-        @simd for k in 1:TILE_DIM
+        @simd for k in 1:TDIM
             @inbounds out += tile1[i, k] * tile2[k, j]
         end
-        outval[1] += out
+        outval += out
 
-        @synchronize
+        KI.barrier()
     end
 
     # get global indices again
-    I = (gi - 1) * TILE_DIM + i
-    J = (gj - 1) * TILE_DIM + j
+    I = (gi - 1) * TDIM + i
+    J = (gj - 1) * TDIM + j
 
     # save if inbounds
     if I <= N && J <= M
-        @inbounds output[I, J] = outval[1]
+        @inbounds output[I, J] = outval
     end
+    return
 end
 
 N = 1024
@@ -82,9 +80,10 @@ A = rand!(allocate(backend, Float32, N, R))
 B = rand!(allocate(backend, Float32, R, M))
 C = KernelAbstractions.zeros(backend, Float32, N, M)
 
-kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
+workgroupsize = (TILE_DIM, TILE_DIM)
+numworkgroups = (cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM))
 
-kern(C, A, B, N, R, M, ndrange = size(C))
+KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM))
 KernelAbstractions.synchronize(backend)
 
 @test isapprox(A * B, C)
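With `ndrange` gone, the caller sizes the grid explicitly; `cld` rounds up so edge tiles that are only partially filled are still launched, and the in-kernel bounds checks mask the overhang. A small sketch of that covering arithmetic, using illustrative sizes:

```julia
TILE = 16
for n in (1024, 1000)                 # exact multiple vs. ragged edge
    groups = cld(n, TILE)             # 64 and 63 groups respectively
    @assert groups * TILE >= n        # every output element is covered
    @assert (groups - 1) * TILE < n   # and no whole group is wasted
end
```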
+""" +abstract type Backend end + +include("intrinsics.jl") +import .KernelIntrinsics: KernelIntrinsics, KI +export KernelIntrinsics, KI + ### # Kernel language # - @localmem @@ -360,6 +369,25 @@ macro context() return esc(:(__ctx__)) end +# Defined to keep cpu support for `__print` +@generated function KI._print(items...) + str = "" + args = [] + + for i in 1:length(items) + item = :(items[$i]) + T = items[i] + if T <: Val + item = QuoteNode(T.parameters[1]) + end + push!(args, item) + end + + return quote + print($(args...)) + end +end + """ @print(items...) @@ -460,13 +488,27 @@ end # Internal kernel functions ### -function __index_Local_Linear end -function __index_Group_Linear end -function __index_Global_Linear end +@inline function __index_Local_Linear(ctx) + return KI.get_local_id().x +end + +@inline function __index_Group_Linear(ctx) + return KI.get_group_id().x +end -function __index_Local_Cartesian end -function __index_Group_Cartesian end -function __index_Global_Cartesian end +@inline function __index_Global_Linear(ctx) + return KI.get_global_id().x +end + +@inline function __index_Local_Cartesian(ctx) + return @inbounds workitems(__iterspace(ctx))[KI.get_local_id().x] +end +@inline function __index_Group_Cartesian(ctx) + return @inbounds blocks(__iterspace(ctx))[KI.get_group_id().x] +end +@inline function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), KI.get_group_id().x, KI.get_local_id().x) +end @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) @inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...)) @@ -482,11 +524,6 @@ constify(arg) = adapt(ConstAdaptor(), arg) # Backend hierarchy ### -""" - -Abstract type for all KernelAbstractions backends. -""" -abstract type Backend end """ Abstract type for all GPU based KernelAbstractions backends. @@ -796,29 +833,11 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory end - -function __synchronize() - error("@synchronize used outside kernel or not captured") -end - -@generated function __print(items...) - str = "" - args = [] +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims) - for i in 1:length(items) - item = :(items[$i]) - T = items[i] - if T <: Val - item = QuoteNode(T.parameters[1]) - end - push!(args, item) - end +__synchronize() = KI.barrier() - return quote - print($(args...)) - end -end +__print(args...) = KI._print(args...) # Utils __size(args::Tuple) = Tuple{args...} diff --git a/src/intrinsics.jl b/src/intrinsics.jl new file mode 100644 index 00000000..2025bf5e --- /dev/null +++ b/src/intrinsics.jl @@ -0,0 +1,365 @@ +""" +# `KernelIntrinics`/`KI` + +The `KernelIntrinics` (or `KI`) module defines the API interface for backends to define various lower-level device and +host-side functionality. The `KI` intrinsics are used to define the higher-level device-side +intrinsics functionality in `KernelAbstractions`. + +Both provide APIs for host and device-side functionality, but `KI` focuses on on lower-level +functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality +such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions +like allocating arrays on a backend. +""" +module KernelIntrinsics + +const KI = KernelIntrinsics +export KI + +import ..KernelAbstractions: Backend +import GPUCompiler: split_kwargs, assign_args! 
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
new file mode 100644
index 00000000..2025bf5e
--- /dev/null
+++ b/src/intrinsics.jl
@@ -0,0 +1,365 @@
+"""
+# `KernelIntrinsics`/`KI`
+
+The `KernelIntrinsics` (or `KI`) module defines the API that backends implement to provide various
+lower-level device- and host-side functionality. The `KI` intrinsics are used to implement the
+higher-level device-side functionality in `KernelAbstractions`.
+
+Both provide APIs for host and device-side functionality, but `KI` focuses on lower-level
+functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality
+such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions
+like allocating arrays on a backend.
+"""
+module KernelIntrinsics
+
+const KI = KernelIntrinsics
+export KI
+
+import ..KernelAbstractions: Backend
+import GPUCompiler: split_kwargs, assign_args!
+
+"""
+    get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the number of global work-items in each dimension.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_global_size end
+
+"""
+    get_global_id()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the unique global work-item ID.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_global_id()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_global_id end
+
+"""
+    get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the number of local work-items in each dimension.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_local_size end
+
+"""
+    get_local_id()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the unique local work-item ID.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_local_id()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_local_id end
+
+"""
+    get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the number of work-groups in each dimension.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_num_groups end
+
+"""
+    get_group_id()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the unique group ID.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_group_id()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_group_id end
+
+"""
+    localmemory(T, dims)
+
+Declare memory that is local to a workgroup.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
+    ```
+    As well as the on-device functionality.
+"""
+localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))
+
+"""
+    barrier()
+
+After a `barrier()` call, all reads and writes to global and local memory
+from each thread in the workgroup are visible to all other threads in the
+workgroup.
+
+This does **not** guarantee that a write from a thread in a certain workgroup will
+be visible to a thread in a different workgroup.
+
+!!! note
+    `barrier()` must be encountered by all workitems of a work-group executing the kernel or by none at all.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override barrier()
+    ```
+"""
+function barrier()
+    error("Group barrier used outside kernel or not captured")
+end
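`localmemory` plus `barrier` is enough for the classic staged-workgroup patterns. A minimal tree-reduction sketch, assuming a power-of-two group size passed as `Val` (the kernel is illustrative, not part of this PR):

```julia
function group_sum_kernel!(partial, x, ::Val{GS}) where {GS}
    lid = KI.get_local_id().x
    gid = KI.get_group_id().x
    scratch = KI.localmemory(Float32, GS)

    i = (gid - 1) * GS + lid
    @inbounds scratch[lid] = i <= length(x) ? x[i] : 0.0f0
    KI.barrier()                    # stage 1: all loads visible group-wide

    s = GS ÷ 2
    while s >= 1                    # every work-item runs the same trip count,
        if lid <= s                 # so the barrier below is group-uniform
            @inbounds scratch[lid] += scratch[lid + s]
        end
        KI.barrier()
        s ÷= 2
    end

    if lid == 1
        @inbounds partial[gid] = scratch[1]
    end
    return
end
```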
+""" +struct Kernel{B, Kern} + backend::B + kern::Kern +end + +""" + kernel_max_work_group_size(kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function should always be used to determine the workgroup size before +launching a kernel. + +!!! note + Backend implementations **must** implement: + ``` + kernel_max_work_group_size(kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int + ``` + As well as the on-device functionality. +""" +function kernel_max_work_group_size end + +""" + max_work_group_size(backend, kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function represents a theoretical maximum; `kernel_max_work_group_size` +should be used before launching a kernel as some backends may error if +kernel launch with too big a workgroup is attempted. + +!!! note + Backend implementations **must** implement: + ``` + max_work_group_size(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +function max_work_group_size end + +""" + multiprocessor_count(backend::NewBackend)::Int + +The multiprocessor count for the current device used by `backend`. +Used for certain algorithm optimizations. + +!!! note + Backend implementations **may** implement: + ``` + multiprocessor_count(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +multiprocessor_count(_) = 0 + +""" + argconvert(::NewBackend, arg) + +This function is called for every argument to be passed to a kernel, +converting them to their device side representation. + +!!! note + Backend implementations **must** implement: + ``` + argconvert(::NewBackend, arg) + ``` +""" +function argconvert end + +""" + KI.kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + +Low-level interface to compile a function invocation for the currently-active GPU, returning +a callable kernel object. For a higher-level interface, use [`KI.@kernel`](@ref). + +Currently, `kernel_function` only supports the `name` keyword argument as it is the only one +by all backends. + +Keyword arguments: +- `name`: override the name that the kernel will have in the generated code + +!!! note + Backend implementations **must** implement: + ``` + kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + ``` +""" +function kernel_function end + +const MACRO_KWARGS = [:launch] +const COMPILER_KWARGS = [:name] +const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] + +""" + KI.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) + +High-level interface for executing code on a GPU. + +The `KI.@kernel` macro should prefix a call, with `func` a callable function or object that +should return nothing. It will be compiled to a function native to the specified `backend` +upon first use, and to a certain extent arguments will be converted and managed automatically +using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is +called and launched according to the specified `backend`. + +There are a few keyword arguments that influence the behavior of `KI.@kernel`: + +- `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned + kernel object should be launched by calling it and passing arguments again. +- `name`: the name of the kernel in the generated code. Defaults to an automatically- + generated name. + +!!! 
+"""
+    max_work_group_size(backend)::Int
+
+The maximum workgroup size limit supported by `backend`.
+This function represents a theoretical maximum; `kernel_max_work_group_size`
+should be used before launching a kernel, as some backends may error if a
+launch with too large a workgroup is attempted.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    max_work_group_size(backend::NewBackend)::Int
+    ```
+"""
+function max_work_group_size end
+
+"""
+    multiprocessor_count(backend)::Int
+
+The multiprocessor count for the current device used by `backend`.
+Used for certain algorithm optimizations.
+
+!!! note
+    Backend implementations **may** implement:
+    ```
+    multiprocessor_count(backend::NewBackend)::Int
+    ```
+"""
+multiprocessor_count(_) = 0
+
+"""
+    argconvert(::NewBackend, arg)
+
+This function is called for every argument to be passed to a kernel,
+converting it to its device-side representation.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    argconvert(::NewBackend, arg)
+    ```
+"""
+function argconvert end
+
+"""
+    KI.kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+
+Low-level interface to compile a function invocation for the currently-active device, returning
+a callable kernel object. For a higher-level interface, use [`KI.@kernel`](@ref).
+
+Currently, `kernel_function` only supports the `name` keyword argument, as it is the only one
+supported by all backends.
+
+Keyword arguments:
+- `name`: override the name that the kernel will have in the generated code
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    ```
+"""
+function kernel_function end
+
+const MACRO_KWARGS = [:launch]
+const COMPILER_KWARGS = [:name]
+const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize]
+
+"""
+    KI.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...)
+
+High-level interface for executing code on a GPU.
+
+The `KI.@kernel` macro should prefix a call, with `func` a callable function or object that
+should return nothing. It will be compiled to a function native to the specified `backend`
+upon first use, and to a certain extent arguments will be converted and managed automatically
+using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is
+called and launched according to the specified `backend`.
+
+There are a few keyword arguments that influence the behavior of `KI.@kernel`:
+
+- `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned
+  kernel object should be launched by calling it and passing arguments again.
+- `name`: the name of the kernel in the generated code. Defaults to an automatically-
+  generated name.
+
+!!! note
+    `KI.@kernel` differs from the `KernelAbstractions` macro of the same name: this macro acts as
+    a wrapper around backend kernel compilation/launching (such as `@cuda`, `@metal`, etc.) and is
+    used when calling a function to be run on a specific backend, while `KernelAbstractions.@kernel`
+    is used to define kernels for the original higher-level `KernelAbstractions` API.
+"""
+macro kernel(backend, ex...)
+    call = ex[end]
+    kwargs = map(ex[1:(end - 1)]) do kwarg
+        if kwarg isa Symbol
+            :($kwarg = $kwarg)
+        elseif Meta.isexpr(kwarg, :(=))
+            kwarg
+        else
+            throw(ArgumentError("Invalid keyword argument '$kwarg'"))
+        end
+    end
+
+    # destructure the kernel call
+    Meta.isexpr(call, :call) || throw(ArgumentError("final argument to KI.@kernel should be a function call"))
+    f = call.args[1]
+    args = call.args[2:end]
+
+    code = quote end
+    vars, var_exprs = assign_args!(code, args)
+
+    # group keyword arguments
+    macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
+        split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS)
+    if !isempty(other_kwargs)
+        key, val = first(other_kwargs).args
+        throw(ArgumentError("Unsupported keyword argument '$key'"))
+    end
+
+    # handle keyword arguments that influence the macro's behavior
+    launch = true
+    for kwarg in macro_kwargs
+        key, val = kwarg.args
+        if key === :launch
+            isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to KI.@kernel should be a Bool"))
+            launch = val::Bool
+        else
+            throw(ArgumentError("Unsupported keyword argument '$key'"))
+        end
+    end
+    if !launch && !isempty(call_kwargs)
+        error("KI.@kernel with launch=false does not support launch-time keyword arguments; use them when calling the kernel")
+    end
+
+    # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5)
+    #        we esc() the whole thing now, necessitating gensyms...
+    @gensym f_var kernel_f kernel_args kernel_tt kernel
+
+    # convert the arguments, call the compiler and launch the kernel
+    # while keeping the original arguments alive
+    push!(
+        code.args,
+        quote
+            $f_var = $f
+            GC.@preserve $(vars...) $f_var begin
+                $kernel_f = $argconvert($backend, $f_var)
+                $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),))
+                $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...}
+                $kernel = $kernel_function($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...))
+                if $launch
+                    $kernel($(var_exprs...); $(call_kwargs...))
+                end
+                $kernel
+            end
+        end
+    )
+
+    return esc(
+        quote
+            let
+                $code
+            end
+        end
+    )
+end
+
+end
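Stripping the hygiene and `launch=false` handling, the macro body above amounts to the following functional model, which may help when reading the expansion (the wrapper name is hypothetical; the real macro also emits `GC.@preserve` and forwards compiler kwargs):

```julia
function ki_kernel_model(backend, f, args...; numworkgroups = nothing, workgroupsize = nothing)
    kernel_f = KI.argconvert(backend, f)
    kernel_args = map(x -> KI.argconvert(backend, x), args)
    kernel_tt = Tuple{map(Core.Typeof, kernel_args)...}
    kernel = KI.kernel_function(backend, kernel_f, kernel_tt)
    return kernel(args...; numworkgroups, workgroupsize)
end
```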
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 87fccdb9..14fe6ae8 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -2,9 +2,10 @@ module POCLKernels
 
 using ..POCL
 using ..POCL: @device_override, cl, method_table
-using ..POCL: device
+using ..POCL: device, clconvert, clfunction
 
 import KernelAbstractions as KA
+import KernelAbstractions.KernelIntrinsics as KI
 
 import StaticArrays
 
@@ -56,7 +57,7 @@ KA.functional(::POCLBackend) = true
 KA.pagelock!(::POCLBackend, x) = nothing
 
 KA.get_backend(::Array) = POCLBackend()
-KA.synchronize(::POCLBackend) = nothing
+KA.synchronize(::POCLBackend) = cl.finish(cl.queue())
 KA.supports_float64(::POCLBackend) = true
 KA.supports_unified(::POCLBackend) = true
 
@@ -138,31 +139,66 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize
     return nothing
 end
 
+KI.argconvert(::POCLBackend, arg) = clconvert(arg)
+
+function KI.kernel_function(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
+    kern = clfunction(f, tt; name, kwargs...)
+    return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
+end
+
+function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
+    local_size = StaticArrays.MVector{3}((1, 1, 1))
+    if !isnothing(workgroupsize)
+        for (i, val) in enumerate(workgroupsize)
+            local_size[i] = val
+        end
+    end
+
+    global_size = StaticArrays.MVector{3}((1, 1, 1))
+    if !isnothing(numworkgroups)
+        for (i, val) in enumerate(numworkgroups)
+            global_size[i] = val * local_size[i]
+        end
+    end
+
+    return obj.kern(args...; local_size, global_size)
+end
+
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
+    wginfo = cl.work_group_info(kernel.kern.fun, device())
+    return Int(min(wginfo.size, max_work_items))
+end
+function KI.max_work_group_size(::POCLBackend)::Int
+    return Int(device().max_work_group_size)
+end
+function KI.multiprocessor_count(::POCLBackend)::Int
+    return Int(device().max_compute_units)
+end
 
 ## Indexing Functions
 
-@device_override @inline function KA.__index_Local_Linear(ctx)
-    return get_local_id(1)
+@device_override @inline function KI.get_local_id()
+    return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))
 end
 
-@device_override @inline function KA.__index_Group_Linear(ctx)
-    return get_group_id(1)
+@device_override @inline function KI.get_group_id()
+    return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3)))
 end
 
-@device_override @inline function KA.__index_Global_Linear(ctx)
-    return get_global_id(1)
+@device_override @inline function KI.get_global_id()
+    return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3)))
 end
 
-@device_override @inline function KA.__index_Local_Cartesian(ctx)
-    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
+@device_override @inline function KI.get_local_size()
    return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3)))
 end
 
-@device_override @inline function KA.__index_Group_Cartesian(ctx)
-    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
+@device_override @inline function KI.get_num_groups()
+    return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3)))
 end
 
-@device_override @inline function KA.__index_Global_Cartesian(ctx)
-    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
+@device_override @inline function KI.get_global_size()
+    return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
 end
 
 @device_override @inline function KA.__validindex(ctx)
@@ -177,7 +213,7 @@ end
 
 ## Shared and Scratch Memory
 
-@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     ptr = POCL.emit_localmemory(T, Val(prod(Dims)))
     CLDeviceArray(Dims, ptr)
 end
@@ -189,11 +225,11 @@ end
 
 ## Synchronization and Printing
 
-@device_override @inline function KA.__synchronize()
+@device_override @inline function KI.barrier()
     work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
 end
 
-@device_override @inline function KA.__print(args...)
+@device_override @inline function KI._print(args...)
     POCL._print(args...)
 end
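The POCL wiring above doubles as a template for other backends. A skeletal, host-side-only sketch for a hypothetical `NewBackend` (every `*_newbackend` name is a placeholder for that backend's native entry points, not real API):

```julia
import KernelAbstractions as KA
import KernelAbstractions.KernelIntrinsics as KI

struct NewBackend <: KA.Backend end

# Convert each argument to its device-side representation.
KI.argconvert(::NewBackend, arg) = convert_for_newbackend(arg)

function KI.kernel_function(::NewBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
    kern = compile_newbackend(f, tt; name)    # native compiler entry point
    return KI.Kernel{NewBackend, typeof(kern)}(NewBackend(), kern)
end

function (obj::KI.Kernel{NewBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
    # Translate KI's group-based launch vocabulary into the backend's native call.
    return launch_newbackend(obj.kern, args...; groups = numworkgroups, threads = workgroupsize)
end
```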
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
new file mode 100644
index 00000000..7a7ec1e0
--- /dev/null
+++ b/test/intrinsics.jl
@@ -0,0 +1,118 @@
+function test_intrinsics_kernel(results)
+    # Test all intrinsics return NamedTuples with x, y, z fields
+    global_size = KI.get_global_size()
+    global_id = KI.get_global_id()
+    local_size = KI.get_local_size()
+    local_id = KI.get_local_id()
+    num_groups = KI.get_num_groups()
+    group_id = KI.get_group_id()
+
+    if UInt32(global_id.x) <= UInt32(global_size.x)
+        results[1, global_id.x] = global_id.x
+        results[2, global_id.x] = local_id.x
+        results[3, global_id.x] = group_id.x
+        results[4, global_id.x] = global_size.x
+        results[5, global_id.x] = local_size.x
+        results[6, global_id.x] = num_groups.x
+    end
+    return
+end
+
+function intrinsics_testsuite(backend, AT)
+    @testset "KernelIntrinsics Tests" begin
+        @testset "Launch parameters" begin
+            # 1d
+            function launch_kernel1d(arr)
+                i, _, _ = KI.get_local_id()
+                gi, _, _ = KI.get_group_id()
+                ni, _, _ = KI.get_local_size()
+
+                arr[(gi - 1) * ni + i] = 1.0f0
+                return
+            end
+            arr1d = AT(zeros(Float32, 4))
+            KI.@kernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr1d) .== 1)
+
+            # 1d tuple
+            arr1dt = AT(zeros(Float32, 4))
+            KI.@kernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr1dt) .== 1)
+
+            # 2d
+            function launch_kernel2d(arr)
+                i, j, _ = KI.get_local_id()
+                gi, gj, _ = KI.get_group_id()
+                ni, nj, _ = KI.get_local_size()
+
+                arr[(gi - 1) * ni + i, (gj - 1) * nj + j] = 1.0f0
+                return
+            end
+            arr2d = AT(zeros(Float32, 4, 4))
+            KI.@kernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr2d) .== 1)
+
+            # 3d
+            function launch_kernel3d(arr)
+                i, j, k = KI.get_local_id()
+                gi, gj, gk = KI.get_group_id()
+                ni, nj, nk = KI.get_local_size()
+
+                arr[(gi - 1) * ni + i, (gj - 1) * nj + j, (gk - 1) * nk + k] = 1.0f0
+                return
+            end
+            arr3d = AT(zeros(Float32, 4, 4, 4))
+            KI.@kernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr3d) .== 1)
+        end
+
+        @testset "Basic intrinsics functionality" begin
+
+            @test KI.max_work_group_size(backend()) isa Int
+            @test KI.multiprocessor_count(backend()) isa Int
+
+            # Test with small kernel
+            N = 16
+            results = AT(zeros(Int, 6, N))
+            kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results)
+
+            @test KI.kernel_max_work_group_size(kernel) isa Int
+            @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1
+
+            kernel(results, workgroupsize = 4, numworkgroups = 4)
+            KernelAbstractions.synchronize(backend())
+
+            host_results = Array(results)
+
+            # Verify results make sense
+            for i in 1:N
+                global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i]
+
+                # Global IDs should be 1-based and sequential
+                @test global_id_x == i
+
+                # Global size should match the total number of work-items launched
+                @test global_size_x == N
+
+                # Local size should be 4 (our workgroupsize)
+                @test local_size_x == 4
+
+                # Number of groups should be ceil(N/4) = 4
+                @test num_groups_x == 4
+
+                # Group ID should be 1-based
+                expected_group = div(i - 1, 4) + 1
+                @test group_id_x == expected_group
+
+                # Local ID should be 1-based within group
+                expected_local = ((i - 1) % 4) + 1
+                @test local_id_x == expected_local
+            end
+        end
+    end
+    return nothing
+end
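Backends that consume the shared testsuite pick these tests up through `testsuite(...)` automatically; to run just this file against a backend, something like the following should work (assuming the POCL-backed CPU path of this PR, with `CPU` and `Array` standing in for a backend constructor and its array type):

```julia
using KernelAbstractions, Test
include("testsuite.jl")

Testsuite.intrinsics_testsuite(CPU, Array)
```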
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 2418db99..31b801b3 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -1,6 +1,7 @@
 module Testsuite
 
 using ..KernelAbstractions
+import ..KernelAbstractions.KernelIntrinsics as KI
 using ..Test
 
 # We can't add test-dependencies withouth breaking backend packages
@@ -26,6 +27,7 @@ end
 
 include("test.jl")
+include("intrinsics.jl")
 include("localmem.jl")
 include("private.jl")
 include("unroll.jl")
@@ -47,6 +49,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{
         specialfunctions_testsuite(backend)
     end
 
+    @conditional_testset "Intrinsics" skip_tests begin
+        intrinsics_testsuite(backend, AT)
+    end
+
     @conditional_testset "Localmem" skip_tests begin
         localmem_testsuite(backend, AT)
     end