From 70ef83a06e1c267dcc063d3c68454367efaadde6 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 4 Feb 2025 15:50:59 +0100 Subject: [PATCH 01/44] define basic intrinsics --- src/KernelAbstractions.jl | 30 +++++++++++++++++----- src/intrinsics.jl | 52 +++++++++++++++++++++++++++++++++++++++ src/pocl/backend.jl | 25 ++++++------------- 3 files changed, 83 insertions(+), 24 deletions(-) create mode 100644 src/intrinsics.jl diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 9af2d9d4..8c2844fd 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -194,6 +194,10 @@ function unsafe_free! end unsafe_free!(::AbstractArray) = return +include("intrinsics.jl") +import .KernelIntrinsics +export KernelIntrinsics + ### # Kernel language # - @localmem @@ -460,13 +464,27 @@ end # Internal kernel functions ### -function __index_Local_Linear end -function __index_Group_Linear end -function __index_Global_Linear end +function __index_Local_Linear(ctx) + return KernelIntrinsics.get_local_id().x +end -function __index_Local_Cartesian end -function __index_Group_Cartesian end -function __index_Global_Cartesian end +function __index_Group_Linear(ctx) + return KernelIntrinsics.get_group_id().x +end + +function __index_Global_Linear(ctx) + return KernelIntrinsics.get_global_id().x +end + +function __index_Local_Cartesian(ctx) + return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x] +end +function __index_Group_Cartesian(ctx) + return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x] +end +function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x) +end @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) @inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...)) diff --git a/src/intrinsics.jl b/src/intrinsics.jl new file mode 100644 index 00000000..33ed56fe --- /dev/null +++ b/src/intrinsics.jl @@ -0,0 +1,52 @@ +module KernelIntrinsics + +""" + get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Return the number of global work-items specified. + +!!! note + 1-based. +""" +function get_global_size end + +""" + get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique global work-item ID. +""" +function get_global_id end + +""" + get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Return the number of local work-items specified. +""" +function get_local_size end + +""" + get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique local work-item ID. +""" +function get_local_id end + +""" + get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the number of groups. +""" +function get_num_groups end + +""" + get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + +Returns the unique group ID. +""" +function get_group_id end + +function localmemory end +function barrier end +function print end + +end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 87fccdb9..32c2951a 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -140,29 +140,18 @@ end ## Indexing Functions +const KI = KA.KernelIntrinsics -@device_override @inline function KA.__index_Local_Linear(ctx) - return get_local_id(1) +@device_override @inline function KI.get_local_id() + return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3)) end -@device_override @inline function KA.__index_Group_Linear(ctx) - return get_group_id(1) +@device_override @inline function KI.get_group_id() + return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3)) end -@device_override @inline function KA.__index_Global_Linear(ctx) - return get_global_id(1) -end - -@device_override @inline function KA.__index_Local_Cartesian(ctx) - @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)] -end - -@device_override @inline function KA.__index_Group_Cartesian(ctx) - @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)] -end - -@device_override @inline function KA.__index_Global_Cartesian(ctx) - return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) +@device_override @inline function KI.get_global_id() + return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) end @device_override @inline function KA.__validindex(ctx) From 02f3124edc32627324c359779e05dad94a5e261d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Sep 2025 12:37:32 -0300 Subject: [PATCH 02/44] Fix docstrings --- src/intrinsics.jl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 33ed56fe..15202fab 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -4,9 +4,6 @@ module KernelIntrinsics get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} Return the number of global work-items specified. - -!!! note - 1-based. """ function get_global_size end @@ -14,6 +11,9 @@ function get_global_size end get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique global work-item ID. + +!!! note + 1-based. """ function get_global_id end @@ -28,6 +28,9 @@ function get_local_size end get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique local work-item ID. + +!!! note + 1-based. """ function get_local_id end @@ -42,6 +45,9 @@ function get_num_groups end get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} Returns the unique group ID. + +!!! note + 1-based. """ function get_group_id end From 1103574e4c44eddd2beab10ed8a15ee7ac8fdec6 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 16 Sep 2025 13:51:00 -0300 Subject: [PATCH 03/44] localmemory --- src/KernelAbstractions.jl | 4 +++- src/pocl/backend.jl | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 8c2844fd..f5f76306 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -814,7 +814,9 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory end +function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} + KernelIntrinsics.localmemory(t, dims, id) +end function __synchronize() error("@synchronize used outside kernel or not captured") diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 32c2951a..c06d014c 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -166,7 +166,7 @@ end ## Shared and Scratch Memory -@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} ptr = POCL.emit_localmemory(T, Val(prod(Dims))) CLDeviceArray(Dims, ptr) end From 8f404522b3809917498784ba56b8980d39423f26 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:30:33 -0300 Subject: [PATCH 04/44] Move default barrier implementation to KernelIntrinsics --- src/KernelAbstractions.jl | 2 +- src/intrinsics.jl | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index f5f76306..cd3d1fc9 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -819,7 +819,7 @@ function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, end function __synchronize() - error("@synchronize used outside kernel or not captured") + KernelIntrinsics.barrier() end @generated function __print(items...) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 15202fab..32cc125d 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -52,7 +52,9 @@ Returns the unique group ID. function get_group_id end function localmemory end -function barrier end +function barrier() + error("Group barrier used outside kernel or not captured") +end function print end end From 471dc5857a1d867e42be79e719f5fb4d02d07117 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 15:36:22 -0300 Subject: [PATCH 05/44] Implement remaining intrinsics for POCL --- src/pocl/backend.jl | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index c06d014c..87345897 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -154,6 +154,18 @@ end return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) end +@device_override @inline function KI.get_local_size() + return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3)) +end + +@device_override @inline function KI.get_num_groups() + return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3)) +end + +@device_override @inline function KI.get_global_size() + return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3)) +end + @device_override @inline function KA.__validindex(ctx) if KA.__dynamic_checkbounds(ctx) I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1)) @@ -178,7 +190,7 @@ end ## Synchronization and Printing -@device_override @inline function KA.__synchronize() +@device_override @inline function KI.barrier() work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) end From a8a14a1595b4501e3296b58e9ade7d05d615499b Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:23:11 -0300 Subject: [PATCH 06/44] Add basic tests Modified from initial Claude code --- test/intrinsics.jl | 62 ++++++++++++++++++++++++++++++++++++++++++++++ test/testsuite.jl | 5 ++++ 2 files changed, 67 insertions(+) create mode 100644 test/intrinsics.jl diff --git a/test/intrinsics.jl b/test/intrinsics.jl new file mode 100644 index 00000000..693b2bd9 --- /dev/null +++ b/test/intrinsics.jl @@ -0,0 +1,62 @@ + +@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results) + # Test all intrinsics return NamedTuples with x, y, z fields + global_size = KernelIntrinsics.get_global_size() + global_id = KernelIntrinsics.get_global_id() + local_size = KernelIntrinsics.get_local_size() + local_id = KernelIntrinsics.get_local_id() + num_groups = KernelIntrinsics.get_num_groups() + group_id = KernelIntrinsics.get_group_id() + + if UInt32(global_id.x) <= UInt32(global_size.x) + results[1, global_id.x] = global_id.x + results[2, global_id.x] = local_id.x + results[3, global_id.x] = group_id.x + results[4, global_id.x] = global_size.x + results[5, global_id.x] = local_size.x + results[6, global_id.x] = num_groups.x + end +end + + +function intrinsics_testsuite(backend, AT) + @testset "KernelIntrinsics Tests" begin + @testset "Basic intrinsics functionality" begin + + # Test with small kernel + N = 16 + results = AT(zeros(UInt32, 6, N)) + + kernel = test_intrinsics_kernel(backend(), 4, (N,)) + kernel(results, ndrange = N) + KernelAbstractions.synchronize(backend()) + + host_results = Array(results) + + # Verify results make sense + for i in 1:N + global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i] + + # Global IDs should be 1-based and sequential + @test global_id_x == i + + # Global size should match our ndrange + @test global_size_x == N + + # Local size should be 4 (our workgroupsize) + @test local_size_x == 4 + + # Number of groups should be ceil(N/4) = 4 + @test num_groups_x == 4 + + # Group ID should be 1-based + expected_group = div(i - 1, 4) + 1 + @test group_id_x == expected_group + + # Local ID should be 1-based within group + expected_local = ((i - 1) % 4) + 1 + @test local_id_x == expected_local + end + end + end +end diff --git a/test/testsuite.jl b/test/testsuite.jl index 2418db99..3426e542 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -26,6 +26,7 @@ end include("test.jl") +include("intrinsics.jl") include("localmem.jl") include("private.jl") include("unroll.jl") @@ -47,6 +48,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ specialfunctions_testsuite(backend) end + @conditional_testset "Intrinsics" skip_tests begin + intrinsics_testsuite(backend, AT) + end + @conditional_testset "Localmem" skip_tests begin localmem_testsuite(backend, AT) end From 96d9ffcd8b51278936485e25cb9d3151e0cd58a4 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 1 Oct 2025 13:24:43 -0300 Subject: [PATCH 07/44] Int32 -> Int https://github.com/JuliaGPU/KernelAbstractions.jl/pull/562/files#r1958255435 --- src/intrinsics.jl | 12 ++++++------ src/pocl/backend.jl | 12 ++++++------ test/intrinsics.jl | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 32cc125d..06011620 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,14 +1,14 @@ module KernelIntrinsics """ - get_global_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of global work-items specified. """ function get_global_size end """ - get_global_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_global_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique global work-item ID. @@ -18,14 +18,14 @@ Returns the unique global work-item ID. function get_global_id end """ - get_local_size()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_local_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of local work-items specified. """ function get_local_size end """ - get_local_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_local_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique local work-item ID. @@ -35,14 +35,14 @@ Returns the unique local work-item ID. function get_local_id end """ - get_num_groups()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int} Returns the number of groups. """ function get_num_groups end """ - get_group_id()::@NamedTuple{x::Int32, y::Int32, z::Int32} + get_group_id()::@NamedTuple{x::Int, y::Int, z::Int} Returns the unique group ID. diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 87345897..ef0aba6b 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -143,27 +143,27 @@ end const KI = KA.KernelIntrinsics @device_override @inline function KI.get_local_id() - return (; x = get_local_id(1), y = get_local_id(2), z = get_local_id(3)) + return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3))) end @device_override @inline function KI.get_group_id() - return (; x = get_group_id(1), y = get_group_id(2), z = get_group_id(3)) + return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3))) end @device_override @inline function KI.get_global_id() - return (; x = get_global_id(1), y = get_global_id(2), z = get_global_id(3)) + return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3))) end @device_override @inline function KI.get_local_size() - return (; x = get_local_size(1), y = get_local_size(2), z = get_local_size(3)) + return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3))) end @device_override @inline function KI.get_num_groups() - return (; x = get_num_groups(1), y = get_num_groups(2), z = get_num_groups(3)) + return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3))) end @device_override @inline function KI.get_global_size() - return (; x = get_global_size(1), y = get_global_size(2), z = get_global_size(3)) + return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3))) end @device_override @inline function KA.__validindex(ctx) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 693b2bd9..e58b58a6 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -25,7 +25,7 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 - results = AT(zeros(UInt32, 6, N)) + results = AT(zeros(Int, 6, N)) kernel = test_intrinsics_kernel(backend(), 4, (N,)) kernel(results, ndrange = N) From 685a7d48165e838e9c863670c9ec17e3a476f9bd Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 2 Oct 2025 16:58:13 -0300 Subject: [PATCH 08/44] Format --- src/KernelAbstractions.jl | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index cd3d1fc9..138c63c5 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -814,13 +814,9 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} - KernelIntrinsics.localmemory(t, dims, id) -end +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) -function __synchronize() - KernelIntrinsics.barrier() -end +__synchronize() = KernelIntrinsics.barrier() @generated function __print(items...) str = "" From a54fb5d674fc2b016a25b77f3d8f647fe1242917 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 00:45:13 -0300 Subject: [PATCH 09/44] Launch interface --- src/KernelAbstractions.jl | 7 ++++--- src/intrinsics.jl | 39 ++++++++++++++++++++++++++++++++++++++- src/pocl/backend.jl | 2 +- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 138c63c5..7b626618 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -244,10 +244,10 @@ Declare storage that is local to a workgroup. """ macro localmem(T, dims) # Stay in sync with CUDAnative - id = gensym("static_shmem") + # id = gensym("static_shmem") return quote - $SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id)))) + $SharedMemory($(esc(T)), Val($(esc(dims))))#, Val($(QuoteNode(id)))) end end @@ -814,7 +814,8 @@ include("macros.jl") ### function Scratchpad end -SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) +# SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) +SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.localmemory(t, dims) __synchronize() = KernelIntrinsics.barrier() diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 06011620..77bbf2a3 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -51,10 +51,47 @@ Returns the unique group ID. """ function get_group_id end -function localmemory end +""" + localmemory(T, dims) + +Declare memory that is local to a workgroup. + +!!! note + Backend implementations **must** implement: + ``` + localmemory(T::DataType, ::Val{Dims}) where {T, Dims} + ``` + As well as the on-device functionality. +""" +localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims)) +# @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem"))) + function barrier() error("Group barrier used outside kernel or not captured") end function print end + +""" + KIKernel{Backend, BKern} + +KIKernel closure struct that is used to represent the backend +kernel on the host. + +!!! note + Backend implementations **must** implement: + ``` + KI.KIKernel(::NewBackend, f, args...; kwargs...) + (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + ``` + As well as the on-device functionality. +""" +struct KIKernel{Backend, BKern} + backend::Backend + kern::BKern +end + +function kernel_max_work_group_size end +function max_work_group_size end +function multiprocessor_count end end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index ef0aba6b..4a50cc7e 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -178,7 +178,7 @@ end ## Shared and Scratch Memory -@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id} +@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims} ptr = POCL.emit_localmemory(T, Val(prod(Dims))) CLDeviceArray(Dims, ptr) end From 5a6cd9bf398b1f3b6f2bf9a95d0e06288f26e8ed Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:24:19 -0300 Subject: [PATCH 10/44] Docs --- src/intrinsics.jl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 77bbf2a3..9a271fa2 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -66,9 +66,28 @@ Declare memory that is local to a workgroup. localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims)) # @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem"))) +""" + barrier() + +After a `barrier()` call, all read and writes to global and local memory +from each thread in the workgroup are visible in from all other threads in the +workgroup. + +!!! note + `barrier()` must be encountered by all workitems of a work-group executing the kernel or by none at all. + +!!! note + Backend implementations **must** implement: + ``` + @device_override barrier() + ``` + As well as the on-device functionality. +""" function barrier() error("Group barrier used outside kernel or not captured") end + +# TODO function print end From faa52139ac062e077e52c54c1144c1963d03bf64 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:27:40 -0300 Subject: [PATCH 11/44] New stuff docs --- src/intrinsics.jl | 46 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 9a271fa2..efbe93f0 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -110,7 +110,51 @@ struct KIKernel{Backend, BKern} kern::BKern end +""" + kernel_max_work_group_size(backend, kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function should always be used to determine the workgroup size before +launching a kernel. + +!!! note + Backend implementations **must** implement: + ``` + kernel_max_work_group_size(backend::NewBackend, kern::KIKernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int + ``` + As well as the on-device functionality. +""" function kernel_max_work_group_size end + +""" + max_work_group_size(backend, kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function represents a theoretical maximum; `kernel_max_work_group_size` +should be used before launching a kernel as some backends may error if +kernel launch with too big a workgroup is attempted. + +!!! note + Backend implementations **must** implement: + ``` + max_work_group_size(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" function max_work_group_size end -function multiprocessor_count end + +""" + multiprocessor_count(backend::NewBackend)::Int + +The multiprocessor count for the current device used by `backend`. +Used for certain algorithm optimizations. + +!!! note + Backend implementations **may** implement: + ``` + multiprocessor_count(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +multiprocessor_count(::Backend) = 0 end From 6baa445253f311dba4c7b3350e344bec20f052fd Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 10:36:26 -0300 Subject: [PATCH 12/44] Fix --- src/intrinsics.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index efbe93f0..5492d8f6 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -105,9 +105,9 @@ kernel on the host. ``` As well as the on-device functionality. """ -struct KIKernel{Backend, BKern} - backend::Backend - kern::BKern +struct KIKernel{B, Kern} + backend::B + kern::Kern end """ @@ -156,5 +156,5 @@ Used for certain algorithm optimizations. ``` As well as the on-device functionality. """ -multiprocessor_count(::Backend) = 0 +multiprocessor_count(_) = 0 end From 1953e8d0a6af642948ebe94559a0cdc9e8698931 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 22 Oct 2025 11:50:03 -0300 Subject: [PATCH 13/44] Temp adaptation to test in-progress interface with CUDA.jl --- src/intrinsics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5492d8f6..37cbba88 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -101,7 +101,7 @@ kernel on the host. Backend implementations **must** implement: ``` KI.KIKernel(::NewBackend, f, args...; kwargs...) - (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) ``` As well as the on-device functionality. """ From ba52bbb5c4ba167b14451921cfd8a55738d12fa4 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 4 Nov 2025 13:21:37 -0400 Subject: [PATCH 14/44] Better launch interface --- src/KernelAbstractions.jl | 10 ++--- src/intrinsics.jl | 88 ++++++++++++++++++++++++++++++++++++++- src/pocl/backend.jl | 33 ++++++++++++++- 3 files changed, 123 insertions(+), 8 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 7b626618..275ae740 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -194,6 +194,11 @@ function unsafe_free! end unsafe_free!(::AbstractArray) = return +""" +Abstract type for all KernelAbstractions backends. +""" +abstract type Backend end + include("intrinsics.jl") import .KernelIntrinsics export KernelIntrinsics @@ -500,11 +505,6 @@ constify(arg) = adapt(ConstAdaptor(), arg) # Backend hierarchy ### -""" - -Abstract type for all KernelAbstractions backends. -""" -abstract type Backend end """ Abstract type for all GPU based KernelAbstractions backends. diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 37cbba88..01ab5ca2 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,5 +1,8 @@ module KernelIntrinsics +import ..KernelAbstractions: Backend +import GPUCompiler: split_kwargs, assign_args! + """ get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} @@ -100,7 +103,6 @@ kernel on the host. !!! note Backend implementations **must** implement: ``` - KI.KIKernel(::NewBackend, f, args...; kwargs...) (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) ``` As well as the on-device functionality. @@ -157,4 +159,88 @@ Used for certain algorithm optimizations. As well as the on-device functionality. """ multiprocessor_count(_) = 0 + +# TODO: docstring +# kiconvert(::NewBackend, arg) +function kiconvert end + +# TODO: docstring +# KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} +function kifunction end + +const MACRO_KWARGS = [:launch, :backend] +const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] + +macro kikernel(backend, ex...) + call = ex[end] + kwargs = map(ex[1:end-1]) do kwarg + if kwarg isa Symbol + :($kwarg = $kwarg) + elseif Meta.isexpr(kwarg, :(=)) + kwarg + else + throw(ArgumentError("Invalid keyword argument '$kwarg'")) + end + end + + # destructure the kernel call + Meta.isexpr(call, :call) || throw(ArgumentError("final argument to @kikern should be a function call")) + f = call.args[1] + args = call.args[2:end] + + code = quote end + vars, var_exprs = assign_args!(code, args) + + # group keyword argument + macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs = + split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS) + if !isempty(other_kwargs) + key,val = first(other_kwargs).args + throw(ArgumentError("Unsupported keyword argument '$key'")) + end + + # handle keyword arguments that influence the macro's behavior + launch = true + for kwarg in macro_kwargs + key,val = kwarg.args + if key === :launch + isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @kikern should be a Bool")) + launch = val::Bool + else + throw(ArgumentError("Unsupported keyword argument '$key'")) + end + end + if !launch && !isempty(call_kwargs) + error("@kikern with launch=false does not support launch-time keyword arguments; use them when calling the kernel") + end + + # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5) + # we esc() the whole thing now, necessitating gensyms... + @gensym f_var kernel_f kernel_args kernel_tt kernel + + # convert the arguments, call the compiler and launch the kernel + # while keeping the original arguments alive + push!(code.args, + quote + $f_var = $f + GC.@preserve $(vars...) $f_var begin + $kernel_f = $kiconvert($backend, $f_var) + $kernel_args = map(x -> $kiconvert($backend, x), ($(var_exprs...),)) + $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...} + $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) + if $launch + $kernel($(var_exprs...); $(call_kwargs...)) + end + $kernel + end + end) + + return esc(quote + let + $code + end + end) +end + end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 4a50cc7e..b53bed69 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -2,7 +2,7 @@ module POCLKernels using ..POCL using ..POCL: @device_override, cl, method_table -using ..POCL: device +using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA @@ -138,9 +138,38 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize return nothing end +const KI = KA.KernelIntrinsics + +KI.kiconvert(::POCLBackend, arg) = clconvert(arg) + +function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + kern = clfunction(f, tt; name, kwargs...) + KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) +end + +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) + local_size = isnothing(workgroupsize) ? 1 : workgroupsize + global_size = if isnothing(numworkgroups) + 1 + else + numworkgroups*local_size + end + + obj.kern(args...; local_size, global_size) +end + + +function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int=typemax(Int))::Int + 4096 +end +function KI.max_work_group_size(::POCLBackend)::Int + 4096 +end +function KI.multiprocessor_count(::POCLBackend)::Int + 1 +end ## Indexing Functions -const KI = KA.KernelIntrinsics @device_override @inline function KI.get_local_id() return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3))) From 1fb74a005cb0aec6f6a4ca9b4de180db0180bd65 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 4 Nov 2025 16:35:58 -0400 Subject: [PATCH 15/44] Only keep common compiler kwarg --- src/intrinsics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 01ab5ca2..0ed4eb87 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -169,7 +169,7 @@ function kiconvert end function kifunction end const MACRO_KWARGS = [:launch, :backend] -const COMPILER_KWARGS = [:kernel, :name, :always_inline] +const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] macro kikernel(backend, ex...) From 36c67ce660ec85900a02bad3fc103f3152e8e497 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 12:24:34 -0400 Subject: [PATCH 16/44] Fixup tests --- test/intrinsics.jl | 26 +++++++++++++++++--------- test/testsuite.jl | 1 + 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index e58b58a6..537b29fb 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -1,12 +1,12 @@ -@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results) +function test_intrinsics_kernel(results) # Test all intrinsics return NamedTuples with x, y, z fields - global_size = KernelIntrinsics.get_global_size() - global_id = KernelIntrinsics.get_global_id() - local_size = KernelIntrinsics.get_local_size() - local_id = KernelIntrinsics.get_local_id() - num_groups = KernelIntrinsics.get_num_groups() - group_id = KernelIntrinsics.get_group_id() + global_size = KI.get_global_size() + global_id = KI.get_global_id() + local_size = KI.get_local_size() + local_id = KI.get_local_id() + num_groups = KI.get_num_groups() + group_id = KI.get_group_id() if UInt32(global_id.x) <= UInt32(global_size.x) results[1, global_id.x] = global_id.x @@ -16,6 +16,7 @@ results[5, global_id.x] = local_size.x results[6, global_id.x] = num_groups.x end + return end @@ -23,12 +24,19 @@ function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin @testset "Basic intrinsics functionality" begin + @test KI.max_work_group_size(backend()) isa Int + @test KI.multiprocessor_count(backend()) isa Int + # Test with small kernel N = 16 results = AT(zeros(Int, 6, N)) - kernel = test_intrinsics_kernel(backend(), 4, (N,)) - kernel(results, ndrange = N) + kernel = KI.@kikernel backend() test_intrinsics_kernel(results) + + @test KI.kernel_max_work_group_size(backend(), kernel) isa Int + @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items=1) == 1 + + kernel(results, workgroupsize = 4, numworkgroups = 4) KernelAbstractions.synchronize(backend()) host_results = Array(results) diff --git a/test/testsuite.jl b/test/testsuite.jl index 3426e542..31b801b3 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -1,6 +1,7 @@ module Testsuite using ..KernelAbstractions +import ..KernelAbstractions.KernelIntrinsics as KI using ..Test # We can't add test-dependencies withouth breaking backend packages From 0636279c5b0e8d5756c192aa8a0da741c3144866 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 12:25:48 -0400 Subject: [PATCH 17/44] No `backend` in macro kwargs --- src/intrinsics.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 0ed4eb87..cf3093a9 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -168,7 +168,7 @@ function kiconvert end # KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} function kifunction end -const MACRO_KWARGS = [:launch, :backend] +const MACRO_KWARGS = [:launch] const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] From 48c6b8b61458f809318b65617bdc8bf63cca8be2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 12:25:51 -0400 Subject: [PATCH 18/44] Tests --- src/intrinsics.jl | 52 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index cf3093a9..dfa80bfb 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -160,18 +160,62 @@ Used for certain algorithm optimizations. """ multiprocessor_count(_) = 0 -# TODO: docstring -# kiconvert(::NewBackend, arg) +""" + kiconvert(::NewBackend, arg) + +This function is called for every argument to be passed to a kernel, allowing it to be +converted to a GPU-friendly format. + +!!! note + Backend implementations **must** implement: + ``` + kiconvert(::NewBackend, arg) + ``` +""" function kiconvert end -# TODO: docstring -# KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} +""" + KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + +Low-level interface to compile a function invocation for the currently-active GPU, returning +a callable kernel object. For a higher-level interface, use [`@kikernel`](@ref). + +Currently, only `kifunction` only supports the `name` keyword argument as it is the only one +by all backends. + +Keyword arguments: +- `name`: override the name that the kernel will have in the generated code + +!!! note + Backend implementations **must** implement: + ``` + kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + ``` +""" function kifunction end const MACRO_KWARGS = [:launch] const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] +""" + @kikernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) + +High-level interface for executing code on a GPU. + +The `@kikernel` macro should prefix a call, with `func` a callable function or object that +should return nothing. It will be compiled to a function native to the specified `backend` +upon first use, and to a certain extent arguments will be converted and managed automatically +using `kiconvert`. Finally, if `launch=true`, the newly created callable kernel object is +called and launched according to the specified `backend`. + +There are a few keyword arguments that influence the behavior of `@kikernel`: + +- `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned + kernel object should be launched by calling it and passing arguments again. +- `name`: the name of the kernel in the generated code. Defaults to an automatically- + generated name. +""" macro kikernel(backend, ex...) call = ex[end] kwargs = map(ex[1:end-1]) do kwarg From cc240c67f9d4866e304cde7964f6dcfc65c147b1 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:33:47 -0400 Subject: [PATCH 19/44] Fixes --- src/KernelAbstractions.jl | 12 ++++++------ src/pocl/backend.jl | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 275ae740..67d53206 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -469,25 +469,25 @@ end # Internal kernel functions ### -function __index_Local_Linear(ctx) +@inline function __index_Local_Linear(ctx) return KernelIntrinsics.get_local_id().x end -function __index_Group_Linear(ctx) +@inline function __index_Group_Linear(ctx) return KernelIntrinsics.get_group_id().x end -function __index_Global_Linear(ctx) +@inline function __index_Global_Linear(ctx) return KernelIntrinsics.get_global_id().x end -function __index_Local_Cartesian(ctx) +@inline function __index_Local_Cartesian(ctx) return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x] end -function __index_Group_Cartesian(ctx) +@inline function __index_Group_Cartesian(ctx) return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x] end -function __index_Global_Cartesian(ctx) +@inline function __index_Global_Cartesian(ctx) return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x) end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index b53bed69..ac111687 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -147,7 +147,7 @@ function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) local_size = isnothing(workgroupsize) ? 1 : workgroupsize global_size = if isnothing(numworkgroups) 1 From f41d1e015cc857c3e5b87b9c7c01d16c50e8dcc2 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:34:35 -0400 Subject: [PATCH 20/44] More fix --- src/pocl/backend.jl | 9 +++++---- test/intrinsics.jl | 3 +-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index ac111687..46fd32df 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -56,7 +56,7 @@ KA.functional(::POCLBackend) = true KA.pagelock!(::POCLBackend, x) = nothing KA.get_backend(::Array) = POCLBackend() -KA.synchronize(::POCLBackend) = nothing +KA.synchronize(::POCLBackend) = cl.finish(cl.queue()) KA.supports_float64(::POCLBackend) = true KA.supports_unified(::POCLBackend) = true @@ -160,13 +160,14 @@ end function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int=typemax(Int))::Int - 4096 + wginfo = cl.work_group_info(kikern.kern.fun, device()) + Int(min(wginfo.size, max_work_items)) end function KI.max_work_group_size(::POCLBackend)::Int - 4096 + Int(device().max_work_group_size) end function KI.multiprocessor_count(::POCLBackend)::Int - 1 + Int(device().max_compute_units) end ## Indexing Functions diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 537b29fb..cd75ad3f 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -30,8 +30,7 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 results = AT(zeros(Int, 6, N)) - - kernel = KI.@kikernel backend() test_intrinsics_kernel(results) + kernel = KI.@kikernel backend() launch=false test_intrinsics_kernel(results) @test KI.kernel_max_work_group_size(backend(), kernel) isa Int @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items=1) == 1 From 8e97be1aa76ed6066024b0b4849201466c1bfeac Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:51:40 -0400 Subject: [PATCH 21/44] Print --- src/KernelAbstractions.jl | 18 +----------------- src/intrinsics.jl | 19 ++++++++++++++++++- src/pocl/backend.jl | 2 +- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 67d53206..c53f9989 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -819,23 +819,7 @@ SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.loc __synchronize() = KernelIntrinsics.barrier() -@generated function __print(items...) - str = "" - args = [] - - for i in 1:length(items) - item = :(items[$i]) - T = items[i] - if T <: Val - item = QuoteNode(T.parameters[1]) - end - push!(args, item) - end - - return quote - print($(args...)) - end -end +__print(args...) = KernelIntrinsics._print(args...) # Utils __size(args::Tuple) = Tuple{args...} diff --git a/src/intrinsics.jl b/src/intrinsics.jl index dfa80bfb..665b5903 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -91,7 +91,24 @@ function barrier() end # TODO -function print end +@generated function _print(items...) + str = "" + args = [] + + for i in 1:length(items) + item = :(items[$i]) + T = items[i] + if T <: Val + item = QuoteNode(T.parameters[1]) + end + push!(args, item) + end + + return quote + print($(args...)) + end +end + """ diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 46fd32df..43b8d7b4 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -224,7 +224,7 @@ end work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE) end -@device_override @inline function KA.__print(args...) +@device_override @inline function KI._print(args...) POCL._print(args...) end From bf37826bfc08b5aa53160b4b276c055f1f42cceb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:54:48 -0400 Subject: [PATCH 22/44] tweak --- src/pocl/backend.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 43b8d7b4..60e37bbc 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -5,6 +5,7 @@ using ..POCL: @device_override, cl, method_table using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA +import KA.KernelIntrinsics as KI import StaticArrays @@ -138,8 +139,6 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize return nothing end -const KI = KA.KernelIntrinsics - KI.kiconvert(::POCLBackend, arg) = clconvert(arg) function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} From cae65fe412b3ec65b5f2ada87878505c640ff88c Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:09:13 -0400 Subject: [PATCH 23/44] Fix --- src/pocl/backend.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 60e37bbc..e711600c 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -5,7 +5,7 @@ using ..POCL: @device_override, cl, method_table using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA -import KA.KernelIntrinsics as KI +import KernekAbstractions.KernelIntrinsics as KI import StaticArrays From 88616587fdb69a427189dc184898c631118db922 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:14:43 -0400 Subject: [PATCH 24/44] Ugh --- src/pocl/backend.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index e711600c..b09be712 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -5,7 +5,7 @@ using ..POCL: @device_override, cl, method_table using ..POCL: device, clconvert, clfunction import KernelAbstractions as KA -import KernekAbstractions.KernelIntrinsics as KI +import KernelAbstractions.KernelIntrinsics as KI import StaticArrays From 86c18fccb39338912c9c4de6b42f0a891cae5a34 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:28:24 -0400 Subject: [PATCH 25/44] Format --- src/KernelAbstractions.jl | 5 +---- src/intrinsics.jl | 26 ++++++++++++++------------ src/pocl/backend.jl | 16 ++++++++-------- test/intrinsics.jl | 5 +++-- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index c53f9989..feaafa5b 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -248,11 +248,8 @@ end Declare storage that is local to a workgroup. """ macro localmem(T, dims) - # Stay in sync with CUDAnative - # id = gensym("static_shmem") - return quote - $SharedMemory($(esc(T)), Val($(esc(dims))))#, Val($(QuoteNode(id)))) + $SharedMemory($(esc(T)), Val($(esc(dims)))) end end diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 665b5903..6db136ac 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -66,8 +66,7 @@ Declare memory that is local to a workgroup. ``` As well as the on-device functionality. """ -localmemory(::Type{T}, dims) where T = localmemory(T, Val(dims)) -# @inline localmemory(::Type{T}, dims::Val{Dims}) where {T, Dims} = localmemory(T, dims, Val(gensym("static_shmem"))) +localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims)) """ barrier() @@ -110,7 +109,6 @@ end end - """ KIKernel{Backend, BKern} @@ -235,7 +233,7 @@ There are a few keyword arguments that influence the behavior of `@kikernel`: """ macro kikernel(backend, ex...) call = ex[end] - kwargs = map(ex[1:end-1]) do kwarg + kwargs = map(ex[1:(end - 1)]) do kwarg if kwarg isa Symbol :($kwarg = $kwarg) elseif Meta.isexpr(kwarg, :(=)) @@ -257,14 +255,14 @@ macro kikernel(backend, ex...) macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs = split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS) if !isempty(other_kwargs) - key,val = first(other_kwargs).args + key, val = first(other_kwargs).args throw(ArgumentError("Unsupported keyword argument '$key'")) end # handle keyword arguments that influence the macro's behavior launch = true for kwarg in macro_kwargs - key,val = kwarg.args + key, val = kwarg.args if key === :launch isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @kikern should be a Bool")) launch = val::Bool @@ -282,7 +280,8 @@ macro kikernel(backend, ex...) # convert the arguments, call the compiler and launch the kernel # while keeping the original arguments alive - push!(code.args, + push!( + code.args, quote $f_var = $f GC.@preserve $(vars...) $f_var begin @@ -295,13 +294,16 @@ macro kikernel(backend, ex...) end $kernel end - end) + end + ) - return esc(quote - let - $code + return esc( + quote + let + $code + end end - end) + ) end end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index b09be712..c38188b3 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -141,32 +141,32 @@ end KI.kiconvert(::POCLBackend, arg) = clconvert(arg) -function KI.kifunction(::POCLBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} +function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name=nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) local_size = isnothing(workgroupsize) ? 1 : workgroupsize global_size = if isnothing(numworkgroups) 1 else - numworkgroups*local_size + numworkgroups * local_size end - obj.kern(args...; local_size, global_size) + return obj.kern(args...; local_size, global_size) end -function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int=typemax(Int))::Int +function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int wginfo = cl.work_group_info(kikern.kern.fun, device()) - Int(min(wginfo.size, max_work_items)) + return Int(min(wginfo.size, max_work_items)) end function KI.max_work_group_size(::POCLBackend)::Int - Int(device().max_work_group_size) + return Int(device().max_work_group_size) end function KI.multiprocessor_count(::POCLBackend)::Int - Int(device().max_compute_units) + return Int(device().max_compute_units) end ## Indexing Functions diff --git a/test/intrinsics.jl b/test/intrinsics.jl index cd75ad3f..b3b95d50 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -30,10 +30,10 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 results = AT(zeros(Int, 6, N)) - kernel = KI.@kikernel backend() launch=false test_intrinsics_kernel(results) + kernel = KI.@kikernel backend() launch = false test_intrinsics_kernel(results) @test KI.kernel_max_work_group_size(backend(), kernel) isa Int - @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items=1) == 1 + @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items = 1) == 1 kernel(results, workgroupsize = 4, numworkgroups = 4) KernelAbstractions.synchronize(backend()) @@ -66,4 +66,5 @@ function intrinsics_testsuite(backend, AT) end end end + return nothing end From 0a6f1720a946a37e7da78d4e46b1f3c032ae16f9 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:30:02 -0400 Subject: [PATCH 26/44] Format --- src/pocl/backend.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index c38188b3..3aac42a5 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -141,9 +141,9 @@ end KI.kiconvert(::POCLBackend, arg) = clconvert(arg) -function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name=nothing, kwargs...) where {F, TT} +function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) - KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) + return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) From 2371f884bfe527064c60d5683317dbfa0e585423 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 19:32:24 -0400 Subject: [PATCH 27/44] drbh --- test/intrinsics.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index b3b95d50..6fbf55c0 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -1,4 +1,3 @@ - function test_intrinsics_kernel(results) # Test all intrinsics return NamedTuples with x, y, z fields global_size = KI.get_global_size() @@ -19,7 +18,6 @@ function test_intrinsics_kernel(results) return end - function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin @testset "Basic intrinsics functionality" begin From 0a45913f5ca0d69136bae016d1fb623dcdddbb00 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 5 Nov 2025 20:29:39 -0400 Subject: [PATCH 28/44] `_print` docstring --- src/intrinsics.jl | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 6db136ac..4eddf4a3 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -89,7 +89,20 @@ function barrier() error("Group barrier used outside kernel or not captured") end -# TODO +""" + _print(items...) + + Overloaded by backends to enable `KernelAbstractions.@print` + functionality. + +!!! note + Backend implementations **must** implement: + ``` + _print(items...) + ``` + As well as the on-device functionality, + or define it to return `nothing` +""" @generated function _print(items...) str = "" args = [] From ff3b0777b0abd799fcd8c6105d221d789b03599d Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:36:56 -0400 Subject: [PATCH 29/44] Test all launch size options --- test/intrinsics.jl | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 6fbf55c0..79996850 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -20,6 +20,56 @@ end function intrinsics_testsuite(backend, AT) @testset "KernelIntrinsics Tests" begin + @testset "Launch parameters" begin + # 1d + function launch_kernel1d(arr) + i, _, _ = KI.get_local_id() + gi, _, _ = KI.get_group_id() + ngi, _, _ = KI.get_num_groups() + + arr[(gi - 1) * ngi + i] = 1f0 + return + end + arr1d = AT(zeros(Float32, 4)) + KI.@kikernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d) + KernelAbstractions.synchronize(backend()) + @test all(Array(arr1d) .== 1) + + # 1d tuple + arr1dt = AT(zeros(Float32, 4)) + KI.@kikernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt) + KernelAbstractions.synchronize(backend()) + @test all(Array(arr1dt) .== 1) + + # 2d + function launch_kernel2d(arr) + i, j, _ = KI.get_local_id() + gi, gj, _ = KI.get_group_id() + ngi, ngj, _ = KI.get_num_groups() + + arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j] = 1f0 + return + end + arr2d = AT(zeros(Float32, 4, 4)) + KI.@kikernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d) + KernelAbstractions.synchronize(backend()) + @test all(Array(arr2d) .== 1) + + # 3d + function launch_kernel3d(arr) + i, j, k = KI.get_local_id() + gi, gj, gk = KI.get_group_id() + ngi, ngj, ngk = KI.get_num_groups() + + arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j, (gk - 1) * ngk + k] = 1f0 + return + end + arr3d = AT(zeros(Float32, 4, 4, 4)) + KI.@kikernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d) + KernelAbstractions.synchronize(backend()) + @test all(Array(arr3d) .== 1) + end + @testset "Basic intrinsics functionality" begin @test KI.max_work_group_size(backend()) isa Int From 0732ae094ebbc2724e7f2c87b4279dd693308249 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:39:45 -0400 Subject: [PATCH 30/44] Fix backend --- src/pocl/backend.jl | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 3aac42a5..e66720ed 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -146,17 +146,23 @@ function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kw return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) - local_size = isnothing(workgroupsize) ? 1 : workgroupsize - global_size = if isnothing(numworkgroups) - 1 - else - numworkgroups * local_size +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) + local_size = StaticArrays.MVector{3}((1,1,1)) + if !isnothing(workgroupsize) + for (i, val) in enumerate(workgroupsize) + local_size[i] = val + end end - return obj.kern(args...; local_size, global_size) -end + global_size = StaticArrays.MVector{3}((1,1,1)) + if !isnothing(numworkgroups) + for (i, val) in enumerate(numworkgroups) + global_size[i] = val * local_size[i] + end + end + obj.kern(args...; local_size, global_size) +end function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int wginfo = cl.work_group_info(kikern.kern.fun, device()) From 7d6be1bcf9bcf70257ffb97ba7005f9644f449bf Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:54:31 -0400 Subject: [PATCH 31/44] Format --- src/pocl/backend.jl | 8 ++++---- test/intrinsics.jl | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index e66720ed..41bbea58 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -146,22 +146,22 @@ function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kw return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups=nothing, workgroupsize=nothing) - local_size = StaticArrays.MVector{3}((1,1,1)) +function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) + local_size = StaticArrays.MVector{3}((1, 1, 1)) if !isnothing(workgroupsize) for (i, val) in enumerate(workgroupsize) local_size[i] = val end end - global_size = StaticArrays.MVector{3}((1,1,1)) + global_size = StaticArrays.MVector{3}((1, 1, 1)) if !isnothing(numworkgroups) for (i, val) in enumerate(numworkgroups) global_size[i] = val * local_size[i] end end - obj.kern(args...; local_size, global_size) + return obj.kern(args...; local_size, global_size) end function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 79996850..3ec26c56 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -27,7 +27,7 @@ function intrinsics_testsuite(backend, AT) gi, _, _ = KI.get_group_id() ngi, _, _ = KI.get_num_groups() - arr[(gi - 1) * ngi + i] = 1f0 + arr[(gi - 1) * ngi + i] = 1.0f0 return end arr1d = AT(zeros(Float32, 4)) @@ -47,7 +47,7 @@ function intrinsics_testsuite(backend, AT) gi, gj, _ = KI.get_group_id() ngi, ngj, _ = KI.get_num_groups() - arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j] = 1f0 + arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j] = 1.0f0 return end arr2d = AT(zeros(Float32, 4, 4)) @@ -61,7 +61,7 @@ function intrinsics_testsuite(backend, AT) gi, gj, gk = KI.get_group_id() ngi, ngj, ngk = KI.get_num_groups() - arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j, (gk - 1) * ngk + k] = 1f0 + arr[(gi - 1) * ngi + i, (gj - 1) * ngj + j, (gk - 1) * ngk + k] = 1.0f0 return end arr3d = AT(zeros(Float32, 4, 4, 4)) From 04f21ec918a8a9bb9ce2482bdd4d78d7458d85ab Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 12:18:43 -0400 Subject: [PATCH 32/44] More consistent docstrings --- src/intrinsics.jl | 47 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 4eddf4a3..fb258578 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -7,6 +7,12 @@ import GPUCompiler: split_kwargs, assign_args! get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of global work-items specified. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_global_size()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_global_size end @@ -17,6 +23,12 @@ Returns the unique global work-item ID. !!! note 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_global_id()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_global_id end @@ -24,6 +36,12 @@ function get_global_id end get_local_size()::@NamedTuple{x::Int, y::Int, z::Int} Return the number of local work-items specified. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_local_size()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_local_size end @@ -34,6 +52,12 @@ Returns the unique local work-item ID. !!! note 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_local_id()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_local_id end @@ -41,6 +65,12 @@ function get_local_id end get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int} Returns the number of groups. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_num_groups end @@ -51,6 +81,12 @@ Returns the unique group ID. !!! note 1-based. + +!!! note + Backend implementations **must** implement: + ``` + @device_override get_group_id()::@NamedTuple{x::Int, y::Int, z::Int} + ``` """ function get_group_id end @@ -62,7 +98,7 @@ Declare memory that is local to a workgroup. !!! note Backend implementations **must** implement: ``` - localmemory(T::DataType, ::Val{Dims}) where {T, Dims} + @device_override localmemory(T::DataType, ::Val{Dims}) where {T, Dims} ``` As well as the on-device functionality. """ @@ -83,14 +119,13 @@ workgroup. ``` @device_override barrier() ``` - As well as the on-device functionality. """ function barrier() error("Group barrier used outside kernel or not captured") end """ - _print(items...) + _print(args...) Overloaded by backends to enable `KernelAbstractions.@print` functionality. @@ -98,10 +133,10 @@ end !!! note Backend implementations **must** implement: ``` - _print(items...) + @device_override _print(args...) ``` - As well as the on-device functionality, - or define it to return `nothing` + If the backend does not support printing, + define it to return `nothing`. """ @generated function _print(items...) str = "" From d068b80405a483332a7afcf856d35ec5fde7263a Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 14:05:19 -0400 Subject: [PATCH 33/44] Qualify `map` --- src/intrinsics.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index fb258578..a70370b0 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -334,8 +334,8 @@ macro kikernel(backend, ex...) $f_var = $f GC.@preserve $(vars...) $f_var begin $kernel_f = $kiconvert($backend, $f_var) - $kernel_args = map(x -> $kiconvert($backend, x), ($(var_exprs...),)) - $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...} + $kernel_args = Base.map(x -> $kiconvert($backend, x), ($(var_exprs...),)) + $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...} $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) if $launch $kernel($(var_exprs...); $(call_kwargs...)) From 2fe4502e8418058f2f46a7d39f379204e0ddad68 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:44:28 -0400 Subject: [PATCH 34/44] `KIKernel` -> `Kernel` --- src/intrinsics.jl | 10 +++++----- src/pocl/backend.jl | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index a70370b0..5f48d338 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -158,19 +158,19 @@ end """ - KIKernel{Backend, BKern} + Kernel{Backend, BKern} -KIKernel closure struct that is used to represent the backend +Kernel closure struct that is used to represent the backend kernel on the host. !!! note Backend implementations **must** implement: ``` - (kernel::KIKernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) + (kernel::Kernel{<:NewBackend})(args...; numworkgroups=nothing, workgroupsize=nothing, kwargs...) ``` As well as the on-device functionality. """ -struct KIKernel{B, Kern} +struct Kernel{B, Kern} backend::B kern::Kern end @@ -185,7 +185,7 @@ launching a kernel. !!! note Backend implementations **must** implement: ``` - kernel_max_work_group_size(backend::NewBackend, kern::KIKernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int + kernel_max_work_group_size(backend::NewBackend, kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int ``` As well as the on-device functionality. """ diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 41bbea58..7383a66f 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -143,10 +143,10 @@ KI.kiconvert(::POCLBackend, arg) = clconvert(arg) function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) - return KI.KIKernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) + return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end -function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) +function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing) local_size = StaticArrays.MVector{3}((1, 1, 1)) if !isnothing(workgroupsize) for (i, val) in enumerate(workgroupsize) @@ -164,7 +164,7 @@ function (obj::KI.KIKernel{POCLBackend})(args...; numworkgroups = nothing, workg return obj.kern(args...; local_size, global_size) end -function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.KIKernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int +function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int wginfo = cl.work_group_info(kikern.kern.fun, device()) return Int(min(wginfo.size, max_work_items)) end From aaf20a7942d517aee62d561532675123f7b87ec3 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:46:51 -0400 Subject: [PATCH 35/44] Remove redundant parameter --- src/intrinsics.jl | 4 ++-- src/pocl/backend.jl | 2 +- test/intrinsics.jl | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5f48d338..eda58ce4 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -176,7 +176,7 @@ struct Kernel{B, Kern} end """ - kernel_max_work_group_size(backend, kern; [max_work_items::Int])::Int + kernel_max_work_group_size(kern; [max_work_items::Int])::Int The maximum workgroup size limit for a kernel as reported by the backend. This function should always be used to determine the workgroup size before @@ -185,7 +185,7 @@ launching a kernel. !!! note Backend implementations **must** implement: ``` - kernel_max_work_group_size(backend::NewBackend, kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int + kernel_max_work_group_size(kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int ``` As well as the on-device functionality. """ diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 7383a66f..08e5a76e 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -164,7 +164,7 @@ function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgro return obj.kern(args...; local_size, global_size) end -function KI.kernel_max_work_group_size(::POCLBackend, kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int +function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int wginfo = cl.work_group_info(kikern.kern.fun, device()) return Int(min(wginfo.size, max_work_items)) end diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 3ec26c56..911e544f 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -80,8 +80,8 @@ function intrinsics_testsuite(backend, AT) results = AT(zeros(Int, 6, N)) kernel = KI.@kikernel backend() launch = false test_intrinsics_kernel(results) - @test KI.kernel_max_work_group_size(backend(), kernel) isa Int - @test KI.kernel_max_work_group_size(backend(), kernel; max_work_items = 1) == 1 + @test KI.kernel_max_work_group_size(kernel) isa Int + @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1 kernel(results, workgroupsize = 4, numworkgroups = 4) KernelAbstractions.synchronize(backend()) From e2941fbcb0c21a9cea78b40fb526deba07a8f8a9 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 19:13:27 -0400 Subject: [PATCH 36/44] `kiconvert` -> `argconvert` --- src/intrinsics.jl | 12 ++++++------ src/pocl/backend.jl | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index eda58ce4..ad2c8f6c 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -224,7 +224,7 @@ Used for certain algorithm optimizations. multiprocessor_count(_) = 0 """ - kiconvert(::NewBackend, arg) + argconvert(::NewBackend, arg) This function is called for every argument to be passed to a kernel, allowing it to be converted to a GPU-friendly format. @@ -232,10 +232,10 @@ converted to a GPU-friendly format. !!! note Backend implementations **must** implement: ``` - kiconvert(::NewBackend, arg) + argconvert(::NewBackend, arg) ``` """ -function kiconvert end +function argconvert end """ KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} @@ -269,7 +269,7 @@ High-level interface for executing code on a GPU. The `@kikernel` macro should prefix a call, with `func` a callable function or object that should return nothing. It will be compiled to a function native to the specified `backend` upon first use, and to a certain extent arguments will be converted and managed automatically -using `kiconvert`. Finally, if `launch=true`, the newly created callable kernel object is +using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is called and launched according to the specified `backend`. There are a few keyword arguments that influence the behavior of `@kikernel`: @@ -333,8 +333,8 @@ macro kikernel(backend, ex...) quote $f_var = $f GC.@preserve $(vars...) $f_var begin - $kernel_f = $kiconvert($backend, $f_var) - $kernel_args = Base.map(x -> $kiconvert($backend, x), ($(var_exprs...),)) + $kernel_f = $argconvert($backend, $f_var) + $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),)) $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...} $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) if $launch diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 08e5a76e..f5226bc7 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -139,7 +139,7 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize return nothing end -KI.kiconvert(::POCLBackend, arg) = clconvert(arg) +KI.argconvert(::POCLBackend, arg) = clconvert(arg) function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) From 5262c9e6fbd6f05332dfbb45037e737693a94e52 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 19:16:05 -0400 Subject: [PATCH 37/44] Remove old definition --- src/intrinsics.jl | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index ad2c8f6c..172d1de1 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -138,23 +138,8 @@ end If the backend does not support printing, define it to return `nothing`. """ -@generated function _print(items...) - str = "" - args = [] - - for i in 1:length(items) - item = :(items[$i]) - T = items[i] - if T <: Val - item = QuoteNode(T.parameters[1]) - end - push!(args, item) - end +function _print end - return quote - print($(args...)) - end -end """ From ca11482c526749b4c47ad3e23068ff8daed8c673 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Thu, 6 Nov 2025 19:28:48 -0400 Subject: [PATCH 38/44] Unbreak ABI --- src/KernelAbstractions.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index feaafa5b..95e71b12 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -248,8 +248,11 @@ end Declare storage that is local to a workgroup. """ macro localmem(T, dims) + # Stay in sync with CUDAnative + id = gensym("static_shmem") + return quote - $SharedMemory($(esc(T)), Val($(esc(dims)))) + $SharedMemory($(esc(T)), Val($(esc(dims))), Val($(QuoteNode(id)))) end end @@ -811,8 +814,7 @@ include("macros.jl") ### function Scratchpad end -# SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id) -SharedMemory(t::Type{T}, dims::Val{Dims}) where {T, Dims} = KernelIntrinsics.localmemory(t, dims) +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims) __synchronize() = KernelIntrinsics.barrier() From d6f3e66dad2ce09fdcba1b0d2256fb4fd7e8dd9c Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 09:05:50 -0400 Subject: [PATCH 39/44] Readd old definition --- src/KernelAbstractions.jl | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 95e71b12..fb48fd7b 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -369,6 +369,25 @@ macro context() return esc(:(__ctx__)) end +# Defined to keep cpu support for `__print` +@generated function KernelIntrinsics._print(items...) + str = "" + args = [] + + for i in 1:length(items) + item = :(items[$i]) + T = items[i] + if T <: Val + item = QuoteNode(T.parameters[1]) + end + push!(args, item) + end + + return quote + print($(args...)) + end +end + """ @print(items...) From 1400c9f061110e5e6732d66d7dfea2cf7914977e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 09:15:21 -0400 Subject: [PATCH 40/44] Naming --- src/intrinsics.jl | 12 ++++++------ src/pocl/backend.jl | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 172d1de1..5904d3a5 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -143,7 +143,7 @@ function _print end """ - Kernel{Backend, BKern} + Kernel{Backend, Kern} Kernel closure struct that is used to represent the backend kernel on the host. @@ -223,12 +223,12 @@ converted to a GPU-friendly format. function argconvert end """ - KI.kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + KI.gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} Low-level interface to compile a function invocation for the currently-active GPU, returning a callable kernel object. For a higher-level interface, use [`@kikernel`](@ref). -Currently, only `kifunction` only supports the `name` keyword argument as it is the only one +Currently, only `gpufunction` only supports the `name` keyword argument as it is the only one by all backends. Keyword arguments: @@ -237,10 +237,10 @@ Keyword arguments: !!! note Backend implementations **must** implement: ``` - kifunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} ``` """ -function kifunction end +function gpufunction end const MACRO_KWARGS = [:launch] const COMPILER_KWARGS = [:name] @@ -321,7 +321,7 @@ macro kikernel(backend, ex...) $kernel_f = $argconvert($backend, $f_var) $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),)) $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...} - $kernel = $kifunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) + $kernel = $gpufunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) if $launch $kernel($(var_exprs...); $(call_kwargs...)) end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index f5226bc7..913fbe37 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -141,7 +141,7 @@ end KI.argconvert(::POCLBackend, arg) = clconvert(arg) -function KI.kifunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} +function KI.gpufunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end @@ -164,8 +164,8 @@ function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgro return obj.kern(args...; local_size, global_size) end -function KI.kernel_max_work_group_size(kikern::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int - wginfo = cl.work_group_info(kikern.kern.fun, device()) +function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int + wginfo = cl.work_group_info(kernel.kern.fun, device()) return Int(min(wginfo.size, max_work_items)) end function KI.max_work_group_size(::POCLBackend)::Int From 055a81f04ec9d358e78b381a07b293710c5b48d0 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 09:35:34 -0400 Subject: [PATCH 41/44] Reword --- src/intrinsics.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5904d3a5..5960033c 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -211,8 +211,8 @@ multiprocessor_count(_) = 0 """ argconvert(::NewBackend, arg) -This function is called for every argument to be passed to a kernel, allowing it to be -converted to a GPU-friendly format. +This function is called for every argument to be passed to a kernel, +converting them to their device side representation. !!! note Backend implementations **must** implement: From 913f8c8b048e0d72065f9243df5a20784f08972f Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 13:49:35 -0400 Subject: [PATCH 42/44] Docstrings --- src/intrinsics.jl | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index 5960033c..fb1b00ad 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,3 +1,15 @@ +""" +# KernelIntrinics + +The `KernelIntrinics` module defines the API interface for backends to define various lower-level device and +host-side functionality. The `KernelIntrinsics` intrinsics are used to define the higher-level device-side +intrinsics functionality in `KernelAbstractions`. + +Both provide APIs for host and device-side functionality, but `KernelIntrinsics` focuses on on lower-level +functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality +such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions +like allocating arrays on a backend. +""" module KernelIntrinsics import ..KernelAbstractions: Backend @@ -111,6 +123,9 @@ After a `barrier()` call, all read and writes to global and local memory from each thread in the workgroup are visible in from all other threads in the workgroup. +This does **not** guarantee that a write from a thread in a certain workgroup will +be visible to a thread in a different workgroup. + !!! note `barrier()` must be encountered by all workitems of a work-group executing the kernel or by none at all. From 298e8078888ece5f2af69deb36ee0692ef0cb4a6 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 14:03:50 -0400 Subject: [PATCH 43/44] Rename `kikernel` --- src/intrinsics.jl | 16 +++++++++++----- test/intrinsics.jl | 10 +++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/intrinsics.jl b/src/intrinsics.jl index fb1b00ad..cd94d473 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -241,7 +241,7 @@ function argconvert end KI.gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} Low-level interface to compile a function invocation for the currently-active GPU, returning -a callable kernel object. For a higher-level interface, use [`@kikernel`](@ref). +a callable kernel object. For a higher-level interface, use [`KernelIntrinsics.@kernel`](@ref). Currently, only `gpufunction` only supports the `name` keyword argument as it is the only one by all backends. @@ -262,24 +262,30 @@ const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] """ - @kikernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) + KernelIntrinsics.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) High-level interface for executing code on a GPU. -The `@kikernel` macro should prefix a call, with `func` a callable function or object that +The `KernelIntrinsics.@kernel` macro should prefix a call, with `func` a callable function or object that should return nothing. It will be compiled to a function native to the specified `backend` upon first use, and to a certain extent arguments will be converted and managed automatically using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is called and launched according to the specified `backend`. -There are a few keyword arguments that influence the behavior of `@kikernel`: +There are a few keyword arguments that influence the behavior of `KernelIntrinsics.@kernel`: - `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned kernel object should be launched by calling it and passing arguments again. - `name`: the name of the kernel in the generated code. Defaults to an automatically- generated name. + +!!! note + `KernelIntrinsics.@kernel` differs from the `KernelAbstractions` macro in that this macro acts + a wrapper around backend kernel compilation/launching (such as `@cuda`, `@metal`, etc.). It is + used when calling a function to be run on a specific backend, while `KernelAbstractions.@kernel` + is used kernel definition for use with the original higher-level `KernelAbstractions` API. """ -macro kikernel(backend, ex...) +macro kernel(backend, ex...) call = ex[end] kwargs = map(ex[1:(end - 1)]) do kwarg if kwarg isa Symbol diff --git a/test/intrinsics.jl b/test/intrinsics.jl index 911e544f..7a7ec1e0 100644 --- a/test/intrinsics.jl +++ b/test/intrinsics.jl @@ -31,13 +31,13 @@ function intrinsics_testsuite(backend, AT) return end arr1d = AT(zeros(Float32, 4)) - KI.@kikernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d) + KI.@kernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d) KernelAbstractions.synchronize(backend()) @test all(Array(arr1d) .== 1) # 1d tuple arr1dt = AT(zeros(Float32, 4)) - KI.@kikernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt) + KI.@kernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt) KernelAbstractions.synchronize(backend()) @test all(Array(arr1dt) .== 1) @@ -51,7 +51,7 @@ function intrinsics_testsuite(backend, AT) return end arr2d = AT(zeros(Float32, 4, 4)) - KI.@kikernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d) + KI.@kernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d) KernelAbstractions.synchronize(backend()) @test all(Array(arr2d) .== 1) @@ -65,7 +65,7 @@ function intrinsics_testsuite(backend, AT) return end arr3d = AT(zeros(Float32, 4, 4, 4)) - KI.@kikernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d) + KI.@kernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d) KernelAbstractions.synchronize(backend()) @test all(Array(arr3d) .== 1) end @@ -78,7 +78,7 @@ function intrinsics_testsuite(backend, AT) # Test with small kernel N = 16 results = AT(zeros(Int, 6, N)) - kernel = KI.@kikernel backend() launch = false test_intrinsics_kernel(results) + kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results) @test KI.kernel_max_work_group_size(kernel) isa Int @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1 From 42f17d6ee9766662732eb0de896d1c0ff5b37ad4 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 17:17:29 -0400 Subject: [PATCH 44/44] Renaming --- src/KernelAbstractions.jl | 24 ++++++++++++------------ src/intrinsics.jl | 31 +++++++++++++++++-------------- src/pocl/backend.jl | 2 +- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index fb48fd7b..a0f57c74 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -200,8 +200,8 @@ Abstract type for all KernelAbstractions backends. abstract type Backend end include("intrinsics.jl") -import .KernelIntrinsics -export KernelIntrinsics +import .KernelIntrinsics: KernelIntrinsics, KI +export KernelIntrinsics, KI ### # Kernel language @@ -370,7 +370,7 @@ macro context() end # Defined to keep cpu support for `__print` -@generated function KernelIntrinsics._print(items...) +@generated function KI._print(items...) str = "" args = [] @@ -489,25 +489,25 @@ end ### @inline function __index_Local_Linear(ctx) - return KernelIntrinsics.get_local_id().x + return KI.get_local_id().x end @inline function __index_Group_Linear(ctx) - return KernelIntrinsics.get_group_id().x + return KI.get_group_id().x end @inline function __index_Global_Linear(ctx) - return KernelIntrinsics.get_global_id().x + return KI.get_global_id().x end @inline function __index_Local_Cartesian(ctx) - return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x] + return @inbounds workitems(__iterspace(ctx))[KI.get_local_id().x] end @inline function __index_Group_Cartesian(ctx) - return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x] + return @inbounds blocks(__iterspace(ctx))[KI.get_group_id().x] end @inline function __index_Global_Cartesian(ctx) - return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x) + return @inbounds expand(__iterspace(ctx), KI.get_group_id().x, KI.get_local_id().x) end @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) @@ -833,11 +833,11 @@ include("macros.jl") ### function Scratchpad end -SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims) +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims) -__synchronize() = KernelIntrinsics.barrier() +__synchronize() = KI.barrier() -__print(args...) = KernelIntrinsics._print(args...) +__print(args...) = KI._print(args...) # Utils __size(args::Tuple) = Tuple{args...} diff --git a/src/intrinsics.jl b/src/intrinsics.jl index cd94d473..e0b2c46a 100644 --- a/src/intrinsics.jl +++ b/src/intrinsics.jl @@ -1,17 +1,20 @@ """ -# KernelIntrinics +# `KernelIntrinics`/`KI` -The `KernelIntrinics` module defines the API interface for backends to define various lower-level device and -host-side functionality. The `KernelIntrinsics` intrinsics are used to define the higher-level device-side +The `KernelIntrinics` (or `KI`) module defines the API interface for backends to define various lower-level device and +host-side functionality. The `KI` intrinsics are used to define the higher-level device-side intrinsics functionality in `KernelAbstractions`. -Both provide APIs for host and device-side functionality, but `KernelIntrinsics` focuses on on lower-level +Both provide APIs for host and device-side functionality, but `KI` focuses on on lower-level functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions like allocating arrays on a backend. """ module KernelIntrinsics +const KI = KernelIntrinsics +export KI + import ..KernelAbstractions: Backend import GPUCompiler: split_kwargs, assign_args! @@ -238,12 +241,12 @@ converting them to their device side representation. function argconvert end """ - KI.gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + KI.kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} Low-level interface to compile a function invocation for the currently-active GPU, returning -a callable kernel object. For a higher-level interface, use [`KernelIntrinsics.@kernel`](@ref). +a callable kernel object. For a higher-level interface, use [`KI.@kernel`](@ref). -Currently, only `gpufunction` only supports the `name` keyword argument as it is the only one +Currently, `kernel_function` only supports the `name` keyword argument as it is the only one by all backends. Keyword arguments: @@ -252,27 +255,27 @@ Keyword arguments: !!! note Backend implementations **must** implement: ``` - gpufunction(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} ``` """ -function gpufunction end +function kernel_function end const MACRO_KWARGS = [:launch] const COMPILER_KWARGS = [:name] const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] """ - KernelIntrinsics.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) + KI.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) High-level interface for executing code on a GPU. -The `KernelIntrinsics.@kernel` macro should prefix a call, with `func` a callable function or object that +The `KI.@kernel` macro should prefix a call, with `func` a callable function or object that should return nothing. It will be compiled to a function native to the specified `backend` upon first use, and to a certain extent arguments will be converted and managed automatically using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is called and launched according to the specified `backend`. -There are a few keyword arguments that influence the behavior of `KernelIntrinsics.@kernel`: +There are a few keyword arguments that influence the behavior of `KI.@kernel`: - `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned kernel object should be launched by calling it and passing arguments again. @@ -280,7 +283,7 @@ There are a few keyword arguments that influence the behavior of `KernelIntrinsi generated name. !!! note - `KernelIntrinsics.@kernel` differs from the `KernelAbstractions` macro in that this macro acts + `KI.@kernel` differs from the `KernelAbstractions` macro in that this macro acts a wrapper around backend kernel compilation/launching (such as `@cuda`, `@metal`, etc.). It is used when calling a function to be run on a specific backend, while `KernelAbstractions.@kernel` is used kernel definition for use with the original higher-level `KernelAbstractions` API. @@ -342,7 +345,7 @@ macro kernel(backend, ex...) $kernel_f = $argconvert($backend, $f_var) $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),)) $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...} - $kernel = $gpufunction($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) + $kernel = $kernel_function($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...)) if $launch $kernel($(var_exprs...); $(call_kwargs...)) end diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl index 913fbe37..14fe6ae8 100644 --- a/src/pocl/backend.jl +++ b/src/pocl/backend.jl @@ -141,7 +141,7 @@ end KI.argconvert(::POCLBackend, arg) = clconvert(arg) -function KI.gpufunction(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} +function KI.kernel_function(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT} kern = clfunction(f, tt; name, kwargs...) return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern) end