diff --git a/examples/histogram.jl b/examples/histogram.jl
index 9edc0293..29d3bf57 100644
--- a/examples/histogram.jl
+++ b/examples/histogram.jl
@@ -12,16 +12,15 @@ function create_histogram(input)
     return histogram_output
 end
 
-# This a 1D histogram kernel where the histogramming happens on shmem
-@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input)
-    gid = @index(Group, Linear)
-    lid = @index(Local, Linear)
+# This is a 1D histogram kernel where the histogramming happens on static shmem
+function histogram_kernel!(histogram_output, input, ::Val{gs}) where {gs}
+    gid = KI.get_group_id().x
+    lid = KI.get_local_id().x
 
-    @uniform gs = prod(@groupsize())
     tid = (gid - 1) * gs + lid
-    @uniform N = length(histogram_output)
+    N = length(histogram_output)
 
-    shared_histogram = @localmem eltype(input) (gs)
+    shared_histogram = KI.localmemory(eltype(input), gs)
 
     # This will go through all input elements and assign them to a location in
     # shmem. Note that if there is not enough shem, we create different shmem
@@ -32,7 +31,7 @@ end
         # Setting shared_histogram to 0
         @inbounds shared_histogram[lid] = 0
-        @synchronize()
+        KI.barrier()
 
         max_element = min_element + gs
         if max_element > N
@@ -46,21 +45,20 @@ end
             @atomic shared_histogram[bin] += 1
         end
 
-        @synchronize()
+        KI.barrier()
 
         if ((lid + min_element - 1) <= N)
             @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
         end
 
     end
-
+    return
 end
 
 function histogram!(histogram_output, input, groupsize = 256)
     backend = get_backend(histogram_output)
     # Need static block size
-    kernel! = histogram_kernel!(backend, (groupsize,))
-    kernel!(histogram_output, input, ndrange = size(input))
+    KI.@kernel backend workgroupsize = groupsize numworkgroups = cld(length(input), groupsize) histogram_kernel!(histogram_output, input, Val(groupsize))
    return
 end
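The host-side entry point behaves as before despite the new launch path; a minimal usage sketch, assuming an integer-binned input as elsewhere in this example file and a functional default backend (the driver code below is illustrative, not part of the diff):

```julia
# Bins are 1:128, so the histogram has 128 entries and their sum
# must equal the number of input elements.
input = rand(1:128, 10_000)
hist = create_histogram(input)
@assert sum(hist) == length(input)
```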
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index ac56edb5..905b3a05 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -9,70 +9,68 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl"))
 # # Metal sometimes supports fewer.
 const TILE_DIM = 16
 
-@kernel unsafe_indices = true function coalesced_matmul_kernel!(
-        output, @Const(input1), @Const(input2), N, R, M,
-        ::Val{BANK} = Val(1),
-    ) where {BANK}
-    gi, gj = @index(Group, NTuple)
-    i, j = @index(Local, NTuple)
-
-    TILE_DIM = @uniform @groupsize()[1]
+function coalesced_matmul_kernel!(
+        output, input1, input2, N, R, M,
+        ::Val{TDIM}, ::Val{BANK} = Val(1)
+    ) where {TDIM, BANK}
+    gi, gj, _ = KI.get_group_id()
+    i, j, _ = KI.get_local_id()
 
     # +1 to avoid bank conflicts on shared memory
-    tile1 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
-    tile2 = @localmem eltype(output) (TILE_DIM + BANK, TILE_DIM)
+    tile1 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
+    tile2 = KI.localmemory(eltype(output), (TDIM + BANK, TDIM))
 
-    # private variable for tile output
-    outval = @private eltype(output) 1
-    @inbounds outval[1] = -zero(eltype(output))
+    # variable for tile output
+    outval = -zero(eltype(output))
 
-    @uniform N = size(output, 1)
+    N = size(output, 1)
     # number of tiles depends on inner dimension
-    @uniform NUM_TILES = div(R + TILE_DIM - 1, TILE_DIM)
+    NUM_TILES = div(R + TDIM - 1, TDIM)
 
     # loop over all tiles needed for this calculation
    for t in 0:(NUM_TILES - 1)
         # Can't use @index(Global), because we use a smaller ndrange
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
         # load inputs into tiles, with bounds checking for non-square matrices
-        if I <= N && t * TILE_DIM + j <= R
-            @inbounds tile1[i, j] = input1[I, t * TILE_DIM + j]
+        if I <= N && t * TDIM + j <= R
+            @inbounds tile1[i, j] = input1[I, t * TDIM + j]
         else
             @inbounds tile1[i, j] = 0.0
         end
-        if t * TILE_DIM + i <= R && J <= M
-            @inbounds tile2[i, j] = input2[t * TILE_DIM + i, J]
+        if t * TDIM + i <= R && J <= M
+            @inbounds tile2[i, j] = input2[t * TDIM + i, J]
         else
             @inbounds tile2[i, j] = 0.0
         end
 
         # wait for all tiles to be loaded
-        @synchronize
+        KI.barrier()
 
         # get global values again
-        I = (gi - 1) * TILE_DIM + i
-        J = (gj - 1) * TILE_DIM + j
+        I = (gi - 1) * TDIM + i
+        J = (gj - 1) * TDIM + j
 
         # calculate value of spot in output, use temporary value to allow for vectorization
         out = zero(eltype(output))
-        @simd for k in 1:TILE_DIM
+        @simd for k in 1:TDIM
             @inbounds out += tile1[i, k] * tile2[k, j]
         end
-        outval[1] += out
+        outval += out
 
-        @synchronize
+        KI.barrier()
     end
 
     # get global indices again
-    I = (gi - 1) * TILE_DIM + i
-    J = (gj - 1) * TILE_DIM + j
+    I = (gi - 1) * TDIM + i
+    J = (gj - 1) * TDIM + j
 
     # save if inbounds
     if I <= N && J <= M
-        @inbounds output[I, J] = outval[1]
+        @inbounds output[I, J] = outval
     end
+    return
 end
 
 N = 1024
@@ -82,9 +80,10 @@ A = rand!(allocate(backend, Float32, N, R))
 B = rand!(allocate(backend, Float32, R, M))
 C = KernelAbstractions.zeros(backend, Float32, N, M)
 
-kern = coalesced_matmul_kernel!(backend, (TILE_DIM, TILE_DIM))
+workgroupsize = (TILE_DIM, TILE_DIM)
+numworkgroups = (cld(size(C, 1), TILE_DIM), cld(size(C, 2), TILE_DIM))
 
-kern(C, A, B, N, R, M, ndrange = size(C))
+KI.@kernel backend workgroupsize numworkgroups coalesced_matmul_kernel!(C, A, B, N, R, M, Val(TILE_DIM))
 KernelAbstractions.synchronize(backend)
 
 @test isapprox(A * B, C)
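With `ndrange` gone, the caller sizes the grid explicitly; `cld` rounds up so edge tiles that are only partially filled are still launched, and the in-kernel bounds checks mask the overhang. A small sketch of that covering arithmetic, using illustrative sizes:

```julia
TILE = 16
for n in (1024, 1000)                 # exact multiple vs. ragged edge
    groups = cld(n, TILE)             # 64 and 63 groups respectively
    @assert groups * TILE >= n        # every output element is covered
    @assert (groups - 1) * TILE < n   # and no whole group is wasted
end
```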
+""" +abstract type Backend end + +include("intrinsics.jl") +import .KernelIntrinsics: KernelIntrinsics, KI +export KernelIntrinsics, KI + ### # Kernel language # - @localmem @@ -360,6 +369,25 @@ macro context() return esc(:(__ctx__)) end +# Defined to keep cpu support for `__print` +@generated function KI._print(items...) + str = "" + args = [] + + for i in 1:length(items) + item = :(items[$i]) + T = items[i] + if T <: Val + item = QuoteNode(T.parameters[1]) + end + push!(args, item) + end + + return quote + print($(args...)) + end +end + """ @print(items...) @@ -460,13 +488,27 @@ end # Internal kernel functions ### -function __index_Local_Linear end -function __index_Group_Linear end -function __index_Global_Linear end +@inline function __index_Local_Linear(ctx) + return KI.get_local_id().x +end + +@inline function __index_Group_Linear(ctx) + return KI.get_group_id().x +end -function __index_Local_Cartesian end -function __index_Group_Cartesian end -function __index_Global_Cartesian end +@inline function __index_Global_Linear(ctx) + return KI.get_global_id().x +end + +@inline function __index_Local_Cartesian(ctx) + return @inbounds workitems(__iterspace(ctx))[KI.get_local_id().x] +end +@inline function __index_Group_Cartesian(ctx) + return @inbounds blocks(__iterspace(ctx))[KI.get_group_id().x] +end +@inline function __index_Global_Cartesian(ctx) + return @inbounds expand(__iterspace(ctx), KI.get_group_id().x, KI.get_local_id().x) +end @inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...)) @inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...)) @@ -482,11 +524,6 @@ constify(arg) = adapt(ConstAdaptor(), arg) # Backend hierarchy ### -""" - -Abstract type for all KernelAbstractions backends. -""" -abstract type Backend end """ Abstract type for all GPU based KernelAbstractions backends. @@ -796,29 +833,11 @@ include("macros.jl") ### function Scratchpad end -function SharedMemory end - -function __synchronize() - error("@synchronize used outside kernel or not captured") -end - -@generated function __print(items...) - str = "" - args = [] +SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KI.localmemory(t, dims) - for i in 1:length(items) - item = :(items[$i]) - T = items[i] - if T <: Val - item = QuoteNode(T.parameters[1]) - end - push!(args, item) - end +__synchronize() = KI.barrier() - return quote - print($(args...)) - end -end +__print(args...) = KI._print(args...) # Utils __size(args::Tuple) = Tuple{args...} diff --git a/src/intrinsics.jl b/src/intrinsics.jl new file mode 100644 index 00000000..2025bf5e --- /dev/null +++ b/src/intrinsics.jl @@ -0,0 +1,365 @@ +""" +# `KernelIntrinics`/`KI` + +The `KernelIntrinics` (or `KI`) module defines the API interface for backends to define various lower-level device and +host-side functionality. The `KI` intrinsics are used to define the higher-level device-side +intrinsics functionality in `KernelAbstractions`. + +Both provide APIs for host and device-side functionality, but `KI` focuses on on lower-level +functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality +such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions +like allocating arrays on a backend. +""" +module KernelIntrinsics + +const KI = KernelIntrinsics +export KI + +import ..KernelAbstractions: Backend +import GPUCompiler: split_kwargs, assign_args! 
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
new file mode 100644
index 00000000..2025bf5e
--- /dev/null
+++ b/src/intrinsics.jl
@@ -0,0 +1,365 @@
+"""
+# `KernelIntrinsics`/`KI`
+
+The `KernelIntrinsics` (or `KI`) module defines the API that backends implement to provide various
+lower-level device- and host-side functionality. The `KI` intrinsics are used to implement the
+higher-level device-side functionality in `KernelAbstractions`.
+
+Both provide APIs for host and device-side functionality, but `KI` focuses on lower-level
+functionality that is shared amongst backends, while `KernelAbstractions` provides higher-level functionality
+such as writing kernels that work on arrays with an arbitrary number of dimensions, or convenience functions
+like allocating arrays on a backend.
+"""
+module KernelIntrinsics
+
+const KI = KernelIntrinsics
+export KI
+
+import ..KernelAbstractions: Backend
+import GPUCompiler: split_kwargs, assign_args!
+
+"""
+    get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the number of global work-items in each dimension.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_global_size end
+
+"""
+    get_global_id()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the unique global work-item ID.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_global_id()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_global_id end
+
+"""
+    get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the number of local work-items in each dimension.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_local_size end
+
+"""
+    get_local_id()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the unique local work-item ID.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_local_id()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_local_id end
+
+"""
+    get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the number of work-groups in each dimension.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_num_groups end
+
+"""
+    get_group_id()::@NamedTuple{x::Int, y::Int, z::Int}
+
+Return the unique group ID.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_group_id()::@NamedTuple{x::Int, y::Int, z::Int}
+    ```
+"""
+function get_group_id end
+
+"""
+    localmemory(T, dims)
+
+Declare memory that is local to a workgroup.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
+    ```
+    As well as the on-device functionality.
+"""
+localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))
+
+"""
+    barrier()
+
+After a `barrier()` call, all reads and writes to global and local memory
+from each thread in the workgroup are visible to all other threads in the
+workgroup.
+
+This does **not** guarantee that a write from a thread in a certain workgroup will
+be visible to a thread in a different workgroup.
+
+!!! note
+    `barrier()` must be encountered by all workitems of a work-group executing the kernel or by none at all.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override barrier()
+    ```
+"""
+function barrier()
+    error("Group barrier used outside kernel or not captured")
+end
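`localmemory` plus `barrier` is enough for the classic staged-workgroup patterns. A minimal tree-reduction sketch, assuming a power-of-two group size passed as `Val` (the kernel is illustrative, not part of this PR):

```julia
function group_sum_kernel!(partial, x, ::Val{GS}) where {GS}
    lid = KI.get_local_id().x
    gid = KI.get_group_id().x
    scratch = KI.localmemory(Float32, GS)

    i = (gid - 1) * GS + lid
    @inbounds scratch[lid] = i <= length(x) ? x[i] : 0.0f0
    KI.barrier()                    # stage 1: all loads visible group-wide

    s = GS ÷ 2
    while s >= 1                    # every work-item runs the same trip count,
        if lid <= s                 # so the barrier below is group-uniform
            @inbounds scratch[lid] += scratch[lid + s]
        end
        KI.barrier()
        s ÷= 2
    end

    if lid == 1
        @inbounds partial[gid] = scratch[1]
    end
    return
end
```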
+""" +struct Kernel{B, Kern} + backend::B + kern::Kern +end + +""" + kernel_max_work_group_size(kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function should always be used to determine the workgroup size before +launching a kernel. + +!!! note + Backend implementations **must** implement: + ``` + kernel_max_work_group_size(kern::Kernel{<:NewBackend}; max_work_items::Int=typemax(Int))::Int + ``` + As well as the on-device functionality. +""" +function kernel_max_work_group_size end + +""" + max_work_group_size(backend, kern; [max_work_items::Int])::Int + +The maximum workgroup size limit for a kernel as reported by the backend. +This function represents a theoretical maximum; `kernel_max_work_group_size` +should be used before launching a kernel as some backends may error if +kernel launch with too big a workgroup is attempted. + +!!! note + Backend implementations **must** implement: + ``` + max_work_group_size(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +function max_work_group_size end + +""" + multiprocessor_count(backend::NewBackend)::Int + +The multiprocessor count for the current device used by `backend`. +Used for certain algorithm optimizations. + +!!! note + Backend implementations **may** implement: + ``` + multiprocessor_count(backend::NewBackend)::Int + ``` + As well as the on-device functionality. +""" +multiprocessor_count(_) = 0 + +""" + argconvert(::NewBackend, arg) + +This function is called for every argument to be passed to a kernel, +converting them to their device side representation. + +!!! note + Backend implementations **must** implement: + ``` + argconvert(::NewBackend, arg) + ``` +""" +function argconvert end + +""" + KI.kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + +Low-level interface to compile a function invocation for the currently-active GPU, returning +a callable kernel object. For a higher-level interface, use [`KI.@kernel`](@ref). + +Currently, `kernel_function` only supports the `name` keyword argument as it is the only one +by all backends. + +Keyword arguments: +- `name`: override the name that the kernel will have in the generated code + +!!! note + Backend implementations **must** implement: + ``` + kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT} + ``` +""" +function kernel_function end + +const MACRO_KWARGS = [:launch] +const COMPILER_KWARGS = [:name] +const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize] + +""" + KI.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...) + +High-level interface for executing code on a GPU. + +The `KI.@kernel` macro should prefix a call, with `func` a callable function or object that +should return nothing. It will be compiled to a function native to the specified `backend` +upon first use, and to a certain extent arguments will be converted and managed automatically +using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is +called and launched according to the specified `backend`. + +There are a few keyword arguments that influence the behavior of `KI.@kernel`: + +- `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned + kernel object should be launched by calling it and passing arguments again. +- `name`: the name of the kernel in the generated code. Defaults to an automatically- + generated name. + +!!! 
+"""
+    max_work_group_size(backend)::Int
+
+The maximum workgroup size limit supported by `backend`.
+This function represents a theoretical maximum; `kernel_max_work_group_size`
+should be used before launching a kernel, as some backends may error if a
+launch with too large a workgroup is attempted.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    max_work_group_size(backend::NewBackend)::Int
+    ```
+"""
+function max_work_group_size end
+
+"""
+    multiprocessor_count(backend)::Int
+
+The multiprocessor count for the current device used by `backend`.
+Used for certain algorithm optimizations.
+
+!!! note
+    Backend implementations **may** implement:
+    ```
+    multiprocessor_count(backend::NewBackend)::Int
+    ```
+"""
+multiprocessor_count(_) = 0
+
+"""
+    argconvert(::NewBackend, arg)
+
+This function is called for every argument to be passed to a kernel,
+converting it to its device-side representation.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    argconvert(::NewBackend, arg)
+    ```
+"""
+function argconvert end
+
+"""
+    KI.kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+
+Low-level interface to compile a function invocation for the currently-active device, returning
+a callable kernel object. For a higher-level interface, use [`KI.@kernel`](@ref).
+
+Currently, `kernel_function` only supports the `name` keyword argument, as it is the only one
+supported by all backends.
+
+Keyword arguments:
+- `name`: override the name that the kernel will have in the generated code
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    kernel_function(::NewBackend, f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
+    ```
+"""
+function kernel_function end
+
+const MACRO_KWARGS = [:launch]
+const COMPILER_KWARGS = [:name]
+const LAUNCH_KWARGS = [:numworkgroups, :workgroupsize]
+
+"""
+    KI.@kernel backend workgroupsize=... numworkgroups=... [kwargs...] func(args...)
+
+High-level interface for executing code on a GPU.
+
+The `KI.@kernel` macro should prefix a call, with `func` a callable function or object that
+should return nothing. It will be compiled to a function native to the specified `backend`
+upon first use, and to a certain extent arguments will be converted and managed automatically
+using `argconvert`. Finally, if `launch=true`, the newly created callable kernel object is
+called and launched according to the specified `backend`.
+
+There are a few keyword arguments that influence the behavior of `KI.@kernel`:
+
+- `launch`: whether to launch this kernel, defaults to `true`. If `false`, the returned
+  kernel object should be launched by calling it and passing arguments again.
+- `name`: the name of the kernel in the generated code. Defaults to an automatically-
+  generated name.
+
+!!! note
+    `KI.@kernel` differs from the `KernelAbstractions` macro of the same name: this macro acts as
+    a wrapper around backend kernel compilation/launching (such as `@cuda`, `@metal`, etc.) and is
+    used when calling a function to be run on a specific backend, while `KernelAbstractions.@kernel`
+    is used to define kernels for the original higher-level `KernelAbstractions` API.
+"""
+macro kernel(backend, ex...)
+    call = ex[end]
+    kwargs = map(ex[1:(end - 1)]) do kwarg
+        if kwarg isa Symbol
+            :($kwarg = $kwarg)
+        elseif Meta.isexpr(kwarg, :(=))
+            kwarg
+        else
+            throw(ArgumentError("Invalid keyword argument '$kwarg'"))
+        end
+    end
+
+    # destructure the kernel call
+    Meta.isexpr(call, :call) || throw(ArgumentError("final argument to KI.@kernel should be a function call"))
+    f = call.args[1]
+    args = call.args[2:end]
+
+    code = quote end
+    vars, var_exprs = assign_args!(code, args)
+
+    # group keyword arguments
+    macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs =
+        split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS)
+    if !isempty(other_kwargs)
+        key, val = first(other_kwargs).args
+        throw(ArgumentError("Unsupported keyword argument '$key'"))
+    end
+
+    # handle keyword arguments that influence the macro's behavior
+    launch = true
+    for kwarg in macro_kwargs
+        key, val = kwarg.args
+        if key === :launch
+            isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to KI.@kernel should be a Bool"))
+            launch = val::Bool
+        else
+            throw(ArgumentError("Unsupported keyword argument '$key'"))
+        end
+    end
+    if !launch && !isempty(call_kwargs)
+        error("KI.@kernel with launch=false does not support launch-time keyword arguments; use them when calling the kernel")
+    end
+
+    # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5)
+    #        we esc() the whole thing now, necessitating gensyms...
+    @gensym f_var kernel_f kernel_args kernel_tt kernel
+
+    # convert the arguments, call the compiler and launch the kernel
+    # while keeping the original arguments alive
+    push!(
+        code.args,
+        quote
+            $f_var = $f
+            GC.@preserve $(vars...) $f_var begin
+                $kernel_f = $argconvert($backend, $f_var)
+                $kernel_args = Base.map(x -> $argconvert($backend, x), ($(var_exprs...),))
+                $kernel_tt = Tuple{Base.map(Core.Typeof, $kernel_args)...}
+                $kernel = $kernel_function($backend, $kernel_f, $kernel_tt; $(compiler_kwargs...))
+                if $launch
+                    $kernel($(var_exprs...); $(call_kwargs...))
+                end
+                $kernel
+            end
+        end
+    )
+
+    return esc(
+        quote
+            let
+                $code
+            end
+        end
+    )
+end
+
+end
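Stripping the hygiene and `launch=false` handling, the macro body above amounts to the following functional model, which may help when reading the expansion (the wrapper name is hypothetical; the real macro also emits `GC.@preserve` and forwards compiler kwargs):

```julia
function ki_kernel_model(backend, f, args...; numworkgroups = nothing, workgroupsize = nothing)
    kernel_f = KI.argconvert(backend, f)
    kernel_args = map(x -> KI.argconvert(backend, x), args)
    kernel_tt = Tuple{map(Core.Typeof, kernel_args)...}
    kernel = KI.kernel_function(backend, kernel_f, kernel_tt)
    return kernel(args...; numworkgroups, workgroupsize)
end
```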
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index 87fccdb9..14fe6ae8 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -2,9 +2,10 @@ module POCLKernels
 
 using ..POCL
 using ..POCL: @device_override, cl, method_table
-using ..POCL: device
+using ..POCL: device, clconvert, clfunction
 
 import KernelAbstractions as KA
+import KernelAbstractions.KernelIntrinsics as KI
 
 import StaticArrays
 
@@ -56,7 +57,7 @@ KA.functional(::POCLBackend) = true
 KA.pagelock!(::POCLBackend, x) = nothing
 
 KA.get_backend(::Array) = POCLBackend()
-KA.synchronize(::POCLBackend) = nothing
+KA.synchronize(::POCLBackend) = cl.finish(cl.queue())
 KA.supports_float64(::POCLBackend) = true
 KA.supports_unified(::POCLBackend) = true
 
@@ -138,31 +139,66 @@ function (obj::KA.Kernel{POCLBackend})(args...; ndrange = nothing, workgroupsize
     return nothing
 end
 
+KI.argconvert(::POCLBackend, arg) = clconvert(arg)
+
+function KI.kernel_function(::POCLBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
+    kern = clfunction(f, tt; name, kwargs...)
+    return KI.Kernel{POCLBackend, typeof(kern)}(POCLBackend(), kern)
+end
+
+function (obj::KI.Kernel{POCLBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
+    local_size = StaticArrays.MVector{3}((1, 1, 1))
+    if !isnothing(workgroupsize)
+        for (i, val) in enumerate(workgroupsize)
+            local_size[i] = val
+        end
+    end
+
+    global_size = StaticArrays.MVector{3}((1, 1, 1))
+    if !isnothing(numworkgroups)
+        for (i, val) in enumerate(numworkgroups)
+            global_size[i] = val * local_size[i]
+        end
+    end
+
+    return obj.kern(args...; local_size, global_size)
+end
+
+function KI.kernel_max_work_group_size(kernel::KI.Kernel{<:POCLBackend}; max_work_items::Int = typemax(Int))::Int
+    wginfo = cl.work_group_info(kernel.kern.fun, device())
+    return Int(min(wginfo.size, max_work_items))
+end
+function KI.max_work_group_size(::POCLBackend)::Int
+    return Int(device().max_work_group_size)
+end
+function KI.multiprocessor_count(::POCLBackend)::Int
+    return Int(device().max_compute_units)
+end
 
 ## Indexing Functions
 
-@device_override @inline function KA.__index_Local_Linear(ctx)
-    return get_local_id(1)
+@device_override @inline function KI.get_local_id()
+    return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))
 end
 
-@device_override @inline function KA.__index_Group_Linear(ctx)
-    return get_group_id(1)
+@device_override @inline function KI.get_group_id()
+    return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3)))
 end
 
-@device_override @inline function KA.__index_Global_Linear(ctx)
-    return get_global_id(1)
+@device_override @inline function KI.get_global_id()
+    return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3)))
 end
 
-@device_override @inline function KA.__index_Local_Cartesian(ctx)
-    @inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
+@device_override @inline function KI.get_local_size()
    return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3)))
 end
 
-@device_override @inline function KA.__index_Group_Cartesian(ctx)
-    @inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
+@device_override @inline function KI.get_num_groups()
+    return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3)))
 end
 
-@device_override @inline function KA.__index_Global_Cartesian(ctx)
-    return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
+@device_override @inline function KI.get_global_size()
+    return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
 end
 
 @device_override @inline function KA.__validindex(ctx)
@@ -177,7 +213,7 @@ end
 
 ## Shared and Scratch Memory
 
-@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
+@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}) where {T, Dims}
     ptr = POCL.emit_localmemory(T, Val(prod(Dims)))
     CLDeviceArray(Dims, ptr)
 end
@@ -189,11 +225,11 @@ end
 
 ## Synchronization and Printing
 
-@device_override @inline function KA.__synchronize()
+@device_override @inline function KI.barrier()
     work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
 end
 
-@device_override @inline function KA.__print(args...)
+@device_override @inline function KI._print(args...)
     POCL._print(args...)
 end
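The POCL wiring above doubles as a template for other backends. A skeletal, host-side-only sketch for a hypothetical `NewBackend` (every `*_newbackend` name is a placeholder for that backend's native entry points, not real API):

```julia
import KernelAbstractions as KA
import KernelAbstractions.KernelIntrinsics as KI

struct NewBackend <: KA.Backend end

# Convert each argument to its device-side representation.
KI.argconvert(::NewBackend, arg) = convert_for_newbackend(arg)

function KI.kernel_function(::NewBackend, f::F, tt::TT = Tuple{}; name = nothing, kwargs...) where {F, TT}
    kern = compile_newbackend(f, tt; name)    # native compiler entry point
    return KI.Kernel{NewBackend, typeof(kern)}(NewBackend(), kern)
end

function (obj::KI.Kernel{NewBackend})(args...; numworkgroups = nothing, workgroupsize = nothing)
    # Translate KI's group-based launch vocabulary into the backend's native call.
    return launch_newbackend(obj.kern, args...; groups = numworkgroups, threads = workgroupsize)
end
```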
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
new file mode 100644
index 00000000..7a7ec1e0
--- /dev/null
+++ b/test/intrinsics.jl
@@ -0,0 +1,118 @@
+function test_intrinsics_kernel(results)
+    # Test all intrinsics return NamedTuples with x, y, z fields
+    global_size = KI.get_global_size()
+    global_id = KI.get_global_id()
+    local_size = KI.get_local_size()
+    local_id = KI.get_local_id()
+    num_groups = KI.get_num_groups()
+    group_id = KI.get_group_id()
+
+    if UInt32(global_id.x) <= UInt32(global_size.x)
+        results[1, global_id.x] = global_id.x
+        results[2, global_id.x] = local_id.x
+        results[3, global_id.x] = group_id.x
+        results[4, global_id.x] = global_size.x
+        results[5, global_id.x] = local_size.x
+        results[6, global_id.x] = num_groups.x
+    end
+    return
+end
+
+function intrinsics_testsuite(backend, AT)
+    @testset "KernelIntrinsics Tests" begin
+        @testset "Launch parameters" begin
+            # 1d
+            function launch_kernel1d(arr)
+                i, _, _ = KI.get_local_id()
+                gi, _, _ = KI.get_group_id()
+                ni, _, _ = KI.get_local_size()
+
+                arr[(gi - 1) * ni + i] = 1.0f0
+                return
+            end
+            arr1d = AT(zeros(Float32, 4))
+            KI.@kernel backend() numworkgroups = 2 workgroupsize = 2 launch_kernel1d(arr1d)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr1d) .== 1)
+
+            # 1d tuple
+            arr1dt = AT(zeros(Float32, 4))
+            KI.@kernel backend() numworkgroups = (2,) workgroupsize = (2,) launch_kernel1d(arr1dt)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr1dt) .== 1)
+
+            # 2d
+            function launch_kernel2d(arr)
+                i, j, _ = KI.get_local_id()
+                gi, gj, _ = KI.get_group_id()
+                ni, nj, _ = KI.get_local_size()
+
+                arr[(gi - 1) * ni + i, (gj - 1) * nj + j] = 1.0f0
+                return
+            end
+            arr2d = AT(zeros(Float32, 4, 4))
+            KI.@kernel backend() numworkgroups = (2, 2) workgroupsize = (2, 2) launch_kernel2d(arr2d)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr2d) .== 1)
+
+            # 3d
+            function launch_kernel3d(arr)
+                i, j, k = KI.get_local_id()
+                gi, gj, gk = KI.get_group_id()
+                ni, nj, nk = KI.get_local_size()
+
+                arr[(gi - 1) * ni + i, (gj - 1) * nj + j, (gk - 1) * nk + k] = 1.0f0
+                return
+            end
+            arr3d = AT(zeros(Float32, 4, 4, 4))
+            KI.@kernel backend() numworkgroups = (2, 2, 2) workgroupsize = (2, 2, 2) launch_kernel3d(arr3d)
+            KernelAbstractions.synchronize(backend())
+            @test all(Array(arr3d) .== 1)
+        end
+
+        @testset "Basic intrinsics functionality" begin
+
+            @test KI.max_work_group_size(backend()) isa Int
+            @test KI.multiprocessor_count(backend()) isa Int
+
+            # Test with small kernel
+            N = 16
+            results = AT(zeros(Int, 6, N))
+            kernel = KI.@kernel backend() launch = false test_intrinsics_kernel(results)
+
+            @test KI.kernel_max_work_group_size(kernel) isa Int
+            @test KI.kernel_max_work_group_size(kernel; max_work_items = 1) == 1
+
+            kernel(results, workgroupsize = 4, numworkgroups = 4)
+            KernelAbstractions.synchronize(backend())
+
+            host_results = Array(results)
+
+            # Verify results make sense
+            for i in 1:N
+                global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i]
+
+                # Global IDs should be 1-based and sequential
+                @test global_id_x == i
+
+                # Global size should match the total number of work-items launched
+                @test global_size_x == N
+
+                # Local size should be 4 (our workgroupsize)
+                @test local_size_x == 4
+
+                # Number of groups should be ceil(N/4) = 4
+                @test num_groups_x == 4
+
+                # Group ID should be 1-based
+                expected_group = div(i - 1, 4) + 1
+                @test group_id_x == expected_group
+
+                # Local ID should be 1-based within group
+                expected_local = ((i - 1) % 4) + 1
+                @test local_id_x == expected_local
+            end
+        end
+    end
+    return nothing
+end
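Backends that consume the shared testsuite pick these tests up through `testsuite(...)` automatically; to run just this file against a backend, something like the following should work (assuming the POCL-backed CPU path of this PR, with `CPU` and `Array` standing in for a backend constructor and its array type):

```julia
using KernelAbstractions, Test
include("testsuite.jl")

Testsuite.intrinsics_testsuite(CPU, Array)
```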
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 2418db99..31b801b3 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -1,6 +1,7 @@
 module Testsuite
 
 using ..KernelAbstractions
+import ..KernelAbstractions.KernelIntrinsics as KI
 using ..Test
 
 # We can't add test-dependencies withouth breaking backend packages
@@ -26,6 +27,7 @@ end
 
 include("test.jl")
+include("intrinsics.jl")
 include("localmem.jl")
 include("private.jl")
 include("unroll.jl")
@@ -47,6 +49,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{
         specialfunctions_testsuite(backend)
     end
 
+    @conditional_testset "Intrinsics" skip_tests begin
+        intrinsics_testsuite(backend, AT)
+    end
+
     @conditional_testset "Localmem" skip_tests begin
         localmem_testsuite(backend, AT)
     end