From 857669f2f8dbd3508399b72858886aa8928dfb2e Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 7 Nov 2025 14:50:33 -0400
Subject: [PATCH 01/12] KA 0.10

---
 Project.toml                        |  4 +-
 src/AcceleratedKernels.jl           |  1 -
 src/accumulate/accumulate_1d_gpu.jl | 77 ++++++++++++++---------------
 src/accumulate/accumulate_nd.jl     | 58 +++++++++++-----------
 src/foreachindex.jl                 | 15 +++---
 src/predicates.jl                   | 21 +++++---
 src/reduce/mapreduce_1d_gpu.jl      | 22 ++++-----
 src/reduce/mapreduce_nd.jl          | 46 ++++++++---------
 src/reduce/utilities.jl             | 20 ++++----
 src/sort/merge_sort.jl              | 34 ++++++-------
 src/sort/merge_sort_by_key.jl       | 39 ++++++++-------
 src/sort/merge_sortperm.jl          |  6 +--
 12 files changed, 171 insertions(+), 172 deletions(-)

diff --git a/Project.toml b/Project.toml
index 2fccea8..e56f795 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AcceleratedKernels"
 uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 authors = ["Andrei-Leonard Nicusan <leonard@evophase.co.uk> and contributors"]
-version = "0.4.3"
+version = "0.5"
 
 [deps]
 ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197"
@@ -19,7 +19,7 @@ AcceleratedKernelsoneAPIExt = "oneAPI"
 [compat]
 ArgCheck = "2"
 GPUArraysCore = "0.2.0"
-KernelAbstractions = "0.9.34"
+KernelAbstractions = "0.10"
 Markdown = "1"
 UnsafeAtomics = "0.3.0"
 julia = "1.10"
diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index d662c2a..d4655de 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -14,7 +14,6 @@ module AcceleratedKernels
 using ArgCheck: @argcheck
 using GPUArraysCore: AnyGPUArray, @allowscalar
 using KernelAbstractions
-using KernelAbstractions: @context
 import UnsafeAtomics
 
 
diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl
index 9202692..e1dc0f8 100644
--- a/src/accumulate/accumulate_1d_gpu.jl
+++ b/src/accumulate/accumulate_1d_gpu.jl
@@ -12,16 +12,17 @@ const ACC_FLAG_P::UInt8 = 1             # Only current block's prefix available
 end
 
 
-@kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_block!(
+function _accumulate_block!(
     op, v, init, neutral,
     inclusive,
     flags, prefixes,                # one per block
-)
+    ::Val{block_size}
+) where block_size
+    @inbounds begin
     # NOTE: shmem_size MUST be greater than 2 * block_size
     # NOTE: block_size MUST be a power of 2
     len = length(v)
-    @uniform block_size = @groupsize()[1]
-    temp = @localmem eltype(v) (0x2 * block_size + conflict_free_offset(0x2 * block_size),)
+    temp = KI.localmemory(eltype(v), 0x2 * block_size + conflict_free_offset(0x2 * block_size))
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -29,10 +30,10 @@ end
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
-    num_blocks = @ndrange()[1] ÷ block_size
+    num_blocks = KI.get_num_groups().x
     block_offset = iblock * block_size * 0x2            # Processing two elements per thread
 
     # Copy two elements from the main array; offset indices to avoid bank conflicts
@@ -59,7 +60,7 @@ end
     next_pow2 = block_size * 0x2
     d = next_pow2 >> 0x1
     while d > 0x0             # TODO: unroll this like in reduce.jl ?
-        @synchronize()
+        KI.barrier()
 
         if ithread < d
             _ai = offset * (0x2 * ithread + 0x1) - 0x1
@@ -84,7 +85,7 @@ end
     d = typeof(ithread)(1)
     while d < next_pow2
         offset = offset >> 0x1
-        @synchronize()
+        KI.barrier()
 
         if ithread < d
             _ai = offset * (0x2 * ithread + 0x1) - 0x1
@@ -103,10 +104,10 @@ end
     # Later blocks should always be inclusively-scanned
     if inclusive || iblock != 0x0
         # To compute an inclusive scan, shift elements left...
-        @synchronize()
+        KI.barrier()
         t1 = temp[ai + bank_offset_a + 0x1]
         t2 = temp[bi + bank_offset_b + 0x1]
-        @synchronize()
+        KI.barrier()
 
         if ai > 0x0
             temp[ai - 0x1 + conflict_free_offset(ai - 0x1) + 0x1] = t1
@@ -123,7 +124,7 @@ end
         end
     end
 
-    @synchronize()
+    KI.barrier()
 
     # Write this block's final prefix to global array and set flag to "block prefix computed"
     if bi == 0x2 * block_size - 0x1
@@ -145,15 +146,16 @@ end
     if block_offset + bi < len
         v[block_offset + bi + 0x1] = temp[bi + bank_offset_b + 0x1]
     end
+    end
+    nothing
 end
 
 
-@kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous!(
-    op, v, flags, @Const(prefixes),
-)
-
+function _accumulate_previous!(
+    op, v, flags, prefixes, ::Val{block_size}
+) where block_size
+    @inbounds begin
     len = length(v)
-    block_size = @groupsize()[1]
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -161,8 +163,8 @@ end
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1 + 0x1              # Skipping first block
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1 + 0x1              # Skipping first block
+    ithread = KI.get_local_id().x - 0x1
     block_offset = iblock * block_size * 0x2                # Processing two elements per thread
 
     # Each block looks back to find running prefix sum
@@ -197,7 +199,7 @@ end
     # There are two synchronization concerns here:
     # 1. Withing a group we want to ensure that all writed to `v` have occured before setting the flag.
     # 2. Between groups we need to use a fence and atomic load/store to ensure that memory operations are not re-ordered
-    @synchronize() # within-block
+    KI.barrier() # within-block
     # Note: This fence is needed to ensure that the flag is not set before copying into v.
     #       See https://doc.rust-lang.org/std/sync/atomic/fn.fence.html
     #       for more details.
@@ -206,15 +208,17 @@ end
     if ithread == 0x0
         UnsafeAtomics.store!(pointer(flags, iblock + 0x1), convert(eltype(flags), ACC_FLAG_A), UnsafeAtomics.monotonic)
     end
+    end
+    nothing
 end
 
 
-@kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous_coupled_preblocks!(
-    op, v, prefixes,
-)
+function _accumulate_previous_coupled_preblocks!(
+    op, v, prefixes, ::Val{block_size}
+) where block_size
+    @inbounds begin
     # No decoupled lookback
     len = length(v)
-    block_size = @groupsize()[1]
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -222,8 +226,8 @@ end
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1 + 0x1              # Skipping first block
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1 + 0x1              # Skipping first block
+    ithread = KI.get_local_id().x - 0x1
     block_offset = iblock * block_size * 0x2                # Processing two elements per thread
 
     # Each block looks back to find running prefix sum
@@ -250,6 +254,8 @@ end
     if block_offset + bi < len
         v[block_offset + bi + 0x1] = op(running_prefix, v[block_offset + bi + 0x1])
     end
+    end
+    nothing
 end
 
 
@@ -298,14 +304,10 @@ function accumulate_1d_gpu!(
         flags = temp_flags
     end
 
-    kernel1! = _accumulate_block!(backend, block_size)
-    kernel1!(op, v, init, neutral, inclusive, flags, prefixes,
-             ndrange=num_blocks * block_size)
+    KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks _accumulate_block!(op, v, init, neutral, inclusive, flags, prefixes, Val(block_size))
 
     if num_blocks > 1
-        kernel2! = _accumulate_previous!(backend, block_size)
-        kernel2!(op, v, flags, prefixes,
-                 ndrange=(num_blocks - 1) * block_size)
+        KI.@kernel backend workgroupsize=block_size numworkgroups=(num_blocks-1) _accumulate_previous!(op, v, flags, prefixes, Val(block_size))
     end
 
     return v
@@ -349,22 +351,17 @@ function accumulate_1d_gpu!(
         prefixes = temp
     end
 
-    kernel1! = _accumulate_block!(backend, block_size)
-    kernel1!(op, v, init, neutral, inclusive, nothing, prefixes,
-             ndrange=num_blocks * block_size)
+    KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks _accumulate_block!(op, v, init, neutral, inclusive, nothing, prefixes, Val(block_size))
 
     if num_blocks > 1
 
         # Accumulate prefixes of all blocks; use neutral as init here to not reinclude init
         num_blocks_prefixes = (length(prefixes) + elems_per_block - 1) ÷ elems_per_block
-        kernel1!(op, prefixes, neutral, neutral, true, nothing, nothing,
-                 ndrange=num_blocks_prefixes * block_size)
+        KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks_prefixes _accumulate_block!(op, prefixes, neutral, neutral, true, nothing, nothing, Val(block_size))
 
         # Prefixes are pre-accumulated (completely accumulated if num_blocks_prefixes == 1, or
         # partially, which we will account for in the coupled lookback)
-        kernel2! = _accumulate_previous_coupled_preblocks!(backend, block_size)
-        kernel2!(op, v, prefixes,
-                 ndrange=(num_blocks - 1) * block_size)
+        KI.@kernel backend workgroupsize=block_size numworkgroups=(num_blocks-1) _accumulate_previous_coupled_preblocks!(op, v, prefixes, Val(block_size))
     end
 
     return v
diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl
index 83b210d..d045948 100644
--- a/src/accumulate/accumulate_nd.jl
+++ b/src/accumulate/accumulate_nd.jl
@@ -51,18 +51,14 @@ function accumulate_nd!(
         if length_outer >= length_dims
             # One thread per outer dimension
             blocks = (length_outer + block_size - 1) ÷ block_size
-            kernel1! = _accumulate_nd_by_thread!(backend, block_size)
-            kernel1!(
-                v, op, init, dims, inclusive,
-                ndrange=(block_size * blocks,),
+            KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_thread!(
+                v, op, init, dims, inclusive, Val(block_size)
             )
         else
             # One block per outer dimension
             blocks = length_outer
-            kernel2! = _accumulate_nd_by_block!(backend, block_size)
-            kernel2!(
-                v, op, init, neutral, dims, inclusive,
-                ndrange=(block_size, blocks),
+            KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_block!(
+                v, op, init, neutral, dims, inclusive, Val(block_size)
             )
         end
     end
@@ -121,9 +117,11 @@ function _accumulate_nd_cpu_sections!(
 end
 
 
-@kernel inbounds=true cpu=false unsafe_indices=true function _accumulate_nd_by_thread!(
+function _accumulate_nd_by_thread!(
     v, op, init, dims, inclusive,
-)
+    ::Val{block_size}
+) where block_size
+    @inbounds begin
     # One thread per outer dimension element, when there are more outer elements than in the
     # reduced dim e.g. accumulate(+, rand(3, 1000), dims=1) => only 3 elements in the accumulated
     # dim
@@ -135,16 +133,14 @@ end
     length_dims = vsizes[dims]
     length_outer = length(v) ÷ length_dims
 
-    block_size = @groupsize()[1]
-
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
     # indexing). Internal calculations will be done using zero indexing except when actually
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
     # Each thread handles one outer element
     tid = ithread + iblock * block_size
@@ -178,12 +174,16 @@ end
             end
         end
     end
+    end
+    nothing
 end
 
 
-@kernel inbounds=true cpu=false unsafe_indices=true function _accumulate_nd_by_block!(
+function _accumulate_nd_by_block!(
     v, op, init, neutral, dims, inclusive,
-)
+    ::Val{block_size}
+) where block_size
+    @inbounds begin
     # NOTE: shmem_size MUST be greater than 2 * block_size
     # NOTE: block_size MUST be a power of 2
 
@@ -198,10 +198,8 @@ end
     length_dims = vsizes[dims]
     length_outer = length(v) ÷ length_dims
 
-    @uniform block_size = @groupsize()[1]
-
-    temp = @localmem eltype(v) (0x2 * block_size + conflict_free_offset(0x2 * block_size),)
-    running_prefix = @localmem eltype(v) (1,)
+    temp = KI.localmemory(eltype(v), 0x2 * block_size + conflict_free_offset(0x2 * block_size))
+    running_prefix = KI.localmemory(eltype(v), 1)
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -209,8 +207,8 @@ end
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
     # Each block handles one outer element; guaranteed to have exact number of blocks, so no need
     # for `if iblock < length_outer`
@@ -234,7 +232,7 @@ end
     if ithread == 0x0
         running_prefix[0x1] = neutral
     end
-    @synchronize()
+    KI.barrier()
 
     while ichunk < num_chunks
         block_offset = ichunk * block_size * 0x2            # Processing two elements per thread
@@ -271,7 +269,7 @@ end
         next_pow2 = block_size * 0x2
         d = next_pow2 >> 0x1
         while d > 0x0             # TODO: unroll this like in reduce.jl ?
-            @synchronize()
+            KI.barrier()
 
             if ithread < d
                 _ai = offset * (0x2 * ithread + 0x1) - 0x1
@@ -296,7 +294,7 @@ end
         d = typeof(ithread)(1)
         while d < next_pow2
             offset = offset >> 0x1
-            @synchronize()
+            KI.barrier()
 
             if ithread < d
                 _ai = offset * (0x2 * ithread + 0x1) - 0x1
@@ -315,10 +313,10 @@ end
         # Later blocks should always be inclusively-scanned
         if inclusive || ichunk != 0x0
             # To compute an inclusive scan, shift elements left...
-            @synchronize()
+            KI.barrier()
             t1 = temp[ai + bank_offset_a + 0x1]
             t2 = temp[bi + bank_offset_b + 0x1]
-            @synchronize()
+            KI.barrier()
 
             if ai > 0x0
                 temp[ai - 0x1 + conflict_free_offset(ai - 0x1) + 0x1] = t1
@@ -344,7 +342,7 @@ end
         end
 
         _running_prefix = running_prefix[0x1]
-        @synchronize()
+        KI.barrier()
 
         if block_offset + ai < length_dims
             total = op(_running_prefix, temp[ai + bank_offset_a + 0x1])
@@ -367,8 +365,10 @@ end
         if bi == 0x2 * block_size - 0x1
             running_prefix[0x1] = total
         end
-        @synchronize()
+        KI.barrier()
 
         ichunk += 0x1
     end
+    end
+    nothing
 end
diff --git a/src/foreachindex.jl b/src/foreachindex.jl
index 1f8aa47..b6409b6 100644
--- a/src/foreachindex.jl
+++ b/src/foreachindex.jl
@@ -1,14 +1,16 @@
-@kernel inbounds=true cpu=false unsafe_indices=true function _forindices_global!(f, indices)
+function _forindices_global!(f, indices, ::Val{N}) where N
 
     # Calculate global index
-    N = @groupsize()[1]
-    iblock = @index(Group, Linear)
-    ithread = @index(Local, Linear)
+    iblock = KI.get_group_id().x
+    ithread = KI.get_local_id().x
     i = ithread + (iblock - 0x1) * N
+    # i = get_global_id().x
+
 
     if i <= length(indices)
         f(indices[i])
     end
+    nothing
 end
 
 
@@ -21,9 +23,8 @@ function _forindices_gpu(
 )
     # GPU implementation
     @argcheck block_size > 0
-    blocks = (length(indices) + block_size - 1) ÷ block_size
-    _forindices_global!(backend, block_size)(f, indices, ndrange=(block_size * blocks,))
-    nothing
+    blocks = max((length(indices) + block_size - 1) ÷ block_size, 1)
+    KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _forindices_global!(f, indices, Val(block_size))
 end
 
 
diff --git a/src/predicates.jl b/src/predicates.jl
index cfd63f8..e8576b4 100644
--- a/src/predicates.jl
+++ b/src/predicates.jl
@@ -6,26 +6,27 @@ Base.@kwdef struct MapReduce{T <: Union{Nothing, AbstractArray}} <: PredicatesAl
 end
 
 
-@kernel cpu=false inbounds=true function _any_global!(out, pred, @Const(v))
-    temp = @localmem Int8 (1,)
-    i = @index(Global, Linear)
+function _any_global!(out, pred, v)
+    temp = KI.localmemory(Int8, 1)
+    i = KI.get_global_id().x
 
     # Technically this is a race, but it doesn't matter as all threads would write the same value.
     # For example, CUDA F4.2 says "If a non-atomic instruction executed by a warp writes to the
     # same location in global memory for more than one of the threads of the warp, only one thread
     # performs a write and which thread does it is undefined."
     temp[0x1] = 0x0
-    @synchronize()
+    KI.barrier()
 
     # The ndrange check already protects us from out of bounds access
-    if pred(v[i])
+    if i <= length(v) && pred(v[i])
         temp[0x1] = 0x1
     end
 
-    @synchronize()
+    KI.barrier()
     if temp[0x1] != 0x0
         out[0x1] = 0x1
     end
+    nothing
 end
 
 
@@ -127,7 +128,9 @@ function _any_impl(
         # CUDA). If not cooperative, we need to do a mapreduce
         if alg === ConcurrentWrite()
             out = KernelAbstractions.zeros(backend, Int8, 1)
-            _any_global!(backend, block_size)(out, pred, v, ndrange=length(v))
+            workgroupsize = min(length(v), block_size)
+            numworkgroups = cld(length(v), workgroupsize)
+            KI.@kernel backend numworkgroups workgroupsize _any_global!(out, pred, v)
             outh = @allowscalar(out[1])
             return outh == 0 ? false : true
         else
@@ -261,7 +264,9 @@ function _all_impl(
         # CUDA). If not cooperative, we need to do a mapreduce
         if alg === ConcurrentWrite()
             out = KernelAbstractions.zeros(backend, Int8, 1)
-            _any_global!(backend, block_size)(out, (!pred), v, ndrange=length(v))
+            workgroupsize = min(length(v), block_size)
+            numworkgroups = cld(length(v), workgroupsize)
+            KI.@kernel backend numworkgroups workgroupsize _any_global!(out, (!pred), v)
             outh = @allowscalar(out[1])
             return outh == 0 ? true : false
         else
diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl
index 39e7c41..bb0a225 100644
--- a/src/reduce/mapreduce_1d_gpu.jl
+++ b/src/reduce/mapreduce_1d_gpu.jl
@@ -1,7 +1,6 @@
-@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_block!(@Const(src), dst, f, op, neutral)
-
-    @uniform N = @groupsize()[1]
-    sdata = @localmem eltype(dst) (N,)
+function _mapreduce_block!(src, dst, f, op, neutral, ::Val{N}) where N
+    @inbounds begin
+    sdata = KI.localmemory(eltype(dst), N)
 
     len = length(src)
 
@@ -11,8 +10,8 @@
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
     i = ithread + iblock * (N * 0x2)
     if i >= len
@@ -23,9 +22,9 @@
         sdata[ithread + 0x1] = op(f(src[i + 0x1]), f(src[i + N + 0x1]))
     end
 
-    @synchronize()
+    KI.barrier()
 
-    @inline reduce_group!(@context, op, sdata, N, ithread)
+    @inline reduce_group!(op, sdata, N, ithread)
 
     # Code below would work on NVidia GPUs with warp size of 32, but create race conditions and
     # return incorrect results on Intel Graphics. It would be useful to have a way to statically
@@ -43,6 +42,8 @@
     if ithread == 0x0
         dst[iblock + 0x1] = sdata[0x1]
     end
+    end
+    nothing
 end
 
 
@@ -91,8 +92,7 @@ function mapreduce_1d_gpu(
     src_view = @view src[1:end]
     dst_view = @view dst[1:blocks]
 
-    kernel! = _mapreduce_block!(backend, block_size)
-    kernel!(src_view, dst_view, f, op, neutral, ndrange=(block_size * blocks,))
+    KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_block!(src_view, dst_view, f, op, neutral, Val(block_size))
 
     # As long as we still have blocks to process, swap between the src and dst pointers at
     # the beginning of the first and second halves of dst
@@ -110,7 +110,7 @@ function mapreduce_1d_gpu(
         blocks = (len + num_per_block - 1) ÷ num_per_block
 
         # Each block produces one reduced value
-        kernel!(p1, p2, identity, op, neutral, ndrange=(block_size * blocks,))
+        KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_block!(p1, p2, identity, op, neutral, Val(block_size))
         len = blocks
 
         if len < switch_below
diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl
index 231d0dc..71fad0d 100644
--- a/src/reduce/mapreduce_nd.jl
+++ b/src/reduce/mapreduce_nd.jl
@@ -131,18 +131,14 @@ function mapreduce_nd(
         #     while the other dimensions are processed in parallel, independently
         if dst_size >= src_sizes[dims]
             blocks = (dst_size + block_size - 1) ÷ block_size
-            kernel1! = _mapreduce_nd_by_thread!(backend, block_size)
-            kernel1!(
-                src, dst, f, op, init, dims,
-                ndrange=(block_size * blocks,),
+            KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_thread!(
+                src, dst, f, op, init, dims, Val(block_size)
             )
         else
             # One block per output element
             blocks = dst_size
-            kernel2! = _mapreduce_nd_by_block!(backend, block_size)
-            kernel2!(
-                src, dst, f, op, init, neutral, dims,
-                ndrange=(block_size * blocks,),
+            KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_block!(
+                src, dst, f, op, init, neutral, dims, Val(block_size)
             )
         end
     end
@@ -193,11 +189,12 @@ function _mapreduce_nd_cpu_sections!(
 end
 
 
-@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_thread!(
-    @Const(src), dst,
+function _mapreduce_nd_by_thread!(
+    src, dst,
     f, op,
     init, dims,
-)
+    ::Val{N}
+) where N
     # One thread per output element, when there are more outer elements than in the reduced dim
     # e.g. reduce(+, rand(3, 1000), dims=1) => only 3 elements in the reduced dim
     src_sizes = size(src)
@@ -210,16 +207,14 @@ end
 
     ndims = length(src_sizes)
 
-    N = @groupsize()[1]
-
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
     # indexing). Internal calculations will be done using zero indexing except when actually
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
     # Each thread handles one output element
     tid = ithread + iblock * N
@@ -259,15 +254,16 @@ end
         end
         dst[tid + 0x1] = res
     end
+    nothing
 end
 
 
-@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_block!(
-    @Const(src), dst,
+function _mapreduce_nd_by_block!(
+    src, dst,
     f, op,
     init, neutral,
-    dims,
-)
+    dims, ::Val{N}
+) where N
     # One block per output element, when there are more elements in the reduced dim than in outer
     # e.g. reduce(+, rand(3, 1000), dims=2) => only 3 elements in outer dimensions
     src_sizes = size(src)
@@ -280,8 +276,7 @@ end
 
     ndims = length(src_sizes)
 
-    @uniform N = @groupsize()[1]
-    sdata = @localmem eltype(dst) (N,)
+    sdata = KI.localmemory(eltype(dst), N)
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -289,8 +284,8 @@ end
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
     # Each block handles one output element - thus, iblock ∈ [0, output_size)
 
@@ -330,11 +325,12 @@ end
 
     # Store partial result in shared memory; now we are down to a single block to reduce within
     sdata[ithread + 0x1] = partial
-    @synchronize()
+    KI.barrier()
 
-    @inline reduce_group!(@context, op, sdata, N, ithread)
+    @inline reduce_group!(op, sdata, N, ithread)
 
     if ithread == 0x0
         dst[iblock + 0x1] = op(init, sdata[0x1])
     end
+    nothing
 end
diff --git a/src/reduce/utilities.jl b/src/reduce/utilities.jl
index 48f387e..ad099a5 100644
--- a/src/reduce/utilities.jl
+++ b/src/reduce/utilities.jl
@@ -44,59 +44,59 @@ function _mapreduce_nd_apply_init!(
     end
 end
 
-@inline function reduce_group!(@context, op, sdata, N, ithread)
+@inline function reduce_group!(op, sdata, N, ithread)
     if N >= 512u16
         if ithread < 256u16
             sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 256u16 + 0x1])
         end
-        @synchronize()
+        KI.barrier()
     end
     if N >= 256u16
         if ithread < 128u16
             sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 128u16 + 0x1])
         end
-        @synchronize()
+        KI.barrier()
     end
     if N >= 128u16
         if ithread < 64u16
             sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 64u16 + 0x1])
         end
-        @synchronize()
+        KI.barrier()
     end
     if N >= 64u16
         if ithread < 32u16
             sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 32u16 + 0x1])
         end
-        @synchronize()
+        KI.barrier()
     end
     if N >= 32u16
         if ithread < 16u16
             sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 16u16 + 0x1])
         end
-        @synchronize()
+        KI.barrier()
     end
     if N >= 16u16
         if ithread < 8u16
             sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 8u16 + 0x1])
         end
-        @synchronize()
+        KI.barrier()
     end
     if N >= 8u16
         if ithread < 4u16
             sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 4u16 + 0x1])
         end
-        @synchronize()
+        KI.barrier()
     end
     if N >= 4u16
         if ithread < 2u16
             sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 2u16 + 0x1])
         end
-        @synchronize()
+        KI.barrier()
     end
     if N >= 2u16
         if ithread < 1u16
             sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 1u16 + 0x1])
         end
-        @synchronize()
+        KI.barrier()
     end
 end
diff --git a/src/sort/merge_sort.jl b/src/sort/merge_sort.jl
index 5fb7b20..f348542 100644
--- a/src/sort/merge_sort.jl
+++ b/src/sort/merge_sort.jl
@@ -1,7 +1,6 @@
-@kernel inbounds=true cpu=false unsafe_indices=true function _merge_sort_block!(vec, comp)
+function _merge_sort_block!(vec, comp, ::Val{N}) where N
 
-    @uniform N = @groupsize()[1]
-    s_buf = @localmem eltype(vec) (N * 0x2,)
+    s_buf = KI.localmemory(eltype(vec), N * 0x2)
 
     T = eltype(vec)
     I = typeof(N)
@@ -13,8 +12,8 @@
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
     i = ithread + iblock * N * 0x2
     i < len && (s_buf[ithread + 0x1] = vec[i + 0x1])
@@ -22,7 +21,7 @@
     i = ithread + N + iblock * N * 0x2
     i < len && (s_buf[ithread + N + 0x1] = vec[i + 0x1])
 
-    @synchronize()
+    KI.barrier()
 
     half_size_group = typeof(ithread)(1)
     size_group = typeof(ithread)(2)
@@ -56,12 +55,12 @@
             pos2 = ithread % half_size_group + _upper_bound_s0(s_buf, v2, lo, hi, comp) - lo
         end
 
-        @synchronize()
+        KI.barrier()
 
         pos1 != typemax(I) && (s_buf[gid * size_group + pos1 + 0x1] = v1)
         pos2 != typemax(I) && (s_buf[gid * size_group + pos2 + 0x1] = v2)
 
-        @synchronize()
+        KI.barrier()
 
         half_size_group = half_size_group << 0x1
         size_group = size_group << 0x1
@@ -72,14 +71,13 @@
 
     i = ithread + N + iblock * N * 0x2
     i < len && (vec[i + 0x1] = s_buf[ithread + N + 0x1])
+    nothing
 end
 
 
-@kernel inbounds=true cpu=false unsafe_indices=true function _merge_sort_global!(
-    @Const(vec_in), vec_out, comp, half_size_group,
-)
+function _merge_sort_global!(vec_in, vec_out, comp, half_size_group, ::Val{N}) where N
+    @inbounds begin
     len = length(vec_in)
-    N = @groupsize()[1]
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -87,8 +85,8 @@ end
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
     idx = ithread + iblock * N
     size_group = half_size_group * 0x2
@@ -120,6 +118,8 @@ end
             vec_out[pos_out + 0x1] = vec_in[pos_in + 0x1]
         end
     end
+    end
+    nothing
 end
 
 
@@ -160,7 +160,7 @@ function merge_sort!(
 
     # Block level
     blocks = (length(v) + block_size * 2 - 1) ÷ (block_size * 2)
-    _merge_sort_block!(backend, block_size)(v, comp, ndrange=(block_size * blocks,))
+    KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _merge_sort_block!(v, comp, Val(block_size))
 
     # Global level
     half_size_group = Int32(block_size * 2)
@@ -170,12 +170,12 @@ function merge_sort!(
         p1 = v
         p2 = isnothing(temp) ? similar(v) : temp
 
-        kernel! = _merge_sort_global!(backend, block_size)
+        kernel! = KI.@kernel backend launch = false _merge_sort_global!(p1, p2, comp, half_size_group, Val(block_size))
 
         niter = 0
         while len > half_size_group
             blocks = ((len + half_size_group - 1) ÷ half_size_group + 1) ÷ 2 * (half_size_group ÷ block_size)
-            kernel!(p1, p2, comp, half_size_group, ndrange=(block_size * blocks,))
+            kernel!(p1, p2, comp, half_size_group, Val(block_size); workgroupsize=block_size, numworkgroups = blocks)
 
             half_size_group = half_size_group << 1;
             size_group = size_group << 1;
diff --git a/src/sort/merge_sort_by_key.jl b/src/sort/merge_sort_by_key.jl
index f6de5f3..418e30f 100644
--- a/src/sort/merge_sort_by_key.jl
+++ b/src/sort/merge_sort_by_key.jl
@@ -1,8 +1,7 @@
-@kernel inbounds=true cpu=false unsafe_indices=true function _merge_sort_by_key_block!(keys, values, comp)
+function _merge_sort_by_key_block!(keys, values, comp, ::Val{N}) where N
 
-    @uniform N = @groupsize()[1]
-    s_keys = @localmem eltype(keys) (N * 0x2,)
-    s_values = @localmem eltype(values) (N * 0x2,)
+    s_keys = KI.localmemory(eltype(keys), N * 0x2)
+    s_values = KI.localmemory(eltype(values), N * 0x2)
 
     I = typeof(N)
     len = length(keys)
@@ -13,8 +12,8 @@
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
     i = ithread + iblock * N * 0x2
     if i < len
@@ -28,7 +27,7 @@
         s_values[ithread + N + 0x1] = values[i + 0x1]
     end
 
-    @synchronize()
+    KI.barrier()
 
     half_size_group = typeof(ithread)(1)
     size_group = typeof(ithread)(2)
@@ -66,7 +65,7 @@
             pos2 = ithread % half_size_group + _upper_bound_s0(s_keys, k2, lo, hi, comp) - lo
         end
 
-        @synchronize()
+        KI.barrier()
 
         if pos1 != typemax(I)
             s_keys[gid * size_group + pos1 + 0x1] = k1
@@ -77,7 +76,7 @@
             s_values[gid * size_group + pos2 + 0x1] = v2
         end
 
-        @synchronize()
+        KI.barrier()
 
         half_size_group = half_size_group << 0x1
         size_group = size_group << 0x1
@@ -94,17 +93,18 @@
         keys[i + 0x1] = s_keys[ithread + N + 0x1]
         values[i + 0x1] = s_values[ithread + N + 0x1]
     end
+    nothing
 end
 
 
-@kernel inbounds=true cpu=false unsafe_indices=true function _merge_sort_by_key_global!(
-    @Const(keys_in), keys_out,
-    @Const(values_in), values_out,
+function _merge_sort_by_key_global!(
+    keys_in, keys_out,
+    values_in, values_out,
     comp, half_size_group,
-)
+    ::Val{N}
+) where N
 
     len = length(keys_in)
-    N = @groupsize()[1]
 
     # NOTE: for many index calculations in this library, computation using zero-indexing leads to
     # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero
@@ -112,8 +112,8 @@ end
     # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive.
 
     # Group (block) and local (thread) indices
-    iblock = @index(Group, Linear) - 0x1
-    ithread = @index(Local, Linear) - 0x1
+    iblock = KI.get_group_id().x - 0x1
+    ithread = KI.get_local_id().x - 0x1
 
     idx = ithread + iblock * N
     size_group = half_size_group * 0x2
@@ -150,6 +150,7 @@ end
             values_out[pos_out + 0x1] = values_in[pos_in + 0x1]
         end
     end
+    nothing
 end
 
 
@@ -201,7 +202,7 @@ function merge_sort_by_key!(
 
     # Block level
     blocks = (length(keys) + block_size * 2 - 1) ÷ (block_size * 2)
-    _merge_sort_by_key_block!(backend, block_size)(keys, values, comp, ndrange=(block_size * blocks,))
+    KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _merge_sort_by_key_block!(keys, values, comp, Val(block_size))
 
     # Global level
     half_size_group = Int32(block_size * 2)
@@ -214,12 +215,12 @@ function merge_sort_by_key!(
         pv1 = values
         pv2 = isnothing(temp_values) ? similar(values) : temp_values
 
-        kernel! = _merge_sort_by_key_global!(backend, block_size)
+        kernel! = KI.@kernel backend launch = false _merge_sort_by_key_global!(pk1, pk2, pv1, pv2, comp, half_size_group, Val(block_size))
 
         niter = 0
         while len > half_size_group
             blocks = ((len + half_size_group - 1) ÷ half_size_group + 1) ÷ 2 * (half_size_group ÷ block_size)
-            kernel!(pk1, pk2, pv1, pv2, comp, half_size_group, ndrange=(block_size * blocks,))
+            kernel!(pk1, pk2, pv1, pv2, comp, half_size_group, Val(block_size); workgroupsize=block_size, numworkgroups=blocks)
 
             half_size_group = half_size_group << 1;
             size_group = size_group << 1;
diff --git a/src/sort/merge_sortperm.jl b/src/sort/merge_sortperm.jl
index 6b97061..bff4593 100644
--- a/src/sort/merge_sortperm.jl
+++ b/src/sort/merge_sortperm.jl
@@ -133,7 +133,7 @@ function merge_sortperm_lowmem!(
 
     # Block level
     blocks = (length(ix) + block_size * 2 - 1) ÷ (block_size * 2)
-    _merge_sort_block!(backend, block_size)(ix, comp, ndrange=(block_size * blocks,))
+    KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _merge_sort_block!(ix, comp, Val(block_size))
 
     # Global level
     half_size_group = Int32(block_size * 2)
@@ -143,12 +143,12 @@ function merge_sortperm_lowmem!(
         p1 = ix
         p2 = isnothing(temp) ? similar(ix) : temp
 
-        kernel! = _merge_sort_global!(backend, block_size)
+        kernel! = KI.@kernel backend launch = false _merge_sort_global!(p1, p2, comp, half_size_group, Val(block_size))
 
         niter = 0
         while len > half_size_group
             blocks = ((len + half_size_group - 1) ÷ half_size_group + 1) ÷ 2 * (half_size_group ÷ block_size)
-            kernel!(p1, p2, comp, half_size_group, ndrange=(block_size * blocks,))
+            kernel!(p1, p2, comp, half_size_group; workgroupsize=block_size, numworkgroups=blocks)
 
             half_size_group = half_size_group << 1;
             size_group = size_group << 1;

From d8bdc95642dd67e6fdaec47593c72b4d0b81accb Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 14 Nov 2025 12:19:16 -0400
Subject: [PATCH 02/12] [Temp (maybe)] benchmarks improvements

---
 benchmark/benchmark_graphs_nb.jl | 513 +++++++++++++++++--------------
 benchmark/runbenchmarks.jl       |  11 +-
 2 files changed, 293 insertions(+), 231 deletions(-)

diff --git a/benchmark/benchmark_graphs_nb.jl b/benchmark/benchmark_graphs_nb.jl
index 5beb8b6..6ee481a 100644
--- a/benchmark/benchmark_graphs_nb.jl
+++ b/benchmark/benchmark_graphs_nb.jl
@@ -1,5 +1,5 @@
 ### A Pluto.jl notebook ###
-# v0.20.13
+# v0.20.21
 
 using Markdown
 using InteractiveUtils
@@ -27,6 +27,12 @@ md"""
 """
 end
 
+# ╔═╡ 8601e5de-180c-45b5-b0c6-1f8d807df6d0
+
+
+# ╔═╡ e27f7b92-79f2-4351-bbc5-46d6e5a9fd67
+
+
 # ╔═╡ 0f77fb5f-e894-43e4-94f5-4ed93af7ba9b
 begin
 	function plot_benches(df, cat, t; ylabel="Time (ns)")
@@ -53,11 +59,35 @@ begin
 			push!(df, (Category=vals[1], T=vals[2],Bench=vals[3],Time=res[2]["time"]))
 		end
 	end
-	benchresults = JSON.parsefile("benchmarkresults.json")
-	benchres_df = getbenches(benchresults)
-	benchres_df.alg  .= [startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK") for x in benchres_df.Bench]
-	benchres_df.test .= [x[6:end] for x in benchres_df.Bench]
-	sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
+	make_res_df(file) = make_res_df(x -> startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK"), file)
+	function make_res_df(alg_f, file)
+		benchresults = JSON.parsefile(file)
+		benchres_df = getbenches(benchresults)
+		benchres_df.alg  .= [alg_f(x) for x in benchres_df.Bench]
+		benchres_df.test .= [x[6:end] for x in benchres_df.Bench]
+		
+		sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
+		return benchres_df
+	end
+end
+
+# ╔═╡ 002b7672-9431-4510-ba89-84be098a2f9f
+# begin
+# 	benchresultspre = JSON.parsefile("benchmarkresultsstd.json")
+# 	benchrespre_df = getbenches(benchresultspre)
+# 	benchrespre_df.alg  .= [startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK") for x in benchrespre_df.Bench]
+# 	benchrespre_df.test .= [x[6:end] for x in benchrespre_df.Bench]
+
+# 	benchrespre_df.alg  .= [x == Symbol("1Base") ? Symbol("2Base") : Symbol("4AccK") for x in benchrespre_df.alg]
+	
+# 	sort!(benchrespre_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
+# end
+
+# ╔═╡ b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2
+begin
+	benchrespre_df = make_res_df(x->startswith(x, "base") ? Symbol("1BasePre") : Symbol("4AccKPre"), "benchmarkresultspre.json")
+	benchres_df = make_res_df(x->startswith(x, "base") ? Symbol("2Base") : Symbol("5AccK"), "benchmarkresultsnew.json")
+	benchresi32_df = make_res_df(x->startswith(x, "base") ? Symbol("3BaseI32") : Symbol("6AccKI32"), "benchmarkresults.json")
 end
 
 # ╔═╡ d4accca6-f650-453c-bb75-a8e4cac568c1
@@ -73,6 +103,38 @@ Type: $(@bind typ Select(unique(benchres_df.T);))
 # ╔═╡ 3cd5fd0a-6f16-4cb3-87ba-43b86224b81c
 plot_benches(benchres_df, cat, typ)
 
+# ╔═╡ cde9391a-44dd-49ee-8730-4b9ad58c3d90
+plot_benches([benchres_df;benchrespre_df;benchresi32_df], cat, typ)
+
+# ╔═╡ 4681accf-eaed-47e0-9d26-ab968df83c8a
+# begin
+# make_res_df(file) = make_res_df(x -> startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK"), file)
+# function make_res_df(alg_f, file)
+# 	benchresults = JSON.parsefile("benchmarkresultsstd.json")
+# 	benchres_df = getbenches(benchresults)
+# 	benchres_df.alg  .= [alg_f(x) for x in benchres_df.Bench]
+# 	benchres_df.test .= [x[6:end] for x in benchres_df.Bench]
+	
+# 	sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
+# 	return benchres_df
+# end
+# end
+
+# ╔═╡ 7a3a4783-8424-488d-8122-1d65680703ac
+# begin
+# 	benchresultspre = JSON.parsefile("benchmarkresultsstd.json")
+# 	benchrespre_df = getbenches(benchresultspre)
+# 	benchrespre_df.alg  .= [startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK") for x in benchrespre_df.Bench]
+# 	benchrespre_df.test .= [x[6:end] for x in benchrespre_df.Bench]
+
+# 	benchrespre_df.alg  .= [x == Symbol("1Base") ? Symbol("2Base") : Symbol("4AccK") for x in benchrespre_df.alg]
+	
+# 	sort!(benchrespre_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
+# end
+
+# ╔═╡ f3ac43c7-f6d0-4aa6-9d43-c66907de4fa0
+benchresboth_df = sort!([benchres_df;benchrespre_df], [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
+
 # ╔═╡ 00000000-0000-0000-0000-000000000001
 PLUTO_PROJECT_TOML_CONTENTS = """
 [deps]
@@ -82,19 +144,19 @@ PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8"
 StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
 
 [compat]
-DataFrames = "~1.7.0"
+DataFrames = "~1.8.1"
 JSON = "~0.21.4"
-PlutoUI = "~0.7.68"
-StatsPlots = "~0.15.7"
+PlutoUI = "~0.7.71"
+StatsPlots = "~0.15.8"
 """
 
 # ╔═╡ 00000000-0000-0000-0000-000000000002
 PLUTO_MANIFEST_TOML_CONTENTS = """
 # This file is machine-generated - editing it directly is not advised
 
-julia_version = "1.11.6"
+julia_version = "1.12.1"
 manifest_format = "2.0"
-project_hash = "52f7847a79e6136f8251449142cb9cc15e7eaf99"
+project_hash = "baee9e8f260630c1154d35b552a51b54a1819d5e"
 
 [[deps.AbstractFFTs]]
 deps = ["LinearAlgebra"]
@@ -115,9 +177,9 @@ version = "1.3.2"
 
 [[deps.Adapt]]
 deps = ["LinearAlgebra", "Requires"]
-git-tree-sha1 = "f7817e2e585aa6d924fd714df1e2a84be7896c60"
+git-tree-sha1 = "7e35fca2bdfba44d797c53dfe63a51fabf39bfc0"
 uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-version = "4.3.0"
+version = "4.4.0"
 weakdeps = ["SparseArrays", "StaticArrays"]
 
     [deps.Adapt.extensions]
@@ -179,9 +241,9 @@ version = "1.18.5+0"
 
 [[deps.ChainRulesCore]]
 deps = ["Compat", "LinearAlgebra"]
-git-tree-sha1 = "06ee8d1aa558d2833aa799f6f0b31b30cada405f"
+git-tree-sha1 = "e4c6a16e77171a5f5e25e9646617ab1c276c5607"
 uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
-version = "1.25.2"
+version = "1.26.0"
 weakdeps = ["SparseArrays"]
 
     [deps.ChainRulesCore.extensions]
@@ -201,9 +263,9 @@ version = "0.7.8"
 
 [[deps.ColorSchemes]]
 deps = ["ColorTypes", "ColorVectorSpace", "Colors", "FixedPointNumbers", "PrecompileTools", "Random"]
-git-tree-sha1 = "403f2d8e209681fcbd9468a8514efff3ea08452e"
+git-tree-sha1 = "b0fd3f56fa442f81e0a47815c92245acfaaa4e34"
 uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
-version = "3.29.0"
+version = "3.31.0"
 
 [[deps.ColorTypes]]
 deps = ["FixedPointNumbers", "Random"]
@@ -233,9 +295,9 @@ version = "0.13.1"
 
 [[deps.Compat]]
 deps = ["TOML", "UUIDs"]
-git-tree-sha1 = "8ae8d32e09f0dcf42a36b90d4e17f5dd2e4c4215"
+git-tree-sha1 = "9d8a54ce4b17aa5bdce0ea5c34bc5e7c340d16ad"
 uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
-version = "4.16.0"
+version = "4.18.1"
 weakdeps = ["Dates", "LinearAlgebra"]
 
     [deps.Compat.extensions]
@@ -244,7 +306,7 @@ weakdeps = ["Dates", "LinearAlgebra"]
 [[deps.CompilerSupportLibraries_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
-version = "1.1.1+0"
+version = "1.3.0+1"
 
 [[deps.ConcurrentUtilities]]
 deps = ["Serialization", "Sockets"]
@@ -269,15 +331,15 @@ version = "1.16.0"
 
 [[deps.DataFrames]]
 deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
-git-tree-sha1 = "fb61b4812c49343d7ef0b533ba982c46021938a6"
+git-tree-sha1 = "d8928e9169ff76c6281f39a659f9bca3a573f24c"
 uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-version = "1.7.0"
+version = "1.8.1"
 
 [[deps.DataStructures]]
-deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
-git-tree-sha1 = "4e1fe97fdaed23e9dc21d4d664bea76b65fc50a0"
+deps = ["OrderedCollections"]
+git-tree-sha1 = "e357641bb3e0638d353c4b29ea0e40ea644066a6"
 uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
-version = "0.18.22"
+version = "0.19.3"
 
 [[deps.DataValueInterfaces]]
 git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
@@ -319,9 +381,9 @@ version = "1.11.0"
 
 [[deps.Distributions]]
 deps = ["AliasTables", "FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsAPI", "StatsBase", "StatsFuns"]
-git-tree-sha1 = "3e6d038b77f22791b8e3472b7c633acea1ecac06"
+git-tree-sha1 = "3bc002af51045ca3b47d2e1787d6ce02e68b943a"
 uuid = "31c24e10-a181-5473-b8eb-7969acd0382f"
-version = "0.25.120"
+version = "0.25.122"
 
     [deps.Distributions.extensions]
     DistributionsChainRulesCoreExt = "ChainRulesCore"
@@ -357,27 +419,27 @@ version = "0.1.11"
 
 [[deps.Expat_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "d55dffd9ae73ff72f1c0482454dcf2ec6c6c4a63"
+git-tree-sha1 = "27af30de8b5445644e8ffe3bcb0d72049c089cf1"
 uuid = "2e619515-83b5-522b-bb60-26c02a35a201"
-version = "2.6.5+0"
+version = "2.7.3+0"
 
 [[deps.FFMPEG]]
 deps = ["FFMPEG_jll"]
-git-tree-sha1 = "53ebe7511fa11d33bec688a9178fac4e49eeee00"
+git-tree-sha1 = "95ecf07c2eea562b5adbd0696af6db62c0f52560"
 uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a"
-version = "0.4.2"
+version = "0.4.5"
 
 [[deps.FFMPEG_jll]]
 deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "JLLWrappers", "LAME_jll", "Libdl", "Ogg_jll", "OpenSSL_jll", "Opus_jll", "PCRE2_jll", "Zlib_jll", "libaom_jll", "libass_jll", "libfdk_aac_jll", "libvorbis_jll", "x264_jll", "x265_jll"]
-git-tree-sha1 = "466d45dc38e15794ec7d5d63ec03d776a9aff36e"
+git-tree-sha1 = "ccc81ba5e42497f4e76553a5545665eed577a663"
 uuid = "b22a6f82-2f65-5046-a5b2-351ab43fb4e5"
-version = "4.4.4+1"
+version = "8.0.0+0"
 
 [[deps.FFTW]]
-deps = ["AbstractFFTs", "FFTW_jll", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"]
-git-tree-sha1 = "797762812ed063b9b94f6cc7742bc8883bb5e69e"
+deps = ["AbstractFFTs", "FFTW_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"]
+git-tree-sha1 = "97f08406df914023af55ade2f843c39e99c5d969"
 uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
-version = "1.9.0"
+version = "1.10.0"
 
 [[deps.FFTW_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
@@ -391,9 +453,9 @@ version = "1.11.0"
 
 [[deps.FillArrays]]
 deps = ["LinearAlgebra"]
-git-tree-sha1 = "6a70198746448456524cb442b8af316927ff3e1a"
+git-tree-sha1 = "5bfcd42851cf2f1b303f51525a54dc5e98d408a3"
 uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
-version = "1.13.0"
+version = "1.15.0"
 weakdeps = ["PDMats", "SparseArrays", "Statistics"]
 
     [deps.FillArrays.extensions]
@@ -409,9 +471,9 @@ version = "0.8.5"
 
 [[deps.Fontconfig_jll]]
 deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", "JLLWrappers", "Libdl", "Libuuid_jll", "Zlib_jll"]
-git-tree-sha1 = "301b5d5d731a0654825f1f2e906990f7141a106b"
+git-tree-sha1 = "f85dac9a96a01087df6e3a749840015a0ca3817d"
 uuid = "a3f928ae-7b40-5064-980b-68af3947d34b"
-version = "2.16.0+0"
+version = "2.17.1+0"
 
 [[deps.Format]]
 git-tree-sha1 = "9c68794ef81b08086aeb32eeaf33531668d5f5fc"
@@ -443,15 +505,15 @@ version = "3.4.0+2"
 
 [[deps.GR]]
 deps = ["Artifacts", "Base64", "DelimitedFiles", "Downloads", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Preferences", "Printf", "Qt6Wayland_jll", "Random", "Serialization", "Sockets", "TOML", "Tar", "Test", "p7zip_jll"]
-git-tree-sha1 = "1828eb7275491981fa5f1752a5e126e8f26f8741"
+git-tree-sha1 = "f52c27dd921390146624f3aab95f4e8614ad6531"
 uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71"
-version = "0.73.17"
+version = "0.73.18"
 
 [[deps.GR_jll]]
 deps = ["Artifacts", "Bzip2_jll", "Cairo_jll", "FFMPEG_jll", "Fontconfig_jll", "FreeType2_jll", "GLFW_jll", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Libtiff_jll", "Pixman_jll", "Qt6Base_jll", "Zlib_jll", "libpng_jll"]
-git-tree-sha1 = "27299071cc29e409488ada41ec7643e0ab19091f"
+git-tree-sha1 = "4b0406b866ea9fdbaf1148bc9c0b887e59f9af68"
 uuid = "d2c73de3-f751-5644-a686-071e5b155ba9"
-version = "0.73.17+0"
+version = "0.73.18+0"
 
 [[deps.GettextRuntime_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Libiconv_jll"]
@@ -459,11 +521,17 @@ git-tree-sha1 = "45288942190db7c5f760f59c04495064eedf9340"
 uuid = "b0724c58-0f36-5564-988d-3bb0596ebc4a"
 version = "0.22.4+0"
 
+[[deps.Ghostscript_jll]]
+deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Zlib_jll"]
+git-tree-sha1 = "38044a04637976140074d0b0621c1edf0eb531fd"
+uuid = "61579ee1-b43e-5ca0-a5da-69d92c66a64b"
+version = "9.55.1+0"
+
 [[deps.Glib_jll]]
 deps = ["Artifacts", "GettextRuntime_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Libiconv_jll", "Libmount_jll", "PCRE2_jll", "Zlib_jll"]
-git-tree-sha1 = "35fbd0cefb04a516104b8e183ce0df11b70a3f1a"
+git-tree-sha1 = "50c11ffab2a3d50192a228c313f05b5b5dc5acb2"
 uuid = "7746bdde-850d-59dc-9ae8-88ece973131d"
-version = "2.84.3+0"
+version = "2.86.0+0"
 
 [[deps.Graphite2_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
@@ -478,9 +546,9 @@ version = "1.0.2"
 
 [[deps.HTTP]]
 deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "PrecompileTools", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"]
-git-tree-sha1 = "ed5e9c58612c4e081aecdb6e1a479e18462e041e"
+git-tree-sha1 = "5e6fe50ae7f23d171f44e311c2960294aaa0beb5"
 uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
-version = "1.10.17"
+version = "1.10.19"
 
 [[deps.HarfBuzz_jll]]
 deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll"]
@@ -513,9 +581,9 @@ uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89"
 version = "0.2.5"
 
 [[deps.InlineStrings]]
-git-tree-sha1 = "6a9fde685a7ac1eb3495f8e812c5a7c3711c2d5e"
+git-tree-sha1 = "8f3d257792a522b4601c24a577954b0a8cd7334d"
 uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
-version = "1.4.3"
+version = "1.4.5"
 
     [deps.InlineStrings.extensions]
     ArrowTypesExt = "ArrowTypes"
@@ -527,9 +595,9 @@ version = "1.4.3"
 
 [[deps.IntelOpenMP_jll]]
 deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl"]
-git-tree-sha1 = "0f14a5456bdc6b9731a5682f439a672750a09e48"
+git-tree-sha1 = "ec1debd61c300961f98064cfb21287613ad7f303"
 uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0"
-version = "2025.0.4+0"
+version = "2025.2.0+0"
 
 [[deps.InteractiveUtils]]
 deps = ["Markdown"]
@@ -537,24 +605,28 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 version = "1.11.0"
 
 [[deps.Interpolations]]
-deps = ["Adapt", "AxisAlgorithms", "ChainRulesCore", "LinearAlgebra", "OffsetArrays", "Random", "Ratios", "Requires", "SharedArrays", "SparseArrays", "StaticArrays", "WoodburyMatrices"]
-git-tree-sha1 = "88a101217d7cb38a7b481ccd50d21876e1d1b0e0"
+deps = ["Adapt", "AxisAlgorithms", "ChainRulesCore", "LinearAlgebra", "OffsetArrays", "Random", "Ratios", "SharedArrays", "SparseArrays", "StaticArrays", "WoodburyMatrices"]
+git-tree-sha1 = "65d505fa4c0d7072990d659ef3fc086eb6da8208"
 uuid = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
-version = "0.15.1"
-weakdeps = ["Unitful"]
+version = "0.16.2"
 
     [deps.Interpolations.extensions]
+    InterpolationsForwardDiffExt = "ForwardDiff"
     InterpolationsUnitfulExt = "Unitful"
 
+    [deps.Interpolations.weakdeps]
+    ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+    Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d"
+
 [[deps.InvertedIndices]]
 git-tree-sha1 = "6da3c4316095de0f5ee2ebd875df8721e7e0bdbe"
 uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
 version = "1.3.1"
 
 [[deps.IrrationalConstants]]
-git-tree-sha1 = "e2222959fbc6c19554dc15174c81bf7bf3aa691c"
+git-tree-sha1 = "b2d91fe939cae05960e760110b328288867b5758"
 uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
-version = "0.2.4"
+version = "0.2.6"
 
 [[deps.IteratorInterfaceExtensions]]
 git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
@@ -569,9 +641,9 @@ version = "0.1.11"
 
 [[deps.JLLWrappers]]
 deps = ["Artifacts", "Preferences"]
-git-tree-sha1 = "a007feb38b422fbdab534406aeca1b86823cb4d6"
+git-tree-sha1 = "0533e564aae234aff59ab625543145446d8b6ec2"
 uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
-version = "1.7.0"
+version = "1.7.1"
 
 [[deps.JSON]]
 deps = ["Dates", "Mmap", "Parsers", "Unicode"]
@@ -581,21 +653,26 @@ version = "0.21.4"
 
 [[deps.JpegTurbo_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "eac1206917768cb54957c65a615460d87b455fc1"
+git-tree-sha1 = "4255f0032eafd6451d707a51d5f0248b8a165e4d"
 uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8"
-version = "3.1.1+0"
+version = "3.1.3+0"
+
+[[deps.JuliaSyntaxHighlighting]]
+deps = ["StyledStrings"]
+uuid = "ac6e5ff7-fb65-4e79-a425-ec3bc9c03011"
+version = "1.12.0"
 
 [[deps.KernelDensity]]
 deps = ["Distributions", "DocStringExtensions", "FFTW", "Interpolations", "StatsBase"]
-git-tree-sha1 = "7d703202e65efa1369de1279c162b915e245eed1"
+git-tree-sha1 = "ba51324b894edaf1df3ab16e2cc6bc3280a2f1a7"
 uuid = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
-version = "0.6.9"
+version = "0.6.10"
 
 [[deps.LAME_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "170b660facf5df5de098d866564877e119141cbd"
+git-tree-sha1 = "059aabebaa7c82ccb853dd4a0ee9d17796f7e1bc"
 uuid = "c1c5ebd0-6772-5130-a774-d5fcae4a789d"
-version = "3.100.2+0"
+version = "3.100.3+0"
 
 [[deps.LERC_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
@@ -621,10 +698,10 @@ uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
 version = "1.4.0"
 
 [[deps.Latexify]]
-deps = ["Format", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "OrderedCollections", "Requires"]
-git-tree-sha1 = "4f34eaabe49ecb3fb0d58d6015e32fd31a733199"
+deps = ["Format", "Ghostscript_jll", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "OrderedCollections", "Requires"]
+git-tree-sha1 = "44f93c47f9cd6c7e431f2f2091fcba8f01cd7e8f"
 uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316"
-version = "0.16.8"
+version = "0.16.10"
 
     [deps.Latexify.extensions]
     DataFramesExt = "DataFrames"
@@ -649,24 +726,24 @@ uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
 version = "0.6.4"
 
 [[deps.LibCURL_jll]]
-deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
+deps = ["Artifacts", "LibSSH2_jll", "Libdl", "OpenSSL_jll", "Zlib_jll", "nghttp2_jll"]
 uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
-version = "8.6.0+0"
+version = "8.11.1+1"
 
 [[deps.LibGit2]]
-deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"]
+deps = ["LibGit2_jll", "NetworkOptions", "Printf", "SHA"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
 version = "1.11.0"
 
 [[deps.LibGit2_jll]]
-deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"]
+deps = ["Artifacts", "LibSSH2_jll", "Libdl", "OpenSSL_jll"]
 uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5"
-version = "1.7.2+0"
+version = "1.9.0+0"
 
 [[deps.LibSSH2_jll]]
-deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
+deps = ["Artifacts", "Libdl", "OpenSSL_jll"]
 uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
-version = "1.11.0+1"
+version = "1.11.3+1"
 
 [[deps.Libdl]]
 uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
@@ -692,26 +769,26 @@ version = "1.18.0+0"
 
 [[deps.Libmount_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "a31572773ac1b745e0343fe5e2c8ddda7a37e997"
+git-tree-sha1 = "3acf07f130a76f87c041cfb2ff7d7284ca67b072"
 uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9"
-version = "2.41.0+0"
+version = "2.41.2+0"
 
 [[deps.Libtiff_jll]]
 deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"]
-git-tree-sha1 = "4ab7581296671007fc33f07a721631b8855f4b1d"
+git-tree-sha1 = "f04133fe05eff1667d2054c53d59f9122383fe05"
 uuid = "89763e89-9b03-5906-acba-b20f662cd828"
-version = "4.7.1+0"
+version = "4.7.2+0"
 
 [[deps.Libuuid_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "321ccef73a96ba828cd51f2ab5b9f917fa73945a"
+git-tree-sha1 = "2a7a12fc0a4e7fb773450d17975322aa77142106"
 uuid = "38a345b3-de98-5d2b-a5d3-14cd9215e700"
-version = "2.41.0+0"
+version = "2.41.2+0"
 
 [[deps.LinearAlgebra]]
 deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
 uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-version = "1.11.0"
+version = "1.12.0"
 
 [[deps.LogExpFunctions]]
 deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
@@ -735,9 +812,9 @@ version = "1.11.0"
 
 [[deps.LoggingExtras]]
 deps = ["Dates", "Logging"]
-git-tree-sha1 = "f02b56007b064fbfddb4c9cd60161b6dd0f40df3"
+git-tree-sha1 = "f00544d95982ea270145636c181ceda21c4e2575"
 uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36"
-version = "1.1.0"
+version = "1.2.0"
 
 [[deps.MIMEs]]
 git-tree-sha1 = "c64d943587f7187e751162b3b84445bbbd79f691"
@@ -746,9 +823,9 @@ version = "1.1.0"
 
 [[deps.MKL_jll]]
 deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "oneTBB_jll"]
-git-tree-sha1 = "5de60bc6cb3899cd318d80d627560fae2e2d99ae"
+git-tree-sha1 = "282cadc186e7b2ae0eeadbd7a4dffed4196ae2aa"
 uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
-version = "2025.0.1+1"
+version = "2025.2.0+0"
 
 [[deps.MacroTools]]
 git-tree-sha1 = "1e0228a030642014fe5cfe68c2c0a818f9e3f522"
@@ -756,7 +833,7 @@ uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 version = "0.5.16"
 
 [[deps.Markdown]]
-deps = ["Base64"]
+deps = ["Base64", "JuliaSyntaxHighlighting", "StyledStrings"]
 uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 version = "1.11.0"
 
@@ -767,14 +844,15 @@ uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
 version = "1.1.9"
 
 [[deps.MbedTLS_jll]]
-deps = ["Artifacts", "Libdl"]
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "3cce3511ca2c6f87b19c34ffc623417ed2798cbd"
 uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
-version = "2.28.6+0"
+version = "2.28.10+0"
 
 [[deps.Measures]]
-git-tree-sha1 = "c13304c81eec1ed3af7fc20e75fb6b26092a1102"
+git-tree-sha1 = "b513cedd20d9c914783d8ad83d08120702bf2c77"
 uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e"
-version = "0.3.2"
+version = "0.3.3"
 
 [[deps.Missings]]
 deps = ["DataAPI"]
@@ -788,7 +866,7 @@ version = "1.11.0"
 
 [[deps.MozillaCACerts_jll]]
 uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
-version = "2023.12.12"
+version = "2025.5.20"
 
 [[deps.MultivariateStats]]
 deps = ["Arpack", "Distributions", "LinearAlgebra", "SparseArrays", "Statistics", "StatsAPI", "StatsBase"]
@@ -804,13 +882,13 @@ version = "1.1.3"
 
 [[deps.NearestNeighbors]]
 deps = ["Distances", "StaticArrays"]
-git-tree-sha1 = "8a3271d8309285f4db73b4f662b1b290c715e85e"
+git-tree-sha1 = "ca7e18198a166a1f3eb92a3650d53d94ed8ca8a1"
 uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
-version = "0.4.21"
+version = "0.4.22"
 
 [[deps.NetworkOptions]]
 uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
-version = "1.2.0"
+version = "1.3.0"
 
 [[deps.Observables]]
 git-tree-sha1 = "7438a59546cf62428fc9d1bc94729146d37a7225"
@@ -827,32 +905,31 @@ weakdeps = ["Adapt"]
     OffsetArraysAdaptExt = "Adapt"
 
 [[deps.Ogg_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "887579a3eb005446d514ab7aeac5d1d027658b8f"
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "b6aa4566bb7ae78498a5e68943863fa8b5231b59"
 uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051"
-version = "1.3.5+1"
+version = "1.3.6+0"
 
 [[deps.OpenBLAS_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
 uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
-version = "0.3.27+1"
+version = "0.3.29+0"
 
 [[deps.OpenLibm_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
-version = "0.8.5+0"
+version = "0.8.7+0"
 
 [[deps.OpenSSL]]
-deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"]
-git-tree-sha1 = "f1a7e086c677df53e064e0fdd2c9d0b0833e3f6e"
+deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "NetworkOptions", "OpenSSL_jll", "Sockets"]
+git-tree-sha1 = "386b47442468acfb1add94bf2d85365dea10cbab"
 uuid = "4d8831e6-92b7-49fb-bdf8-b643e874388c"
-version = "1.5.0"
+version = "1.6.0"
 
 [[deps.OpenSSL_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "9216a80ff3682833ac4b733caa8c00390620ba5d"
+deps = ["Artifacts", "Libdl"]
 uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95"
-version = "3.5.0+0"
+version = "3.5.1+0"
 
 [[deps.OpenSpecFun_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl"]
@@ -862,9 +939,9 @@ version = "0.5.6+0"
 
 [[deps.Opus_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "6703a85cb3781bd5909d48730a67205f3f31a575"
+git-tree-sha1 = "c392fc5dd032381919e3b22dd32d6443760ce7ea"
 uuid = "91d4177d-7536-5919-b921-800302f37372"
-version = "1.3.3+0"
+version = "1.5.2+0"
 
 [[deps.OrderedCollections]]
 git-tree-sha1 = "05868e21324cede2207c6f0f466b4bfef6d5e7ee"
@@ -874,19 +951,23 @@ version = "1.8.1"
 [[deps.PCRE2_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "efcefdf7-47ab-520b-bdef-62a2eaa19f15"
-version = "10.42.0+1"
+version = "10.44.0+1"
 
 [[deps.PDMats]]
 deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"]
-git-tree-sha1 = "f07c06228a1c670ae4c87d1276b92c7c597fdda0"
+git-tree-sha1 = "d922b4d80d1e12c658da7785e754f4796cc1d60d"
 uuid = "90014a1f-27ba-587c-ab20-58faa44d9150"
-version = "0.11.35"
+version = "0.11.36"
+weakdeps = ["StatsBase"]
+
+    [deps.PDMats.extensions]
+    StatsBaseExt = "StatsBase"
 
 [[deps.Pango_jll]]
 deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "FriBidi_jll", "Glib_jll", "HarfBuzz_jll", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "275a9a6d85dc86c24d03d1837a0010226a96f540"
+git-tree-sha1 = "0662b083e11420952f2e62e17eddae7fc07d5997"
 uuid = "36c8627f-9965-5494-a995-c6b170f724f3"
-version = "1.56.3+0"
+version = "1.57.0+0"
 
 [[deps.Parsers]]
 deps = ["Dates", "PrecompileTools", "UUIDs"]
@@ -903,7 +984,7 @@ version = "0.44.2+0"
 [[deps.Pkg]]
 deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "Random", "SHA", "TOML", "Tar", "UUIDs", "p7zip_jll"]
 uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.11.0"
+version = "1.12.0"
 weakdeps = ["REPL"]
 
     [deps.Pkg.extensions]
@@ -917,15 +998,15 @@ version = "3.3.0"
 
 [[deps.PlotUtils]]
 deps = ["ColorSchemes", "Colors", "Dates", "PrecompileTools", "Printf", "Random", "Reexport", "StableRNGs", "Statistics"]
-git-tree-sha1 = "3ca9a356cd2e113c420f2c13bea19f8d3fb1cb18"
+git-tree-sha1 = "26ca162858917496748aad52bb5d3be4d26a228a"
 uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043"
-version = "1.4.3"
+version = "1.4.4"
 
 [[deps.Plots]]
-deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "JLFzf", "JSON", "LaTeXStrings", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "PrecompileTools", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "RelocatableFolders", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "TOML", "UUIDs", "UnicodeFun", "UnitfulLatexify", "Unzip"]
-git-tree-sha1 = "28ea788b78009c695eb0d637587c81d26bdf0e36"
+deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "JLFzf", "JSON", "LaTeXStrings", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "PrecompileTools", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "RelocatableFolders", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "TOML", "UUIDs", "UnicodeFun", "Unzip"]
+git-tree-sha1 = "12ce661880f8e309569074a61d3767e5756a199f"
 uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
-version = "1.40.14"
+version = "1.41.1"
 
     [deps.Plots.extensions]
     FileIOExt = "FileIO"
@@ -943,9 +1024,9 @@ version = "1.40.14"
 
 [[deps.PlutoUI]]
 deps = ["AbstractPlutoDingetjes", "Base64", "ColorTypes", "Dates", "Downloads", "FixedPointNumbers", "Hyperscript", "HypertextLiteral", "IOCapture", "InteractiveUtils", "JSON", "Logging", "MIMEs", "Markdown", "Random", "Reexport", "URIs", "UUIDs"]
-git-tree-sha1 = "ec9e63bd098c50e4ad28e7cb95ca7a4860603298"
+git-tree-sha1 = "8329a3a4f75e178c11c1ce2342778bcbbbfa7e3c"
 uuid = "7f904dfe-b85e-4ff6-b463-dae2292396a8"
-version = "0.7.68"
+version = "0.7.71"
 
 [[deps.PooledArrays]]
 deps = ["DataAPI", "Future"]
@@ -955,21 +1036,21 @@ version = "1.4.3"
 
 [[deps.PrecompileTools]]
 deps = ["Preferences"]
-git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f"
+git-tree-sha1 = "07a921781cab75691315adc645096ed5e370cb77"
 uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
-version = "1.2.1"
+version = "1.3.3"
 
 [[deps.Preferences]]
 deps = ["TOML"]
-git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6"
+git-tree-sha1 = "0f27480397253da18fe2c12a4ba4eb9eb208bf3d"
 uuid = "21216c6a-2e73-6563-6e65-726566657250"
-version = "1.4.3"
+version = "1.5.0"
 
 [[deps.PrettyTables]]
-deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"]
-git-tree-sha1 = "1101cd475833706e4d0e7b122218257178f48f34"
+deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "REPL", "Reexport", "StringManipulation", "Tables"]
+git-tree-sha1 = "6b8e2f0bae3f678811678065c09571c1619da219"
 uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
-version = "2.4.0"
+version = "3.1.0"
 
 [[deps.Printf]]
 deps = ["Unicode"]
@@ -983,9 +1064,9 @@ version = "1.3.0"
 
 [[deps.Qt6Base_jll]]
 deps = ["Artifacts", "CompilerSupportLibraries_jll", "Fontconfig_jll", "Glib_jll", "JLLWrappers", "Libdl", "Libglvnd_jll", "OpenSSL_jll", "Vulkan_Loader_jll", "Xorg_libSM_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Xorg_libxcb_jll", "Xorg_xcb_util_cursor_jll", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_keysyms_jll", "Xorg_xcb_util_renderutil_jll", "Xorg_xcb_util_wm_jll", "Zlib_jll", "libinput_jll", "xkbcommon_jll"]
-git-tree-sha1 = "eb38d376097f47316fe089fc62cb7c6d85383a52"
+git-tree-sha1 = "34f7e5d2861083ec7596af8b8c092531facf2192"
 uuid = "c0090381-4147-56d7-9ebc-da0b1113ec56"
-version = "6.8.2+1"
+version = "6.8.2+2"
 
 [[deps.Qt6Declarative_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Qt6Base_jll", "Qt6ShaderTools_jll"]
@@ -1001,9 +1082,9 @@ version = "6.8.2+1"
 
 [[deps.Qt6Wayland_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Qt6Base_jll", "Qt6Declarative_jll"]
-git-tree-sha1 = "e1d5e16d0f65762396f9ca4644a5f4ddab8d452b"
+git-tree-sha1 = "8f528b0851b5b7025032818eb5abbeb8a736f853"
 uuid = "e99dba38-086e-5de3-a5b1-6e4c66e897c3"
-version = "6.8.2+1"
+version = "6.8.2+2"
 
 [[deps.QuadGK]]
 deps = ["DataStructures", "LinearAlgebra"]
@@ -1018,7 +1099,7 @@ version = "2.11.2"
     Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 
 [[deps.REPL]]
-deps = ["InteractiveUtils", "Markdown", "Sockets", "StyledStrings", "Unicode"]
+deps = ["InteractiveUtils", "JuliaSyntaxHighlighting", "Markdown", "Sockets", "StyledStrings", "Unicode"]
 uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
 version = "1.11.0"
 
@@ -1068,9 +1149,9 @@ version = "1.3.1"
 
 [[deps.Rmath]]
 deps = ["Random", "Rmath_jll"]
-git-tree-sha1 = "852bd0f55565a9e973fcfee83a84413270224dc4"
+git-tree-sha1 = "5b3d50eb374cea306873b371d3f8d3915a018f0b"
 uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa"
-version = "0.8.0"
+version = "0.9.0"
 
 [[deps.Rmath_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
@@ -1120,20 +1201,20 @@ version = "1.11.0"
 
 [[deps.SortingAlgorithms]]
 deps = ["DataStructures"]
-git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085"
+git-tree-sha1 = "64d974c2e6fdf07f8155b5b2ca2ffa9069b608d9"
 uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
-version = "1.2.1"
+version = "1.2.2"
 
 [[deps.SparseArrays]]
 deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
 uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
-version = "1.11.0"
+version = "1.12.0"
 
 [[deps.SpecialFunctions]]
 deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
-git-tree-sha1 = "41852b8679f78c8d8961eeadc8f62cef861a52e3"
+git-tree-sha1 = "f2685b435df2613e25fc10ad8c26dddb8640f547"
 uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
-version = "2.5.1"
+version = "2.6.1"
 weakdeps = ["ChainRulesCore"]
 
     [deps.SpecialFunctions.extensions]
@@ -1141,15 +1222,15 @@ weakdeps = ["ChainRulesCore"]
 
 [[deps.StableRNGs]]
 deps = ["Random"]
-git-tree-sha1 = "95af145932c2ed859b63329952ce8d633719f091"
+git-tree-sha1 = "4f96c596b8c8258cc7d3b19797854d368f243ddc"
 uuid = "860ef19b-820b-49d6-a774-d7a799459cd3"
-version = "1.0.3"
+version = "1.0.4"
 
 [[deps.StaticArrays]]
 deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"]
-git-tree-sha1 = "0feb6b9031bd5c51f9072393eb5ab3efd31bf9e4"
+git-tree-sha1 = "b8693004b385c842357406e3af647701fe783f98"
 uuid = "90137ffa-7385-5640-81b9-e52037218182"
-version = "1.9.13"
+version = "1.9.15"
 weakdeps = ["ChainRulesCore", "Statistics"]
 
     [deps.StaticArrays.extensions]
@@ -1157,9 +1238,9 @@ weakdeps = ["ChainRulesCore", "Statistics"]
     StaticArraysStatisticsExt = "Statistics"
 
 [[deps.StaticArraysCore]]
-git-tree-sha1 = "192954ef1208c7019899fbf8049e717f92959682"
+git-tree-sha1 = "6ab403037779dae8c514bad259f32a447262455a"
 uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
-version = "1.4.3"
+version = "1.4.4"
 
 [[deps.Statistics]]
 deps = ["LinearAlgebra"]
@@ -1179,15 +1260,15 @@ version = "1.7.1"
 
 [[deps.StatsBase]]
 deps = ["AliasTables", "DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"]
-git-tree-sha1 = "b81c5035922cc89c2d9523afc6c54be512411466"
+git-tree-sha1 = "064b532283c97daae49e544bb9cb413c26511f8c"
 uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
-version = "0.34.5"
+version = "0.34.8"
 
 [[deps.StatsFuns]]
 deps = ["HypergeometricFunctions", "IrrationalConstants", "LogExpFunctions", "Reexport", "Rmath", "SpecialFunctions"]
-git-tree-sha1 = "8e45cecc66f3b42633b8ce14d431e8e57a3e242e"
+git-tree-sha1 = "91f091a8716a6bb38417a6e6f274602a19aaa685"
 uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
-version = "1.5.0"
+version = "1.5.2"
 
     [deps.StatsFuns.extensions]
     StatsFunsChainRulesCoreExt = "ChainRulesCore"
@@ -1199,9 +1280,9 @@ version = "1.5.0"
 
 [[deps.StatsPlots]]
 deps = ["AbstractFFTs", "Clustering", "DataStructures", "Distributions", "Interpolations", "KernelDensity", "LinearAlgebra", "MultivariateStats", "NaNMath", "Observables", "Plots", "RecipesBase", "RecipesPipeline", "Reexport", "StatsBase", "TableOperations", "Tables", "Widgets"]
-git-tree-sha1 = "3b1dcbf62e469a67f6733ae493401e53d92ff543"
+git-tree-sha1 = "88cf3587711d9ad0a55722d339a013c4c56c5bbc"
 uuid = "f3b207a7-027a-5e70-b257-86293d7955fd"
-version = "0.15.7"
+version = "0.15.8"
 
 [[deps.StringManipulation]]
 deps = ["PrecompileTools"]
@@ -1220,7 +1301,7 @@ uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9"
 [[deps.SuiteSparse_jll]]
 deps = ["Artifacts", "Libdl", "libblastrampoline_jll"]
 uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
-version = "7.7.0+0"
+version = "7.8.3+2"
 
 [[deps.TOML]]
 deps = ["Dates"]
@@ -1267,14 +1348,14 @@ uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
 version = "0.11.3"
 
 [[deps.Tricks]]
-git-tree-sha1 = "6cae795a5a9313bbb4f60683f7263318fc7d1505"
+git-tree-sha1 = "311349fd1c93a31f783f977a71e8b062a57d4101"
 uuid = "410a4b4d-49e4-4fbc-ab6d-cb71b17b3775"
-version = "0.1.10"
+version = "0.1.13"
 
 [[deps.URIs]]
-git-tree-sha1 = "24c1c558881564e2217dcf7840a8b2e10caeb0f9"
+git-tree-sha1 = "bef26fb046d031353ef97a82e3fdb6afe7f21b1a"
 uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
-version = "1.6.0"
+version = "1.6.1"
 
 [[deps.UUIDs]]
 deps = ["Random", "SHA"]
@@ -1291,30 +1372,6 @@ git-tree-sha1 = "53915e50200959667e78a92a418594b428dffddf"
 uuid = "1cfade01-22cf-5700-b092-accc4b62d6e1"
 version = "0.4.1"
 
-[[deps.Unitful]]
-deps = ["Dates", "LinearAlgebra", "Random"]
-git-tree-sha1 = "d2282232f8a4d71f79e85dc4dd45e5b12a6297fb"
-uuid = "1986cc42-f94f-5a68-af5c-568840ba703d"
-version = "1.23.1"
-
-    [deps.Unitful.extensions]
-    ConstructionBaseUnitfulExt = "ConstructionBase"
-    ForwardDiffExt = "ForwardDiff"
-    InverseFunctionsUnitfulExt = "InverseFunctions"
-    PrintfExt = "Printf"
-
-    [deps.Unitful.weakdeps]
-    ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
-    ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-    InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
-    Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-
-[[deps.UnitfulLatexify]]
-deps = ["LaTeXStrings", "Latexify", "Unitful"]
-git-tree-sha1 = "af305cc62419f9bd61b6644d19170a4d258c7967"
-uuid = "45397f5d-5981-4c77-b2b3-fc36d6e9b728"
-version = "1.7.0"
-
 [[deps.Unzip]]
 git-tree-sha1 = "ca0969166a028236229f63514992fc073799bb78"
 uuid = "41fe7b60-77ed-43a1-b4f0-825fd5a5650d"
@@ -1327,16 +1384,10 @@ uuid = "a44049a8-05dd-5a78-86c9-5fde0876e88c"
 version = "1.3.243+0"
 
 [[deps.Wayland_jll]]
-deps = ["Artifacts", "EpollShim_jll", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll", "XML2_jll"]
-git-tree-sha1 = "53ab3e9c94f4343c68d5905565be63002e13ec8c"
+deps = ["Artifacts", "EpollShim_jll", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll"]
+git-tree-sha1 = "96478df35bbc2f3e1e791bc7a3d0eeee559e60e9"
 uuid = "a2964d1f-97da-50d4-b82a-358c7fce9d89"
-version = "1.23.1+1"
-
-[[deps.Wayland_protocols_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "54b8a029ac145ebe8299463447fd1590b2b1d92f"
-uuid = "2381bf8a-dfd0-557d-9999-79630e7b1b91"
-version = "1.44.0+0"
+version = "1.24.0+0"
 
 [[deps.Widgets]]
 deps = ["Colors", "Dates", "Observables", "OrderedCollections"]
@@ -1350,12 +1401,6 @@ git-tree-sha1 = "c1a7aa6219628fcd757dede0ca95e245c5cd9511"
 uuid = "efce3f68-66dc-5838-9240-27a6d6f5f9b6"
 version = "1.0.0"
 
-[[deps.XML2_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"]
-git-tree-sha1 = "b8b243e47228b4a3877f1dd6aee0c5d56db7fcf4"
-uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
-version = "2.13.6+1"
-
 [[deps.XZ_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
 git-tree-sha1 = "fee71455b0aaa3440dfdd54a9a36ccef829be7d4"
@@ -1406,9 +1451,9 @@ version = "1.3.7+0"
 
 [[deps.Xorg_libXfixes_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libX11_jll"]
-git-tree-sha1 = "9caba99d38404b285db8801d5c45ef4f4f425a6d"
+git-tree-sha1 = "75e00946e43621e09d431d9b95818ee751e6b2ef"
 uuid = "d091e8ba-531a-589c-9de9-94069b037ed8"
-version = "6.0.1+0"
+version = "6.0.2+0"
 
 [[deps.Xorg_libXi_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libXext_jll", "Xorg_libXfixes_jll"]
@@ -1448,9 +1493,9 @@ version = "1.1.3+0"
 
 [[deps.Xorg_xcb_util_cursor_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_jll", "Xorg_xcb_util_renderutil_jll"]
-git-tree-sha1 = "c5bf2dad6a03dfef57ea0a170a1fe493601603f2"
+git-tree-sha1 = "9750dc53819eba4e9a20be42349a6d3b86c7cdf8"
 uuid = "e920d4aa-a673-5f3a-b3d7-f755a4d47c43"
-version = "0.1.5+0"
+version = "0.1.6+0"
 
 [[deps.Xorg_xcb_util_image_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_xcb_util_jll"]
@@ -1503,7 +1548,7 @@ version = "1.6.0+0"
 [[deps.Zlib_jll]]
 deps = ["Libdl"]
 uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.13+1"
+version = "1.3.1+2"
 
 [[deps.Zstd_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
@@ -1525,20 +1570,20 @@ version = "0.61.1+0"
 
 [[deps.libaom_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "522c1df09d05a71785765d19c9524661234738e9"
+git-tree-sha1 = "371cc681c00a3ccc3fbc5c0fb91f58ba9bec1ecf"
 uuid = "a4ae2306-e953-59d6-aa16-d00cac43593b"
-version = "3.11.0+0"
+version = "3.13.1+0"
 
 [[deps.libass_jll]]
 deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "HarfBuzz_jll", "JLLWrappers", "Libdl", "Zlib_jll"]
-git-tree-sha1 = "e17c115d55c5fbb7e52ebedb427a0dca79d4484e"
+git-tree-sha1 = "125eedcb0a4a0bba65b657251ce1d27c8714e9d6"
 uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0"
-version = "0.15.2+0"
+version = "0.17.4+0"
 
 [[deps.libblastrampoline_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
-version = "5.11.0+0"
+version = "5.15.0+0"
 
 [[deps.libdecor_jll]]
 deps = ["Artifacts", "Dbus_jll", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pango_jll", "Wayland_jll", "xkbcommon_jll"]
@@ -1554,9 +1599,9 @@ version = "1.13.4+0"
 
 [[deps.libfdk_aac_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "8a22cf860a7d27e4f3498a0fe0811a7957badb38"
+git-tree-sha1 = "646634dd19587a56ee2f1199563ec056c5f228df"
 uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280"
-version = "2.0.3+0"
+version = "2.0.4+0"
 
 [[deps.libinput_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "eudev_jll", "libevdev_jll", "mtdev_jll"]
@@ -1566,15 +1611,15 @@ version = "1.28.1+0"
 
 [[deps.libpng_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"]
-git-tree-sha1 = "cd155272a3738da6db765745b89e466fa64d0830"
+git-tree-sha1 = "07b6a107d926093898e82b3b1db657ebe33134ec"
 uuid = "b53b4c65-9356-5827-b1ea-8c7a1a84506f"
-version = "1.6.49+0"
+version = "1.6.50+0"
 
 [[deps.libvorbis_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Ogg_jll", "Pkg"]
-git-tree-sha1 = "490376214c4721cdaca654041f635213c6165cb3"
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Ogg_jll"]
+git-tree-sha1 = "11e1772e7f3cc987e9d3de991dd4f6b2602663a5"
 uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a"
-version = "1.3.7+2"
+version = "1.3.8+0"
 
 [[deps.mtdev_jll]]
 deps = ["Artifacts", "JLLWrappers", "Libdl"]
@@ -1585,43 +1630,51 @@ version = "1.1.7+0"
 [[deps.nghttp2_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
-version = "1.59.0+0"
+version = "1.64.0+1"
 
 [[deps.oneTBB_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl"]
-git-tree-sha1 = "d5a767a3bb77135a99e433afe0eb14cd7f6914c3"
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl"]
+git-tree-sha1 = "1350188a69a6e46f799d3945beef36435ed7262f"
 uuid = "1317d2d5-d96f-522e-a858-c73665f53c3e"
-version = "2022.0.0+0"
+version = "2022.0.0+1"
 
 [[deps.p7zip_jll]]
 deps = ["Artifacts", "Libdl"]
 uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
-version = "17.4.0+2"
+version = "17.5.0+2"
 
 [[deps.x264_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "4fea590b89e6ec504593146bf8b988b2c00922b2"
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "14cc7083fc6dff3cc44f2bc435ee96d06ed79aa7"
 uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a"
-version = "2021.5.5+0"
+version = "10164.0.1+0"
 
 [[deps.x265_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
-git-tree-sha1 = "ee567a171cce03570d77ad3a43e90218e38937a9"
+deps = ["Artifacts", "JLLWrappers", "Libdl"]
+git-tree-sha1 = "e7b67590c14d487e734dcb925924c5dc43ec85f3"
 uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76"
-version = "3.5.0+0"
+version = "4.1.0+0"
 
 [[deps.xkbcommon_jll]]
-deps = ["Artifacts", "JLLWrappers", "Libdl", "Wayland_jll", "Wayland_protocols_jll", "Xorg_libxcb_jll", "Xorg_xkeyboard_config_jll"]
-git-tree-sha1 = "c950ae0a3577aec97bfccf3381f66666bc416729"
+deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxcb_jll", "Xorg_xkeyboard_config_jll"]
+git-tree-sha1 = "fbf139bce07a534df0e699dbb5f5cc9346f95cc1"
 uuid = "d8fb68d0-12a3-5cfd-a85a-d49703b185fd"
-version = "1.8.1+0"
+version = "1.9.2+0"
 """
 
 # ╔═╡ Cell order:
 # ╟─7259dbee-52ac-11f0-3192-dd97323b274a
 # ╟─d4accca6-f650-453c-bb75-a8e4cac568c1
 # ╟─51dbf656-5846-440e-aff6-ea7d57e62e5c
-# ╟─3cd5fd0a-6f16-4cb3-87ba-43b86224b81c
-# ╟─0f77fb5f-e894-43e4-94f5-4ed93af7ba9b
+# ╠═3cd5fd0a-6f16-4cb3-87ba-43b86224b81c
+# ╠═cde9391a-44dd-49ee-8730-4b9ad58c3d90
+# ╠═8601e5de-180c-45b5-b0c6-1f8d807df6d0
+# ╠═e27f7b92-79f2-4351-bbc5-46d6e5a9fd67
+# ╠═0f77fb5f-e894-43e4-94f5-4ed93af7ba9b
+# ╠═002b7672-9431-4510-ba89-84be098a2f9f
+# ╠═b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2
+# ╠═4681accf-eaed-47e0-9d26-ab968df83c8a
+# ╠═7a3a4783-8424-488d-8122-1d65680703ac
+# ╠═f3ac43c7-f6d0-4aa6-9d43-c66907de4fa0
 # ╟─00000000-0000-0000-0000-000000000001
 # ╟─00000000-0000-0000-0000-000000000002
diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl
index 93736bb..8a8dd98 100644
--- a/benchmark/runbenchmarks.jl
+++ b/benchmark/runbenchmarks.jl
@@ -119,7 +119,15 @@ end
 
 @info "Preparing benchmarks"
 warmup(SUITE; verbose=false)
-tune!(SUITE)
+
+if isfile("params.json")
+    @info "Loading params"
+    loadparams!(SUITE, BenchmarkTools.load("params.json")[1], :evals, :samples);
+else
+    @info "Tuning suite"
+    tune!(SUITE)
+    BenchmarkTools.save("params.json", params(SUITE));
+end
 
 reclaim_mem()
 
@@ -127,6 +135,7 @@ reclaim_mem()
 results = run(SUITE, verbose=true)
 
 BenchmarkTools.save("benchmarkresults.json", median(results))
+BenchmarkTools.save("benchmarkresultsstd.json", std(results))
 
 # save plots for each file/datatype
 # for l1 in keys(results)

From 59fa29e8e31897e19493b82b1971f2ae080ff2f1 Mon Sep 17 00:00:00 2001
From: Christian <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 14 Nov 2025 16:36:39 -0400
Subject: [PATCH 03/12] Extra shape tests

---
 benchmark/accumulate_nd.jl | 20 +++++++++++---------
 benchmark/mapreduce_nd.jl  | 23 +++++++++++------------
 2 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/benchmark/accumulate_nd.jl b/benchmark/accumulate_nd.jl
index e8747db..24f253e 100644
--- a/benchmark/accumulate_nd.jl
+++ b/benchmark/accumulate_nd.jl
@@ -12,17 +12,19 @@ for T in [UInt32, Int64, Float32]
 
     local randrange = T == Float32 ? T : T(1):T(100)
 
-    _group["base_dims=1"] = @benchmarkable @sb(Base.accumulate(+, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
-    _group["acck_dims=1"] = @benchmarkable @sb(AK.accumulate(+, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+    for (suff, (n1, n2)) in (("L", (3, 1_000_000)), ("", (512, 1000)))
+        _group["base_dims=1$suff"] = @benchmarkable @sb(Base.accumulate(+, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["acck_dims=1$suff"] = @benchmarkable @sb(AK.accumulate(+, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
 
-    _group["base_dims=2"] = @benchmarkable @sb(Base.accumulate(+, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
-    _group["acck_dims=2"] = @benchmarkable @sb(AK.accumulate(+, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["base_dims=2$suff"] = @benchmarkable @sb(Base.accumulate(+, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["acck_dims=2$suff"] = @benchmarkable @sb(AK.accumulate(+, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
 
-    T == Float32 || continue
+        T == Float32 || continue
 
-    _group["base_sincos_dims=1"] = @benchmarkable @sb(Base.accumulate(acc_f, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
-    _group["acck_sincos_dims=1"] = @benchmarkable @sb(AK.accumulate(acc_f, v, init=$T(0), neutral=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["base_sincos_dims=1$suff"] = @benchmarkable @sb(Base.accumulate(acc_f, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["acck_sincos_dims=1$suff"] = @benchmarkable @sb(AK.accumulate(acc_f, v, init=$T(0), neutral=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
 
-    _group["base_sincos_dims=2"] = @benchmarkable @sb(Base.accumulate(acc_f, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
-    _group["acck_sincos_dims=2"] = @benchmarkable @sb(AK.accumulate(acc_f, v, init=$T(0), neutral=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["base_sincos_dims=2$suff"] = @benchmarkable @sb(Base.accumulate(acc_f, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["acck_sincos_dims=2$suff"] = @benchmarkable @sb(AK.accumulate(acc_f, v, init=$T(0), neutral=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+    end
 end
diff --git a/benchmark/mapreduce_nd.jl b/benchmark/mapreduce_nd.jl
index 26578be..2aef7c1 100644
--- a/benchmark/mapreduce_nd.jl
+++ b/benchmark/mapreduce_nd.jl
@@ -1,24 +1,23 @@
 group = addgroup!(SUITE, "mapreduce_nd")
 
-n1 = 3
-n2 = 1_000_000
-
 for T in [UInt32, Int64, Float32]
     local _group = addgroup!(group, "$T")
 
     local randrange = T == Float32 ? T : T(1):T(100)
 
-    _group["base_dims=1"] = @benchmarkable @sb(Base.reduce(+, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
-    _group["acck_dims=1"] = @benchmarkable @sb(AK.reduce(+, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+    for (suff, (n1, n2)) in (("L", (3, 1_000_000)), ("", (512, 1000)))
+        _group["base_dims=1$(suff)"] = @benchmarkable @sb(Base.reduce(+, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["acck_dims=1$(suff)"] = @benchmarkable @sb(AK.reduce(+, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
 
-    _group["base_dims=2"] = @benchmarkable @sb(Base.reduce(+, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
-    _group["acck_dims=2"] = @benchmarkable @sb(AK.reduce(+, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["base_dims=2$(suff)"] = @benchmarkable @sb(Base.reduce(+, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["acck_dims=2$(suff)"] = @benchmarkable @sb(AK.reduce(+, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
 
-    T == Float32 || continue
+        T == Float32 || continue
 
-    _group["base_dims=1_sin"] = @benchmarkable @sb(Base.mapreduce(sin, +, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
-    _group["acck_dims=1_sin"] = @benchmarkable @sb(AK.mapreduce(sin, +, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["base_dims=1$(suff)_sin"] = @benchmarkable @sb(Base.mapreduce(sin, +, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["acck_dims=1$(suff)_sin"] = @benchmarkable @sb(AK.mapreduce(sin, +, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
 
-    _group["base_dims=2_sin"] = @benchmarkable @sb(Base.mapreduce(sin, +, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
-    _group["acck_dims=2_sin"] = @benchmarkable @sb(AK.mapreduce(sin, +, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["base_dims=2$(suff)_sin"] = @benchmarkable @sb(Base.mapreduce(sin, +, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+        _group["acck_dims=2$(suff)_sin"] = @benchmarkable @sb(AK.mapreduce(sin, +, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2)))
+    end
 end

From 3321852e3f4e26b369812f3918fee31993279109 Mon Sep 17 00:00:00 2001
From: Christian <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 14 Nov 2025 16:37:14 -0400
Subject: [PATCH 04/12] Support specifying benchmark output file

---
 benchmark/runbenchmarks.jl | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl
index 8a8dd98..4e59dc5 100644
--- a/benchmark/runbenchmarks.jl
+++ b/benchmark/runbenchmarks.jl
@@ -10,6 +10,7 @@ rng = StableRNG(123)
 # parse command line args
 BACKENDS = ["--CUDA", "--oneAPI", "--AMDGPU", "--Metal", "--OpenCL", "--CPU"]
 b_opt_idx = in.(ARGS, Ref(BACKENDS))
+out_opt_idx = findall(x -> endswith(x, ".json",), ARGS)
 
 if !@isdefined(backend_arg)
     backend_arg = if sum(b_opt_idx) == 0
@@ -22,7 +23,9 @@ if !@isdefined(backend_arg)
 end
 backend_arg in BACKENDS || throw(ArgumentError("\"$backend_arg\" is not a valid backend."))
 
-other_args = ARGS[.!b_opt_idx]
+other_args_idx = copy(b_opt_idx)
+other_args_idx[out_opt_idx] .= true
+other_args = ARGS[.!other_args_idx]
 # other_args = ["accumulate_1"]
 
 bench_to_include = isempty(other_args) ? nothing : other_args
@@ -134,8 +137,15 @@ reclaim_mem()
 @info "Running benchmarks"
 results = run(SUITE, verbose=true)
 
-BenchmarkTools.save("benchmarkresults.json", median(results))
-BenchmarkTools.save("benchmarkresultsstd.json", std(results))
+
+result_file = if isempty((out_opt_idx))
+    "benchmarkresults"
+else
+    first(splitext(ARGS[first(out_opt_idx)]))
+end
+
+BenchmarkTools.save("$(result_file).json", median(results))
+BenchmarkTools.save("$(result_file)std.json", std(results))
 
 # save plots for each file/datatype
 # for l1 in keys(results)

From 73acfe21b4821c07098829f302c79bcab3a6b95b Mon Sep 17 00:00:00 2001
From: Christian <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 14 Nov 2025 16:56:14 -0400
Subject: [PATCH 05/12] Show standard error and more improvements

---
 benchmark/benchmark_graphs_nb.jl | 102 ++++++++++---------------------
 1 file changed, 33 insertions(+), 69 deletions(-)

diff --git a/benchmark/benchmark_graphs_nb.jl b/benchmark/benchmark_graphs_nb.jl
index 6ee481a..4007a91 100644
--- a/benchmark/benchmark_graphs_nb.jl
+++ b/benchmark/benchmark_graphs_nb.jl
@@ -27,18 +27,13 @@ md"""
 """
 end
 
-# ╔═╡ 8601e5de-180c-45b5-b0c6-1f8d807df6d0
-
-
-# ╔═╡ e27f7b92-79f2-4351-bbc5-46d6e5a9fd67
-
-
 # ╔═╡ 0f77fb5f-e894-43e4-94f5-4ed93af7ba9b
 begin
-	function plot_benches(df, cat, t; ylabel="Time (ns)")
+	function plot_benches(df, cat, t; ylabel="Time (ns)", kwargs...)
 		df = filter(x -> isequal(cat, x.Category), df)
 		df = filter(x -> isequal(t, x.T), df)
-		groupedbar(df.Time; group=df.alg, ylabel, title="$cat/$t", xticks=(1:length(unique(df.test)),unique(df.test)), xtickfontsize=6, xrotation = 30)
+		yerror = "Std" in names(df) ? df.Std : nothing
+		groupedbar(df.Time; group=df.alg, ylabel, yerror, title="$cat/$t", xticks=(1:length(unique(df.test)),unique(df.test)), xtickfontsize=6, xrotation = 30, kwargs...)
 	end
 	function getbenches(res)
 		_res = res[2][1]
@@ -60,34 +55,42 @@ begin
 		end
 	end
 	make_res_df(file) = make_res_df(x -> startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK"), file)
-	function make_res_df(alg_f, file)
+	function _make_res_df(alg_f, file)
 		benchresults = JSON.parsefile(file)
 		benchres_df = getbenches(benchresults)
 		benchres_df.alg  .= [alg_f(x) for x in benchres_df.Bench]
 		benchres_df.test .= [x[6:end] for x in benchres_df.Bench]
-		
-		sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
 		return benchres_df
 	end
+	function make_res_df(alg_f, file, filestd=true)
+		benchres_df = _make_res_df(alg_f, file)
+
+		final_df = if filestd
+			sbenchres_df = _make_res_df(alg_f, "$(first(splitext(file)))std.json")
+			rename!(sbenchres_df, :Time => :Std)
+			
+			innerjoin(benchres_df, sbenchres_df; on=[:Category, :T, :Bench, :alg, :test])
+		else
+			benchres_df
+		end
+		final_df = sort!(final_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
+		
+		remove_trail(x) = first(split(x, "_"))
+		final_df.Category .= remove_trail.(final_df.Category)
+		
+		sort!(final_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
+		return final_df
+	end
 end
 
-# ╔═╡ 002b7672-9431-4510-ba89-84be098a2f9f
-# begin
-# 	benchresultspre = JSON.parsefile("benchmarkresultsstd.json")
-# 	benchrespre_df = getbenches(benchresultspre)
-# 	benchrespre_df.alg  .= [startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK") for x in benchrespre_df.Bench]
-# 	benchrespre_df.test .= [x[6:end] for x in benchrespre_df.Bench]
-
-# 	benchrespre_df.alg  .= [x == Symbol("1Base") ? Symbol("2Base") : Symbol("4AccK") for x in benchrespre_df.alg]
-	
-# 	sort!(benchrespre_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
-# end
-
 # ╔═╡ b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2
 begin
-	benchrespre_df = make_res_df(x->startswith(x, "base") ? Symbol("1BasePre") : Symbol("4AccKPre"), "benchmarkresultspre.json")
-	benchres_df = make_res_df(x->startswith(x, "base") ? Symbol("2Base") : Symbol("5AccK"), "benchmarkresultsnew.json")
-	benchresi32_df = make_res_df(x->startswith(x, "base") ? Symbol("3BaseI32") : Symbol("6AccKI32"), "benchmarkresults.json")
+	benchreska09_df = make_res_df(x->startswith(x, "base") ? Symbol("1Baseka0.9") : Symbol("5AccKka0.9"), "benchmarkresultska0.9.json")
+	benchreska10_df = make_res_df(x->startswith(x, "base") ? Symbol("2Baseka0.10") : Symbol("6AccKka0.10"), "benchmarkresultska0.10.json")
+	benchreski10_df = make_res_df(x->startswith(x, "base") ? Symbol("3Baseki0.10") : Symbol("7AccKki0.10"), "benchmarkresultski0.10.json")
+	benchreski10heur_df = make_res_df(x->startswith(x, "base") ? Symbol("4Baseki0.10heur") : Symbol("8AccKkiheur0.10"), "benchmarkresultski0.10heur.json")
+	benchres_df = [benchreska09_df;benchreska10_df;benchreski10_df;benchreski10heur_df]
+	benchres_df = benchres_df[.!occursin.(Ref("sin"), benchres_df.Bench), :]
 end
 
 # ╔═╡ d4accca6-f650-453c-bb75-a8e4cac568c1
@@ -100,40 +103,8 @@ md"
 Type: $(@bind typ Select(unique(benchres_df.T);))
 "
 
-# ╔═╡ 3cd5fd0a-6f16-4cb3-87ba-43b86224b81c
-plot_benches(benchres_df, cat, typ)
-
 # ╔═╡ cde9391a-44dd-49ee-8730-4b9ad58c3d90
-plot_benches([benchres_df;benchrespre_df;benchresi32_df], cat, typ)
-
-# ╔═╡ 4681accf-eaed-47e0-9d26-ab968df83c8a
-# begin
-# make_res_df(file) = make_res_df(x -> startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK"), file)
-# function make_res_df(alg_f, file)
-# 	benchresults = JSON.parsefile("benchmarkresultsstd.json")
-# 	benchres_df = getbenches(benchresults)
-# 	benchres_df.alg  .= [alg_f(x) for x in benchres_df.Bench]
-# 	benchres_df.test .= [x[6:end] for x in benchres_df.Bench]
-	
-# 	sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
-# 	return benchres_df
-# end
-# end
-
-# ╔═╡ 7a3a4783-8424-488d-8122-1d65680703ac
-# begin
-# 	benchresultspre = JSON.parsefile("benchmarkresultsstd.json")
-# 	benchrespre_df = getbenches(benchresultspre)
-# 	benchrespre_df.alg  .= [startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK") for x in benchrespre_df.Bench]
-# 	benchrespre_df.test .= [x[6:end] for x in benchrespre_df.Bench]
-
-# 	benchrespre_df.alg  .= [x == Symbol("1Base") ? Symbol("2Base") : Symbol("4AccK") for x in benchrespre_df.alg]
-	
-# 	sort!(benchrespre_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
-# end
-
-# ╔═╡ f3ac43c7-f6d0-4aa6-9d43-c66907de4fa0
-benchresboth_df = sort!([benchres_df;benchrespre_df], [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)])
+plot_benches(benchres_df, cat, typ)
 
 # ╔═╡ 00000000-0000-0000-0000-000000000001
 PLUTO_PROJECT_TOML_CONTENTS = """
@@ -1666,15 +1637,8 @@ version = "1.9.2+0"
 # ╟─7259dbee-52ac-11f0-3192-dd97323b274a
 # ╟─d4accca6-f650-453c-bb75-a8e4cac568c1
 # ╟─51dbf656-5846-440e-aff6-ea7d57e62e5c
-# ╠═3cd5fd0a-6f16-4cb3-87ba-43b86224b81c
-# ╠═cde9391a-44dd-49ee-8730-4b9ad58c3d90
-# ╠═8601e5de-180c-45b5-b0c6-1f8d807df6d0
-# ╠═e27f7b92-79f2-4351-bbc5-46d6e5a9fd67
-# ╠═0f77fb5f-e894-43e4-94f5-4ed93af7ba9b
-# ╠═002b7672-9431-4510-ba89-84be098a2f9f
-# ╠═b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2
-# ╠═4681accf-eaed-47e0-9d26-ab968df83c8a
-# ╠═7a3a4783-8424-488d-8122-1d65680703ac
-# ╠═f3ac43c7-f6d0-4aa6-9d43-c66907de4fa0
+# ╟─cde9391a-44dd-49ee-8730-4b9ad58c3d90
+# ╟─0f77fb5f-e894-43e4-94f5-4ed93af7ba9b
+# ╟─b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2
 # ╟─00000000-0000-0000-0000-000000000001
 # ╟─00000000-0000-0000-0000-000000000002

From 8dce5fa5654996e2150903e693de631f69586349 Mon Sep 17 00:00:00 2001
From: Christian <28689358+christiangnrd@users.noreply.github.com>
Date: Fri, 14 Nov 2025 16:57:25 -0400
Subject: [PATCH 06/12] Implement algorithm selection heuristic

---
 src/accumulate/accumulate_nd.jl | 17 +++++++++--------
 src/reduce/mapreduce_nd.jl      | 16 +++++++++-------
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl
index d045948..77458d0 100644
--- a/src/accumulate/accumulate_nd.jl
+++ b/src/accumulate/accumulate_nd.jl
@@ -38,17 +38,18 @@ function accumulate_nd!(
     if !use_gpu_algorithm(backend, prefer_threads)
         _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
     else
-        # On GPUs we have two parallelisation approaches, based on which dimension has more elements:
-        #   - If the dimension we are accumulating along has more elements than the "outer" dimensions,
-        #     (e.g. accumulate(+, rand(3, 1000), dims=2)), we use a block of threads per outer
-        #     dimension - thus, a block of threads reduces the dims axis
-        #   - If the other dimensions have more elements (e.g. reduce(+, rand(3, 1000), dims=1)), we
-        #     use a single thread per outer dimension - thus, a thread reduces the dims axis
-        #     sequentially, while the other dimensions are processed in parallel, independently
+        # On GPUs we have two parallelisation approaches, based on destination dimension and current hardware:
+        #   - If the other dimensions have more elements than the product of the device's compute units and
+        #     maximum number of threads , we use a single thread per outer dimension - thus, a thread reduces
+        #     the dims axis sequentially, while the other dimensions are processed in parallel, independently
+        #   - If the dimension we are accumulating along has more elements, we use a block of threads per outer
+        #     element - thus, a block of threads reduces the dims axis
         length_dims = vsizes[dims]
         length_outer = length(v) ÷ length_dims
 
-        if length_outer >= length_dims
+        serial_threshold = KI.max_work_group_size(backend) * KI.multiprocessor_count(backend)
+
+        if length_outer >= serial_threshold
             # One thread per outer dimension
             blocks = (length_outer + block_size - 1) ÷ block_size
             KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_thread!(
diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl
index 71fad0d..da00582 100644
--- a/src/reduce/mapreduce_nd.jl
+++ b/src/reduce/mapreduce_nd.jl
@@ -123,13 +123,15 @@ function mapreduce_nd(
             min_elems=min_elems,
         )
     else
-        # On GPUs we have two parallelisation approaches, based on which dimension has more elements:
-        #   - If the dimension we are reducing has more elements, (e.g. reduce(+, rand(3, 1000), dims=2)),
-        #     we use a block of threads per dst element - thus, a block of threads reduces the dims axis
-        #   - If the other dimensions have more elements (e.g. reduce(+, rand(3, 1000), dims=1)), we
-        #     use a single thread per dst element - thus, a thread reduces the dims axis sequentially,
-        #     while the other dimensions are processed in parallel, independently
-        if dst_size >= src_sizes[dims]
+        # On GPUs we have two parallelisation approaches, based on destination dimension and current hardware:
+        #   - If the other dimensions have more elements than the product of the device's compute units and
+        #     maximum number of threads , we use a single thread per dst element - thus, a thread reduces
+        #     the dims axis sequentially, while the other dimensions are processed in parallel, independently
+        #   - If the dimension we are reducing has more elements, we use a block of threads per dst
+        #     element - thus, a block of threads reduces the dims axis
+        by_thread_threshold = KI.max_work_group_size(backend) * KI.multiprocessor_count(backend)
+
+        if dst_size >= by_thread_threshold
             blocks = (dst_size + block_size - 1) ÷ block_size
             KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_thread!(
                 src, dst, f, op, init, dims, Val(block_size)

From e8bc09cee2fae5f1f379bd10bbd75b63b8bb6d36 Mon Sep 17 00:00:00 2001
From: Christian <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 16 Nov 2025 15:27:53 -0400
Subject: [PATCH 07/12] Only check `block_size` argument when on GPU

---
 src/accumulate/accumulate_nd.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl
index 77458d0..a078f12 100644
--- a/src/accumulate/accumulate_nd.jl
+++ b/src/accumulate/accumulate_nd.jl
@@ -13,10 +13,6 @@ function accumulate_nd!(
     # GPU settings
     block_size::Int,
 )
-    # Correctness checks
-    @argcheck block_size > 0
-    @argcheck ispow2(block_size)
-
     # Degenerate cases begin; order of priority matters
 
     # Invalid dims
@@ -38,6 +34,10 @@ function accumulate_nd!(
     if !use_gpu_algorithm(backend, prefer_threads)
         _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
     else
+        # Correctness checks
+        @argcheck block_size > 0
+        @argcheck ispow2(block_size)
+        
         # On GPUs we have two parallelisation approaches, based on destination dimension and current hardware:
         #   - If the other dimensions have more elements than the product of the device's compute units and
         #     maximum number of threads , we use a single thread per outer dimension - thus, a thread reduces

From e12445ab0b4817bc104164883d044618a8abb1a5 Mon Sep 17 00:00:00 2001
From: Christian <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 16 Nov 2025 16:39:53 -0400
Subject: [PATCH 08/12] jyf

---
 src/AcceleratedKernels.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl
index d4655de..5f14229 100644
--- a/src/AcceleratedKernels.jl
+++ b/src/AcceleratedKernels.jl
@@ -14,6 +14,7 @@ module AcceleratedKernels
 using ArgCheck: @argcheck
 using GPUArraysCore: AnyGPUArray, @allowscalar
 using KernelAbstractions
+import KernelAbstractions.KernelIntrinsics as KI
 import UnsafeAtomics
 
 

From dcb01d9af68bee711a112c96e24290362c543a46 Mon Sep 17 00:00:00 2001
From: Christian <28689358+christiangnrd@users.noreply.github.com>
Date: Sun, 16 Nov 2025 16:42:47 -0400
Subject: [PATCH 09/12] Progress

---
 benchmark/Project.toml              |  1 +
 src/accumulate/accumulate.jl        |  8 ++---
 src/accumulate/accumulate_1d_cpu.jl |  2 +-
 src/accumulate/accumulate_1d_gpu.jl |  4 +--
 src/accumulate/accumulate_nd.jl     | 36 +++++++++++++--------
 src/foreachindex.jl                 | 31 +++++++++---------
 src/reduce/mapreduce_1d_cpu.jl      |  2 +-
 src/reduce/mapreduce_1d_gpu.jl      | 49 +++++++++++++++++++----------
 src/reduce/mapreduce_nd.jl          | 34 +++++++++++++-------
 src/reduce/reduce.jl                |  6 ++--
 src/reduce/utilities.jl             |  2 +-
 src/utils.jl                        |  4 +++
 12 files changed, 110 insertions(+), 69 deletions(-)

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index a441514..1b8be73 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -2,6 +2,7 @@
 AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 BenchmarkPlots = "ab8c0f59-4072-4e0d-8f91-a91e1495eb26"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl
index e60be03..f9858c3 100644
--- a/src/accumulate/accumulate.jl
+++ b/src/accumulate/accumulate.jl
@@ -43,7 +43,7 @@ include("accumulate_nd.jl")
         alg::AccumulateAlgorithm=ScanPrefixes(),
 
         # GPU settings
-        block_size::Int=256,
+        block_size::Union{Nothing, Int}=nothing,
         temp::Union{Nothing, AbstractArray}=nothing,
         temp_flags::Union{Nothing, AbstractArray}=nothing,
     )
@@ -63,7 +63,7 @@ include("accumulate_nd.jl")
         alg::AccumulateAlgorithm=ScanPrefixes(),
 
         # GPU settings
-        block_size::Int=256,
+        block_size::Union{Nothing, Int}=nothing,
         temp::Union{Nothing, AbstractArray}=nothing,
         temp_flags::Union{Nothing, AbstractArray}=nothing,
     )
@@ -162,7 +162,7 @@ function _accumulate_impl!(
     prefer_threads::Bool=true,
 
     # GPU settings
-    block_size::Int=256,
+    block_size::Union{Nothing, Int}=nothing,
     temp::Union{Nothing, AbstractArray}=nothing,
     temp_flags::Union{Nothing, AbstractArray}=nothing,
 )
@@ -209,7 +209,7 @@ end
         alg::AccumulateAlgorithm=ScanPrefixes(),
 
         # GPU settings
-        block_size::Int=256,
+        block_size::Union{Nothing, Int}=nothing,
         temp::Union{Nothing, AbstractArray}=nothing,
         temp_flags::Union{Nothing, AbstractArray}=nothing,
     )
diff --git a/src/accumulate/accumulate_1d_cpu.jl b/src/accumulate/accumulate_1d_cpu.jl
index 9f45ada..3d7b98c 100644
--- a/src/accumulate/accumulate_1d_cpu.jl
+++ b/src/accumulate/accumulate_1d_cpu.jl
@@ -9,7 +9,7 @@ function accumulate_1d_cpu!(
     min_elems::Int,
 
     # GPU settings - not used
-    block_size::Int,
+    block_size::Union{Nothing, Int},
     temp::Union{Nothing, AbstractArray},
     temp_flags::Union{Nothing, AbstractArray},
 )
diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl
index e1dc0f8..2fb226b 100644
--- a/src/accumulate/accumulate_1d_gpu.jl
+++ b/src/accumulate/accumulate_1d_gpu.jl
@@ -271,7 +271,7 @@ function accumulate_1d_gpu!(
     min_elems::Int,
 
     # GPU settings
-    block_size::Int,
+    block_size::Union{Nothing, Int}=256,
     temp::Union{Nothing, AbstractArray},
     temp_flags::Union{Nothing, AbstractArray},
 )
@@ -326,7 +326,7 @@ function accumulate_1d_gpu!(
     min_elems::Int,
 
     # GPU settings
-    block_size::Int,
+    block_size::Union{Nothing, Int}=256,
     temp::Union{Nothing, AbstractArray},
     temp_flags::Union{Nothing, AbstractArray},
 )
diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl
index a078f12..2cc70e5 100644
--- a/src/accumulate/accumulate_nd.jl
+++ b/src/accumulate/accumulate_nd.jl
@@ -11,7 +11,7 @@ function accumulate_nd!(
     prefer_threads::Bool=true,
 
     # GPU settings
-    block_size::Int,
+    block_size::Union{Nothing, Int},
 )
     # Degenerate cases begin; order of priority matters
 
@@ -35,9 +35,10 @@ function accumulate_nd!(
         _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems)
     else
         # Correctness checks
-        @argcheck block_size > 0
-        @argcheck ispow2(block_size)
-        
+        max_block_size = get_max_block_size(backend, block_size)
+        @argcheck max_block_size > 0
+        @argcheck ispow2(max_block_size)
+
         # On GPUs we have two parallelisation approaches, based on destination dimension and current hardware:
         #   - If the other dimensions have more elements than the product of the device's compute units and
         #     maximum number of threads , we use a single thread per outer dimension - thus, a thread reduces
@@ -50,16 +51,25 @@ function accumulate_nd!(
         serial_threshold = KI.max_work_group_size(backend) * KI.multiprocessor_count(backend)
 
         if length_outer >= serial_threshold
-            # One thread per outer dimension
-            blocks = (length_outer + block_size - 1) ÷ block_size
-            KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_thread!(
-                v, op, init, dims, inclusive, Val(block_size)
+            kernel = KI.@kernel backend launch = false _accumulate_nd_by_thread!(
+                v, op, init, dims, inclusive
+            )
+            workgroupsize = block_size_pow_2(kernel, block_size)
+            numworkgroups = (length_outer + workgroupsize - 1) ÷ workgroupsize
+            kernel(
+                v, op, init, dims, inclusive; workgroupsize, numworkgroups
             )
         else
+            kernel = KI.@kernel backend launch = false _accumulate_nd_by_block!(
+                v, op, init, neutral, dims, inclusive, Val(max_block_size)
+            )
+
+            workgroupsize = block_size_pow_2(kernel, block_size)
+
             # One block per outer dimension
-            blocks = length_outer
-            KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_block!(
-                v, op, init, neutral, dims, inclusive, Val(block_size)
+            numworkgroups = length_outer
+            kernel(
+                v, op, init, neutral, dims, inclusive, Val(max_block_size); workgroupsize, numworkgroups
             )
         end
     end
@@ -120,8 +130,7 @@ end
 
 function _accumulate_nd_by_thread!(
     v, op, init, dims, inclusive,
-    ::Val{block_size}
-) where block_size
+)
     @inbounds begin
     # One thread per outer dimension element, when there are more outer elements than in the
     # reduced dim e.g. accumulate(+, rand(3, 1000), dims=1) => only 3 elements in the accumulated
@@ -142,6 +151,7 @@ function _accumulate_nd_by_thread!(
     # Group (block) and local (thread) indices
     iblock = KI.get_group_id().x - 0x1
     ithread = KI.get_local_id().x - 0x1
+    block_size = KI.get_local_size().x
 
     # Each thread handles one outer element
     tid = ithread + iblock * block_size
diff --git a/src/foreachindex.jl b/src/foreachindex.jl
index b6409b6..2963568 100644
--- a/src/foreachindex.jl
+++ b/src/foreachindex.jl
@@ -1,11 +1,5 @@
-function _forindices_global!(f, indices, ::Val{N}) where N
-
-    # Calculate global index
-    iblock = KI.get_group_id().x
-    ithread = KI.get_local_id().x
-    i = ithread + (iblock - 0x1) * N
-    # i = get_global_id().x
-
+function _forindices_global!(f, indices)
+    i = KI.get_global_id().x
 
     if i <= length(indices)
         f(indices[i])
@@ -19,12 +13,17 @@ function _forindices_gpu(
     indices,
     backend::Backend;
 
-    block_size::Int=256,
+    block_size::Union{Nothing, Int}=nothing,
 )
     # GPU implementation
-    @argcheck block_size > 0
-    blocks = max((length(indices) + block_size - 1) ÷ block_size, 1)
-    KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _forindices_global!(f, indices, Val(block_size))
+    max_block_size = get_max_block_size(backend, block_size)
+    @argcheck max_block_size > 0
+    kernel = KI.@kernel backend launch = false _forindices_global!(f, indices)
+
+    workgroupsize = isnothing(block_size) ? KI.kernel_max_work_group_size(kernel) : block_size
+    numworkgroups = max((length(indices) + workgroupsize - 1) ÷ workgroupsize, 1)
+
+    kernel(f, indices; workgroupsize, numworkgroups)
 end
 
 
@@ -50,7 +49,7 @@ end
         min_elems=1,
 
         # GPU settings
-        block_size=256,
+        block_size::Union{Nothing, Int}=nothing,
     )
 
 Parallelised `for` loop over the indices of an iterable.
@@ -129,7 +128,7 @@ function foreachindex(
     prefer_threads::Bool=true,
 
     # GPU settings
-    block_size=256,
+    block_size::Union{Nothing, Int}=nothing,
 )
     if use_gpu_algorithm(backend, prefer_threads)
         _forindices_gpu(f, eachindex(itr), backend; block_size)
@@ -148,7 +147,7 @@ end
         min_elems=1,
 
         # GPU settings
-        block_size=256,
+        block_size::Union{Nothing, Int}=nothing,
     )
 
 Parallelised `for` loop over the indices along axis `dims` of an iterable.
@@ -223,7 +222,7 @@ function foraxes(
     prefer_threads::Bool=true,
 
     # GPU settings
-    block_size=256,
+    block_size::Union{Nothing, Int}=nothing,
 )
     if isnothing(dims)
         return foreachindex(
diff --git a/src/reduce/mapreduce_1d_cpu.jl b/src/reduce/mapreduce_1d_cpu.jl
index 95a93f2..43233ca 100644
--- a/src/reduce/mapreduce_1d_cpu.jl
+++ b/src/reduce/mapreduce_1d_cpu.jl
@@ -8,7 +8,7 @@ function mapreduce_1d_cpu(
     min_elems::Int,
 
     # GPU settings - ignored here
-    block_size::Int,
+    block_size::Union{Nothing, Int},
     temp::Union{Nothing, AbstractArray},
     switch_below::Int,
 )
diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl
index bb0a225..906676f 100644
--- a/src/reduce/mapreduce_1d_gpu.jl
+++ b/src/reduce/mapreduce_1d_gpu.jl
@@ -1,6 +1,7 @@
 function _mapreduce_block!(src, dst, f, op, neutral, ::Val{N}) where N
     @inbounds begin
     sdata = KI.localmemory(eltype(dst), N)
+    N_actual = KI.get_local_size().x
 
     len = length(src)
 
@@ -13,18 +14,18 @@ function _mapreduce_block!(src, dst, f, op, neutral, ::Val{N}) where N
     iblock = KI.get_group_id().x - 0x1
     ithread = KI.get_local_id().x - 0x1
 
-    i = ithread + iblock * (N * 0x2)
+    i = ithread + iblock * (N_actual * 0x2)
     if i >= len
         sdata[ithread + 0x1] = neutral
-    elseif i + N >= len
+    elseif i + N_actual >= len
         sdata[ithread + 0x1] = f(src[i + 0x1])
     else
-        sdata[ithread + 0x1] = op(f(src[i + 0x1]), f(src[i + N + 0x1]))
+        sdata[ithread + 0x1] = op(f(src[i + 0x1]), f(src[i + N_actual + 0x1]))
     end
 
     KI.barrier()
 
-    @inline reduce_group!(op, sdata, N, ithread)
+    @inline reduce_group!(op, sdata, N_actual, ithread)
 
     # Code below would work on NVidia GPUs with warp size of 32, but create race conditions and
     # return incorrect results on Intel Graphics. It would be useful to have a way to statically
@@ -57,11 +58,13 @@ function mapreduce_1d_gpu(
     min_elems::Int,
 
     # GPU settings
-    block_size::Int,
+    block_size::Union{Nothing, Int},
     temp::Union{Nothing, AbstractArray},
     switch_below::Int,
 )
-    @argcheck 1 <= block_size <= 1024
+    min_block_size = 16
+    max_block_size = min(1024, get_max_block_size(backend, block_size))
+    @argcheck 1 <= max_block_size <= 1024
     @argcheck switch_below >= 0
 
     # Degenerate cases
@@ -74,29 +77,37 @@ function mapreduce_1d_gpu(
     end
 
     # Each thread will handle two elements
-    num_per_block = 2 * block_size
-    blocks = (len + num_per_block - 1) ÷ num_per_block
+    # max_num_per_block = 2 * max_block_size
+    min_num_per_block = 2 * min_block_size
+    max_blocks = (len + min_num_per_block - 1) ÷ min_num_per_block
 
     if !isnothing(temp)
         @argcheck get_backend(temp) === backend
         @argcheck eltype(temp) === typeof(init)
-        @argcheck length(temp) >= blocks * 2
+        @argcheck length(temp) >= max_blocks * 2
         dst = temp
     else
         # Figure out type for destination
         dst_type = typeof(init)
-        dst = KernelAbstractions.allocate(backend, dst_type, blocks * 2)
+        dst = KernelAbstractions.allocate(backend, dst_type, max_blocks * 2)
     end
 
     # Later the kernel will be compiled for views anyways, so use same types
     src_view = @view src[1:end]
-    dst_view = @view dst[1:blocks]
+    dst_view = @view dst[1:max_blocks]
 
-    KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_block!(src_view, dst_view, f, op, neutral, Val(block_size))
+    kernel = KI.@kernel backend launch = false _mapreduce_block!(src_view, dst_view, f, op, neutral, Val(max_block_size))
+
+    workgroupsize = block_size_pow_2(kernel, block_size)
+    numworkgroups = (len + workgroupsize - 1) ÷ workgroupsize
+
+    dst_view = @view dst[1:numworkgroups]
+
+    kernel(src_view, dst_view, f, op, neutral, Val(max_block_size); numworkgroups, workgroupsize)
 
     # As long as we still have blocks to process, swap between the src and dst pointers at
     # the beginning of the first and second halves of dst
-    len = blocks
+    len = numworkgroups
     if len < switch_below
         h_src = Vector(@view(dst[1:len]))
         return Base.reduce(op, h_src; init)
@@ -104,14 +115,18 @@ function mapreduce_1d_gpu(
 
     # Now all src elements have been passed through f; just do final reduction, no map needed
     p1 = @view dst[1:len]
-    p2 = @view dst[blocks + 1:end]
+    p2 = @view dst[numworkgroups + 1:end]
 
     while len > 1
-        blocks = (len + num_per_block - 1) ÷ num_per_block
+        kernel = KI.@kernel backend launch = false _mapreduce_block!(p1, p2, identity, op, neutral, Val(max_block_size))
+
+        workgroupsize = block_size_pow_2(kernel, block_size)
+        numworkgroups = (len + workgroupsize - 1) ÷ workgroupsize
+
+        kernel(p1, p2, identity, op, neutral, Val(max_block_size); numworkgroups, workgroupsize)
 
         # Each block produces one reduced value
-        KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_block!(p1, p2, identity, op, neutral, Val(block_size))
-        len = blocks
+        len = numworkgroups
 
         if len < switch_below
             h_src = Vector(@view(p2[1:len]))
diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl
index da00582..e74eb28 100644
--- a/src/reduce/mapreduce_nd.jl
+++ b/src/reduce/mapreduce_nd.jl
@@ -10,10 +10,12 @@ function mapreduce_nd(
     prefer_threads::Bool=true,
 
     # GPU settings
-    block_size::Int,
+    block_size::Union{Nothing, Int},
     temp::Union{Nothing, AbstractArray},
 )
-    @argcheck 1 <= block_size <= 1024
+
+    max_block_size = min(1024, get_max_block_size(backend, block_size))
+    @argcheck 1 <= max_block_size <= 1024
 
     # Degenerate cases begin; order of priority matters
 
@@ -132,15 +134,25 @@ function mapreduce_nd(
         by_thread_threshold = KI.max_work_group_size(backend) * KI.multiprocessor_count(backend)
 
         if dst_size >= by_thread_threshold
-            blocks = (dst_size + block_size - 1) ÷ block_size
-            KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_thread!(
-                src, dst, f, op, init, dims, Val(block_size)
+            kernel = KI.@kernel backend launch = false _mapreduce_nd_by_thread!(
+                src, dst, f, op, init, dims
+            )
+            workgroupsize = isnothing(block_size) ? KI.kernel_max_work_group_size(kernel) : block_size
+            numworkgroups = (dst_size + workgroupsize - 1) ÷ workgroupsize
+            kernel(
+                src, dst, f, op, init, dims; workgroupsize, numworkgroups
             )
         else
+            kernel = KI.@kernel backend launch = false _mapreduce_nd_by_block!(
+                src, dst, f, op, init, neutral, dims, Val(max_block_size)
+            )
+
+            workgroupsize = isnothing(block_size) ? KI.kernel_max_work_group_size(kernel) : block_size
+
             # One block per output element
-            blocks = dst_size
-            KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_block!(
-                src, dst, f, op, init, neutral, dims, Val(block_size)
+            numworkgroups = dst_size
+            kernel(
+                src, dst, f, op, init, neutral, dims, Val(max_block_size); workgroupsize, numworkgroups
             )
         end
     end
@@ -194,9 +206,8 @@ end
 function _mapreduce_nd_by_thread!(
     src, dst,
     f, op,
-    init, dims,
-    ::Val{N}
-) where N
+    init, dims
+)
     # One thread per output element, when there are more outer elements than in the reduced dim
     # e.g. reduce(+, rand(3, 1000), dims=1) => only 3 elements in the reduced dim
     src_sizes = size(src)
@@ -217,6 +228,7 @@ function _mapreduce_nd_by_thread!(
     # Group (block) and local (thread) indices
     iblock = KI.get_group_id().x - 0x1
     ithread = KI.get_local_id().x - 0x1
+    N = KI.get_local_size().x
 
     # Each thread handles one output element
     tid = ithread + iblock * N
diff --git a/src/reduce/reduce.jl b/src/reduce/reduce.jl
index 230ded1..4b33526 100644
--- a/src/reduce/reduce.jl
+++ b/src/reduce/reduce.jl
@@ -17,7 +17,7 @@ include("mapreduce_nd.jl")
         min_elems::Int=1,
 
         # GPU settings
-        block_size::Int=256,
+        block_size::Union{Nothing, Int}=nothing,
         temp::Union{Nothing, AbstractArray}=nothing,
         switch_below::Int=0,
     )
@@ -100,7 +100,7 @@ end
         min_elems::Int=1,
 
         # GPU settings
-        block_size::Int=256,
+        block_size::Union{Nothing, Int}=nothing,
         temp::Union{Nothing, AbstractArray}=nothing,
         switch_below::Int=0,
     )
@@ -178,7 +178,7 @@ function _mapreduce_impl(
     prefer_threads::Bool=true,
 
     # GPU settings
-    block_size::Int=256,
+    block_size::Union{Nothing, Int}=nothing,
     temp::Union{Nothing, AbstractArray}=nothing,
     switch_below::Int=0,
 )
diff --git a/src/reduce/utilities.jl b/src/reduce/utilities.jl
index ad099a5..035fabc 100644
--- a/src/reduce/utilities.jl
+++ b/src/reduce/utilities.jl
@@ -37,7 +37,7 @@ function _mapreduce_nd_apply_init!(
     init,
     max_tasks=Threads.nthreads(),
     min_elems=1,
-    block_size=256,
+    block_size=nothing,
 )
     foreachindex(dst, backend; max_tasks, min_elems, block_size) do i
         dst[i] = op(init, f(src[i]))
diff --git a/src/utils.jl b/src/utils.jl
index d644120..9c405ce 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -8,6 +8,10 @@ const CPU_BACKEND = get_backend([])
     return backend != CPU_BACKEND || !prefer_threads
 end
 
+@inline get_max_block_size(backend, block_size) = isnothing(block_size) ? KI.max_work_group_size(backend) : block_size
+@inline block_size_pow_2(kernel, block_size) = isnothing(block_size) ? 2^floor(Int, log2(KI.kernel_max_work_group_size(kernel))) : block_size
+
+
 """
     struct TypeWrap{T} end
     TypeWrap(T) = TypeWrap{T}()

From a4610f7354afb3fd60267308ddb438cc9d1f8559 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 19 Nov 2025 20:56:53 -0400
Subject: [PATCH 10/12] Fix test on Metal

---
 test/accumulate.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/accumulate.jl b/test/accumulate.jl
index 5022e38..41baa25 100644
--- a/test/accumulate.jl
+++ b/test/accumulate.jl
@@ -175,7 +175,7 @@ end
             init = rand(-1000:1000)
             s = AK.accumulate(+, v; prefer_threads, init=Float32(init), dims)
             sh = Array(s)
-            @test all(sh .≈ accumulate(+, vh; init=Float32(init), dims))
+            @test sh ≈ accumulate(+, vh; init=Float32(init), dims)
         end
     end
 

From 4ba365c0f7f848896565fe49a6461436166c20e5 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 19 Nov 2025 20:57:26 -0400
Subject: [PATCH 11/12] Fix?

---
 src/reduce/mapreduce_1d_gpu.jl | 8 ++++++--
 src/reduce/mapreduce_nd.jl     | 4 ++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl
index 906676f..411a094 100644
--- a/src/reduce/mapreduce_1d_gpu.jl
+++ b/src/reduce/mapreduce_1d_gpu.jl
@@ -99,7 +99,9 @@ function mapreduce_1d_gpu(
     kernel = KI.@kernel backend launch = false _mapreduce_block!(src_view, dst_view, f, op, neutral, Val(max_block_size))
 
     workgroupsize = block_size_pow_2(kernel, block_size)
-    numworkgroups = (len + workgroupsize - 1) ÷ workgroupsize
+
+    num_per_block = 2 * workgroupsize
+    numworkgroups = (len + num_per_block - 1) ÷ num_per_block
 
     dst_view = @view dst[1:numworkgroups]
 
@@ -121,7 +123,9 @@ function mapreduce_1d_gpu(
         kernel = KI.@kernel backend launch = false _mapreduce_block!(p1, p2, identity, op, neutral, Val(max_block_size))
 
         workgroupsize = block_size_pow_2(kernel, block_size)
-        numworkgroups = (len + workgroupsize - 1) ÷ workgroupsize
+
+        num_per_block = 2 * workgroupsize
+        numworkgroups = (len + num_per_block - 1) ÷ num_per_block
 
         kernel(p1, p2, identity, op, neutral, Val(max_block_size); numworkgroups, workgroupsize)
 
diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl
index e74eb28..f45e4ce 100644
--- a/src/reduce/mapreduce_nd.jl
+++ b/src/reduce/mapreduce_nd.jl
@@ -137,7 +137,7 @@ function mapreduce_nd(
             kernel = KI.@kernel backend launch = false _mapreduce_nd_by_thread!(
                 src, dst, f, op, init, dims
             )
-            workgroupsize = isnothing(block_size) ? KI.kernel_max_work_group_size(kernel) : block_size
+            workgroupsize = block_size_pow_2(kernel, block_size)
             numworkgroups = (dst_size + workgroupsize - 1) ÷ workgroupsize
             kernel(
                 src, dst, f, op, init, dims; workgroupsize, numworkgroups
@@ -147,7 +147,7 @@ function mapreduce_nd(
                 src, dst, f, op, init, neutral, dims, Val(max_block_size)
             )
 
-            workgroupsize = isnothing(block_size) ? KI.kernel_max_work_group_size(kernel) : block_size
+            workgroupsize = block_size_pow_2(kernel, block_size)
 
             # One block per output element
             numworkgroups = dst_size

From 1272f37a0a16fcaf0702c6b09c359bcc404297e5 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Wed, 19 Nov 2025 20:56:41 -0400
Subject: [PATCH 12/12] TEMP

---
 test/reduce.jl   | 272 +++++++++++++++++++++++------------------------
 test/runtests.jl |  22 ++--
 2 files changed, 147 insertions(+), 147 deletions(-)

diff --git a/test/reduce.jl b/test/reduce.jl
index 9fe8b5c..4bcd5ab 100644
--- a/test/reduce.jl
+++ b/test/reduce.jl
@@ -20,7 +20,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     end
 
     # Fuzzy correctness testing
-    for _ in 1:1000
+    for _ in 1:1#0
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Int32, num_elems))
         s = redmin(v)
@@ -28,7 +28,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
         @test s == minimum(vh)
     end
 
-    for _ in 1:1000
+    for _ in 1:1#0
         num_elems = rand(1:100_000)
         v = array_from_host(rand(UInt32, num_elems))
         s = redmin(v)
@@ -36,7 +36,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
         @test s == minimum(vh)
     end
 
-    for _ in 1:1000
+    for _ in 1:1#0
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
         s = redmin(v)
@@ -56,7 +56,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     end
 
     # Fuzzy correctness testing
-    for _ in 1:1000
+    for _ in 1:1#0
         num_elems = rand(1:100_000)
         v = array_from_host(rand(1:100, num_elems), Int32)
         s = redsum(v)
@@ -64,7 +64,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
         @test s == sum(vh)
     end
 
-    for _ in 1:1000
+    for _ in 1:1#0
         num_elems = rand(1:100_000)
         v = array_from_host(rand(1:100, num_elems), UInt32)
         s = redsum(v)
@@ -72,7 +72,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
         @test s == sum(vh)
     end
 
-    for _ in 1:1000
+    for _ in 1:1#0
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
         s = redsum(v)
@@ -81,7 +81,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     end
 
     # Allowing N-dimensional arrays, still reduced as 1D
-    for _ in 1:100
+    for _ in 1:1
         n1 = rand(1:100)
         n2 = rand(1:100)
         n3 = rand(1:100)
@@ -92,7 +92,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     end
 
     # Ensuring that the init value is respected
-    for _ in 1:100
+    for _ in 1:1
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Int32(1):Int32(100), num_elems))
         s = AK.reduce(+, v; prefer_threads, init=Int32(10))
@@ -101,7 +101,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     end
 
     # Testing with switch_below - i.e. finishing on the CPU
-    for _ in 1:100
+    for _ in 1:1
         num_elems = rand(1:100_000)
         v = array_from_host(rand(1:100, num_elems), Int32)
         switch_below = rand(1:100)
@@ -112,7 +112,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0)
     end
 
     # Test with unmaterialised ranges
-    for _ in 1:100
+    for _ in 1:1
         num_elems = rand(1:1000)
         v = 1:num_elems
         s = AK.reduce(+, v, BACKEND; prefer_threads, init=Int32(0))
@@ -168,7 +168,7 @@ end
     end
 
     # Fuzzy correctness testing
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -181,7 +181,7 @@ end
         end
     end
 
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -194,7 +194,7 @@ end
         end
     end
 
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -208,7 +208,7 @@ end
     end
 
     # Ensuring that the init value is respected
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:4
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -255,114 +255,114 @@ end
 end
 
 
-@testset "mapreduce_1d" begin
-    Random.seed!(0)
-
-    function minbox(s)
-        # Extract coordinates into tuple and reduce to find dimensionwise minima
-        AK.mapreduce(
-            p -> (p.x, p.y),
-            (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
-            s;
-            prefer_threads,
-            init=(typemax(Float32), typemax(Float32)),
-            neutral=(typemax(Float32), typemax(Float32)),
-        )
-    end
-
-    function minbox_base(s)
-        # Extract coordinates into tuple and reduce to find dimensionwise minima
-        Base.mapreduce(
-            p -> (p.x, p.y),
-            (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
-            s;
-            init=(typemax(Float32), typemax(Float32)),
-        )
-    end
-
-    # Fuzzy correctness testing
-    for _ in 1:1000
-        num_elems = rand(1:100_000)
-        v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:num_elems])
-        mgpu = minbox(v)
-
-        vh = Array(v)
-        mcpu = minbox(vh)
-        mbase = minbox_base(vh)
-
-        @test typeof(mgpu) === typeof(mcpu) === typeof(mbase)
-        @test mgpu[1] ≈ mcpu[1] ≈ mbase[1]
-        @test mgpu[2] ≈ mcpu[2] ≈ mbase[2]
-    end
-
-    # Allowing N-dimensional arrays, still reduced as 1D
-    for _ in 1:100
-        n1 = rand(1:100)
-        n2 = rand(1:100)
-        n3 = rand(1:100)
-
-        v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:n1, _ in 1:n2, _ in 1:n3])
-        mgpu = minbox(v)
-
-        vh = Array(v)
-        mcpu = minbox(vh)
-        mbase = minbox_base(vh)
-
-        @test typeof(mgpu) === typeof(mcpu) === typeof(mbase)
-        @test mgpu[1] ≈ mcpu[1] ≈ mbase[1]
-        @test mgpu[2] ≈ mcpu[2] ≈ mbase[2]
-    end
-
-    # Ensuring that the init value is respected
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(Int32(1):Int32(100), num_elems))
-        s = AK.mapreduce(abs, +, v; prefer_threads, init=Int32(10))
-        vh = Array(v)
-        @test s == sum(vh) + 10
-    end
-
-    # Testing with switch_below - i.e. finishing on the CPU
-    for _ in 1:100
-        num_elems = rand(1:100_000)
-        v = array_from_host(rand(-100:-1, num_elems), Int32)
-        switch_below = rand(1:100)
-        init = rand(1:100)
-        s = AK.mapreduce(abs, +, v; prefer_threads, switch_below=switch_below, init=Int32(init))
-        vh = Array(v)
-        @test s == mapreduce(abs, +, vh; init)
-    end
-
-    # Test with unmaterialised ranges
-    for _ in 1:100
-        num_elems = rand(1:1000)
-        v = 1:num_elems
-        s = AK.mapreduce(abs, +, v, BACKEND; prefer_threads, init=Int32(0))
-        vh = Array(v)
-        @test s == mapreduce(abs, +, vh)
-    end
-
-    # Testing different settings, enforcing change of type between f and op
-    f(s, temp) = AK.mapreduce(
-        p -> (p.x, p.y),
-        (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
-        s;
-        prefer_threads,
-        init=(typemax(Float32), typemax(Float32)),
-        neutral=(typemax(Float32), typemax(Float32)),
-        block_size=64,
-        temp=temp,
-        switch_below=50,
-        max_tasks=10,
-        min_elems=100,
-    )
-    v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:10_042])
-    temp = similar(v, Tuple{Float32, Float32})
-    f(v, temp)
-
-    # Test that undefined kwargs are not accepted
-    @test_throws MethodError AK.mapreduce(-, +, v; prefer_threads, init=10, bad=:kwarg)
-end
+# @testset "mapreduce_1d" begin
+#     Random.seed!(0)
+
+#     function minbox(s)
+#         # Extract coordinates into tuple and reduce to find dimensionwise minima
+#         AK.mapreduce(
+#             p -> (p.x, p.y),
+#             (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
+#             s;
+#             prefer_threads,
+#             init=(typemax(Float32), typemax(Float32)),
+#             neutral=(typemax(Float32), typemax(Float32)),
+#         )
+#     end
+
+#     function minbox_base(s)
+#         # Extract coordinates into tuple and reduce to find dimensionwise minima
+#         Base.mapreduce(
+#             p -> (p.x, p.y),
+#             (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
+#             s;
+#             init=(typemax(Float32), typemax(Float32)),
+#         )
+#     end
+
+#     # Fuzzy correctness testing
+#     for _ in 1:1#0
+#         num_elems = rand(1:100_000)
+#         v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:num_elems])
+#         mgpu = minbox(v)
+
+#         vh = Array(v)
+#         mcpu = minbox(vh)
+#         mbase = minbox_base(vh)
+
+#         @test typeof(mgpu) === typeof(mcpu) === typeof(mbase)
+#         @test mgpu[1] ≈ mcpu[1] ≈ mbase[1]
+#         @test mgpu[2] ≈ mcpu[2] ≈ mbase[2]
+#     end
+
+#     # Allowing N-dimensional arrays, still reduced as 1D
+#     for _ in 1:1
+#         n1 = rand(1:100)
+#         n2 = rand(1:100)
+#         n3 = rand(1:100)
+
+#         v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:n1, _ in 1:n2, _ in 1:n3])
+#         mgpu = minbox(v)
+
+#         vh = Array(v)
+#         mcpu = minbox(vh)
+#         mbase = minbox_base(vh)
+
+#         @test typeof(mgpu) === typeof(mcpu) === typeof(mbase)
+#         @test mgpu[1] ≈ mcpu[1] ≈ mbase[1]
+#         @test mgpu[2] ≈ mcpu[2] ≈ mbase[2]
+#     end
+
+#     # Ensuring that the init value is respected
+#     for _ in 1:1
+#         num_elems = rand(1:100_000)
+#         v = array_from_host(rand(Int32(1):Int32(100), num_elems))
+#         s = AK.mapreduce(abs, +, v; prefer_threads, init=Int32(10))
+#         vh = Array(v)
+#         @test s == sum(vh) + 10
+#     end
+
+#     # Testing with switch_below - i.e. finishing on the CPU
+#     for _ in 1:1
+#         num_elems = rand(1:100_000)
+#         v = array_from_host(rand(-100:-1, num_elems), Int32)
+#         switch_below = rand(1:100)
+#         init = rand(1:100)
+#         s = AK.mapreduce(identity, +, v; prefer_threads, switch_below=switch_below, init=Int32(init), block_size=256)
+#         vh = Array(v)
+#         @test s == mapreduce(identity, +, vh; init)
+#     end
+
+#     # Test with unmaterialised ranges
+#     for _ in 1:1
+#         num_elems = rand(1:1000)
+#         v = 1:num_elems
+#         s = AK.mapreduce(abs, +, v, BACKEND; prefer_threads, init=Int32(0))
+#         vh = Array(v)
+#         @test s == mapreduce(abs, +, vh)
+#     end
+
+#     # Testing different settings, enforcing change of type between f and op
+#     f(s, temp) = AK.mapreduce(
+#         p -> (p.x, p.y),
+#         (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]),
+#         s;
+#         prefer_threads,
+#         init=(typemax(Float32), typemax(Float32)),
+#         neutral=(typemax(Float32), typemax(Float32)),
+#         block_size=64,
+#         temp=temp,
+#         switch_below=50,
+#         max_tasks=10,
+#         min_elems=100,
+#     )
+#     v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:10_042])
+#     temp = similar(v, Tuple{Float32, Float32})
+#     f(v, temp)
+
+#     # Test that undefined kwargs are not accepted
+#     @test_throws MethodError AK.mapreduce(-, +, v; prefer_threads, init=10, bad=:kwarg)
+# end
 
 
 @testset "mapreduce_nd" begin
@@ -385,7 +385,7 @@ end
     end
 
     # Fuzzy correctness testing
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -423,7 +423,7 @@ end
     end
 
     # Fuzzy correctness testing
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -444,7 +444,7 @@ end
     end
 
     # Ensuring that the init value is respected
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:4
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -500,13 +500,13 @@ end
     @test AK.sum(v; prefer_threads) == sum(Array(v))
 
     # Fuzzy testing
-    for _ in 1:100
+    for _ in 1:1
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
         @test AK.sum(v; prefer_threads) ≈ sum(Array(v))
     end
 
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -545,13 +545,13 @@ end
     @test AK.prod(v; prefer_threads) == prod(Array(v))
 
     # Fuzzy testing
-    for _ in 1:100
+    for _ in 1:1
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
         @test AK.prod(v; prefer_threads) ≈ prod(Array(v))
     end
 
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:10)
             n2 = rand(1:10)
@@ -590,13 +590,13 @@ end
     @test AK.minimum(v; prefer_threads) == minimum(Array(v))
 
     # Fuzzy testing
-    for _ in 1:100
+    for _ in 1:1
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
         @test AK.minimum(v; prefer_threads) == minimum(Array(v))
     end
 
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -635,13 +635,13 @@ end
     @test AK.maximum(v; prefer_threads) == maximum(Array(v))
 
     # Fuzzy testing
-    for _ in 1:100
+    for _ in 1:1
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
         @test AK.maximum(v; prefer_threads) == maximum(Array(v))
     end
 
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -680,13 +680,13 @@ end
     @test AK.count(x->x>50, v; prefer_threads) == count(x->x>50, Array(v))
 
     # Fuzzy testing
-    for _ in 1:100
+    for _ in 1:1
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Float32, num_elems))
         @test AK.count(x->x>0.5, v; prefer_threads) == count(x->x>0.5, Array(v))
     end
 
-    for _ in 1:100
+    for _ in 1:1
         for dims in 1:3
             n1 = rand(1:100)
             n2 = rand(1:100)
@@ -706,7 +706,7 @@ end
     end
 
     # Counting booleans directly
-    for _ in 1:100
+    for _ in 1:1
         num_elems = rand(1:100_000)
         v = array_from_host(rand(Bool, num_elems))
         @test AK.count(v; prefer_threads) == count(Array(v))
diff --git a/test/runtests.jl b/test/runtests.jl
index 716fd8e..1544bb3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -31,7 +31,7 @@ elseif "--AMDGPU" in ARGS
     const BACKEND = ROCBackend()
     TEST_DL[] = true
 elseif "--Metal" in ARGS
-    Pkg.add("Metal")
+    # Pkg.add("Metal")
     using Metal
     Metal.versioninfo()
     const BACKEND = MetalBackend()
@@ -61,16 +61,16 @@ function array_from_host(backend, h_arr::AbstractArray, dtype=nothing)
     d_arr
 end
 
-@testset "Aqua" begin
-    using Aqua
-    Aqua.test_all(AK)
-end
+# @testset "Aqua" begin
+#     using Aqua
+#     Aqua.test_all(AK)
+# end
 
-include("partition.jl")
-include("looping.jl")
-include("map.jl")
-include("sort.jl")
+# include("partition.jl")
+# include("looping.jl")
+# include("map.jl")
+# include("sort.jl")
 include("reduce.jl")
 include("accumulate.jl")
-include("predicates.jl")
-include("binarysearch.jl")
+# include("predicates.jl")
+# include("binarysearch.jl")