From 857669f2f8dbd3508399b72858886aa8928dfb2e Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 7 Nov 2025 14:50:33 -0400 Subject: [PATCH 01/12] KA 0.10 --- Project.toml | 4 +- src/AcceleratedKernels.jl | 1 - src/accumulate/accumulate_1d_gpu.jl | 77 ++++++++++++++--------------- src/accumulate/accumulate_nd.jl | 58 +++++++++++----------- src/foreachindex.jl | 15 +++--- src/predicates.jl | 21 +++++--- src/reduce/mapreduce_1d_gpu.jl | 22 ++++----- src/reduce/mapreduce_nd.jl | 46 ++++++++--------- src/reduce/utilities.jl | 20 ++++---- src/sort/merge_sort.jl | 34 ++++++------- src/sort/merge_sort_by_key.jl | 39 ++++++++------- src/sort/merge_sortperm.jl | 6 +-- 12 files changed, 171 insertions(+), 172 deletions(-) diff --git a/Project.toml b/Project.toml index 2fccea8..e56f795 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AcceleratedKernels" uuid = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" authors = ["Andrei-Leonard Nicusan and contributors"] -version = "0.4.3" +version = "0.5" [deps] ArgCheck = "dce04be8-c92d-5529-be00-80e4d2c0e197" @@ -19,7 +19,7 @@ AcceleratedKernelsoneAPIExt = "oneAPI" [compat] ArgCheck = "2" GPUArraysCore = "0.2.0" -KernelAbstractions = "0.9.34" +KernelAbstractions = "0.10" Markdown = "1" UnsafeAtomics = "0.3.0" julia = "1.10" diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl index d662c2a..d4655de 100644 --- a/src/AcceleratedKernels.jl +++ b/src/AcceleratedKernels.jl @@ -14,7 +14,6 @@ module AcceleratedKernels using ArgCheck: @argcheck using GPUArraysCore: AnyGPUArray, @allowscalar using KernelAbstractions -using KernelAbstractions: @context import UnsafeAtomics diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl index 9202692..e1dc0f8 100644 --- a/src/accumulate/accumulate_1d_gpu.jl +++ b/src/accumulate/accumulate_1d_gpu.jl @@ -12,16 +12,17 @@ const ACC_FLAG_P::UInt8 = 1 # Only current block's prefix available end 
-@kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_block!( +function _accumulate_block!( op, v, init, neutral, inclusive, flags, prefixes, # one per block -) + ::Val{block_size} +) where block_size + @inbounds begin # NOTE: shmem_size MUST be greater than 2 * block_size # NOTE: block_size MUST be a power of 2 len = length(v) - @uniform block_size = @groupsize()[1] - temp = @localmem eltype(v) (0x2 * block_size + conflict_free_offset(0x2 * block_size),) + temp = KI.localmemory(eltype(v), 0x2 * block_size + conflict_free_offset(0x2 * block_size)) # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero @@ -29,10 +30,10 @@ end # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. # Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 - num_blocks = @ndrange()[1] ÷ block_size + num_blocks = KI.get_num_groups().x block_offset = iblock * block_size * 0x2 # Processing two elements per thread # Copy two elements from the main array; offset indices to avoid bank conflicts @@ -59,7 +60,7 @@ end next_pow2 = block_size * 0x2 d = next_pow2 >> 0x1 while d > 0x0 # TODO: unroll this like in reduce.jl ? - @synchronize() + KI.barrier() if ithread < d _ai = offset * (0x2 * ithread + 0x1) - 0x1 @@ -84,7 +85,7 @@ end d = typeof(ithread)(1) while d < next_pow2 offset = offset >> 0x1 - @synchronize() + KI.barrier() if ithread < d _ai = offset * (0x2 * ithread + 0x1) - 0x1 @@ -103,10 +104,10 @@ end # Later blocks should always be inclusively-scanned if inclusive || iblock != 0x0 # To compute an inclusive scan, shift elements left... 
- @synchronize() + KI.barrier() t1 = temp[ai + bank_offset_a + 0x1] t2 = temp[bi + bank_offset_b + 0x1] - @synchronize() + KI.barrier() if ai > 0x0 temp[ai - 0x1 + conflict_free_offset(ai - 0x1) + 0x1] = t1 @@ -123,7 +124,7 @@ end end end - @synchronize() + KI.barrier() # Write this block's final prefix to global array and set flag to "block prefix computed" if bi == 0x2 * block_size - 0x1 @@ -145,15 +146,16 @@ end if block_offset + bi < len v[block_offset + bi + 0x1] = temp[bi + bank_offset_b + 0x1] end + end + nothing end -@kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous!( - op, v, flags, @Const(prefixes), -) - +function _accumulate_previous!( + op, v, flags, prefixes, ::Val{block_size} +) where block_size + @inbounds begin len = length(v) - block_size = @groupsize()[1] # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero @@ -161,8 +163,8 @@ end # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. # Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 + 0x1 # Skipping first block - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + 0x1 # Skipping first block + ithread = KI.get_local_id().x - 0x1 block_offset = iblock * block_size * 0x2 # Processing two elements per thread # Each block looks back to find running prefix sum @@ -197,7 +199,7 @@ end # There are two synchronization concerns here: # 1. Withing a group we want to ensure that all writed to `v` have occured before setting the flag. # 2. Between groups we need to use a fence and atomic load/store to ensure that memory operations are not re-ordered - @synchronize() # within-block + KI.barrier() # within-block # Note: This fence is needed to ensure that the flag is not set before copying into v. 
# See https://doc.rust-lang.org/std/sync/atomic/fn.fence.html # for more details. @@ -206,15 +208,17 @@ end if ithread == 0x0 UnsafeAtomics.store!(pointer(flags, iblock + 0x1), convert(eltype(flags), ACC_FLAG_A), UnsafeAtomics.monotonic) end + end + nothing end -@kernel cpu=false inbounds=true unsafe_indices=true function _accumulate_previous_coupled_preblocks!( - op, v, prefixes, -) +function _accumulate_previous_coupled_preblocks!( + op, v, prefixes, ::Val{block_size} +) where block_size + @inbounds begin # No decoupled lookback len = length(v) - block_size = @groupsize()[1] # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero @@ -222,8 +226,8 @@ end # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. # Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 + 0x1 # Skipping first block - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + 0x1 # Skipping first block + ithread = KI.get_local_id().x - 0x1 block_offset = iblock * block_size * 0x2 # Processing two elements per thread # Each block looks back to find running prefix sum @@ -250,6 +254,8 @@ end if block_offset + bi < len v[block_offset + bi + 0x1] = op(running_prefix, v[block_offset + bi + 0x1]) end + end + nothing end @@ -298,14 +304,10 @@ function accumulate_1d_gpu!( flags = temp_flags end - kernel1! = _accumulate_block!(backend, block_size) - kernel1!(op, v, init, neutral, inclusive, flags, prefixes, - ndrange=num_blocks * block_size) + KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks _accumulate_block!(op, v, init, neutral, inclusive, flags, prefixes, Val(block_size)) if num_blocks > 1 - kernel2! 
= _accumulate_previous!(backend, block_size) - kernel2!(op, v, flags, prefixes, - ndrange=(num_blocks - 1) * block_size) + KI.@kernel backend workgroupsize=block_size numworkgroups=(num_blocks-1) _accumulate_previous!(op, v, flags, prefixes, Val(block_size)) end return v @@ -349,22 +351,17 @@ function accumulate_1d_gpu!( prefixes = temp end - kernel1! = _accumulate_block!(backend, block_size) - kernel1!(op, v, init, neutral, inclusive, nothing, prefixes, - ndrange=num_blocks * block_size) + KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks _accumulate_block!(op, v, init, neutral, inclusive, nothing, prefixes, Val(block_size)) if num_blocks > 1 # Accumulate prefixes of all blocks; use neutral as init here to not reinclude init num_blocks_prefixes = (length(prefixes) + elems_per_block - 1) ÷ elems_per_block - kernel1!(op, prefixes, neutral, neutral, true, nothing, nothing, - ndrange=num_blocks_prefixes * block_size) + KI.@kernel backend workgroupsize=block_size numworkgroups=num_blocks_prefixes _accumulate_block!(op, prefixes, neutral, neutral, true, nothing, nothing, Val(block_size)) # Prefixes are pre-accumulated (completely accumulated if num_blocks_prefixes == 1, or # partially, which we will account for in the coupled lookback) - kernel2! = _accumulate_previous_coupled_preblocks!(backend, block_size) - kernel2!(op, v, prefixes, - ndrange=(num_blocks - 1) * block_size) + KI.@kernel backend workgroupsize=block_size numworkgroups=(num_blocks-1) _accumulate_previous_coupled_preblocks!(op, v, prefixes, Val(block_size)) end return v diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl index 83b210d..d045948 100644 --- a/src/accumulate/accumulate_nd.jl +++ b/src/accumulate/accumulate_nd.jl @@ -51,18 +51,14 @@ function accumulate_nd!( if length_outer >= length_dims # One thread per outer dimension blocks = (length_outer + block_size - 1) ÷ block_size - kernel1! 
= _accumulate_nd_by_thread!(backend, block_size) - kernel1!( - v, op, init, dims, inclusive, - ndrange=(block_size * blocks,), + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_thread!( + v, op, init, dims, inclusive, Val(block_size) ) else # One block per outer dimension blocks = length_outer - kernel2! = _accumulate_nd_by_block!(backend, block_size) - kernel2!( - v, op, init, neutral, dims, inclusive, - ndrange=(block_size, blocks), + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_block!( + v, op, init, neutral, dims, inclusive, Val(block_size) ) end end @@ -121,9 +117,11 @@ function _accumulate_nd_cpu_sections!( end -@kernel inbounds=true cpu=false unsafe_indices=true function _accumulate_nd_by_thread!( +function _accumulate_nd_by_thread!( v, op, init, dims, inclusive, -) + ::Val{block_size} +) where block_size + @inbounds begin # One thread per outer dimension element, when there are more outer elements than in the # reduced dim e.g. accumulate(+, rand(3, 1000), dims=1) => only 3 elements in the accumulated # dim @@ -135,16 +133,14 @@ end length_dims = vsizes[dims] length_outer = length(v) ÷ length_dims - block_size = @groupsize()[1] - # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero # indexing). Internal calculations will be done using zero indexing except when actually # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. 
# Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 # Each thread handles one outer element tid = ithread + iblock * block_size @@ -178,12 +174,16 @@ end end end end + end + nothing end -@kernel inbounds=true cpu=false unsafe_indices=true function _accumulate_nd_by_block!( +function _accumulate_nd_by_block!( v, op, init, neutral, dims, inclusive, -) + ::Val{block_size} +) where block_size + @inbounds begin # NOTE: shmem_size MUST be greater than 2 * block_size # NOTE: block_size MUST be a power of 2 @@ -198,10 +198,8 @@ end length_dims = vsizes[dims] length_outer = length(v) ÷ length_dims - @uniform block_size = @groupsize()[1] - - temp = @localmem eltype(v) (0x2 * block_size + conflict_free_offset(0x2 * block_size),) - running_prefix = @localmem eltype(v) (1,) + temp = KI.localmemory(eltype(v), 0x2 * block_size + conflict_free_offset(0x2 * block_size)) + running_prefix = KI.localmemory(eltype(v), 1) # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero @@ -209,8 +207,8 @@ end # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. 
# Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 # Each block handles one outer element; guaranteed to have exact number of blocks, so no need # for `if iblock < length_outer` @@ -234,7 +232,7 @@ end if ithread == 0x0 running_prefix[0x1] = neutral end - @synchronize() + KI.barrier() while ichunk < num_chunks block_offset = ichunk * block_size * 0x2 # Processing two elements per thread @@ -271,7 +269,7 @@ end next_pow2 = block_size * 0x2 d = next_pow2 >> 0x1 while d > 0x0 # TODO: unroll this like in reduce.jl ? - @synchronize() + KI.barrier() if ithread < d _ai = offset * (0x2 * ithread + 0x1) - 0x1 @@ -296,7 +294,7 @@ end d = typeof(ithread)(1) while d < next_pow2 offset = offset >> 0x1 - @synchronize() + KI.barrier() if ithread < d _ai = offset * (0x2 * ithread + 0x1) - 0x1 @@ -315,10 +313,10 @@ end # Later blocks should always be inclusively-scanned if inclusive || ichunk != 0x0 # To compute an inclusive scan, shift elements left... 
- @synchronize() + KI.barrier() t1 = temp[ai + bank_offset_a + 0x1] t2 = temp[bi + bank_offset_b + 0x1] - @synchronize() + KI.barrier() if ai > 0x0 temp[ai - 0x1 + conflict_free_offset(ai - 0x1) + 0x1] = t1 @@ -344,7 +342,7 @@ end end _running_prefix = running_prefix[0x1] - @synchronize() + KI.barrier() if block_offset + ai < length_dims total = op(_running_prefix, temp[ai + bank_offset_a + 0x1]) @@ -367,8 +365,10 @@ end if bi == 0x2 * block_size - 0x1 running_prefix[0x1] = total end - @synchronize() + KI.barrier() ichunk += 0x1 end + end + nothing end diff --git a/src/foreachindex.jl b/src/foreachindex.jl index 1f8aa47..b6409b6 100644 --- a/src/foreachindex.jl +++ b/src/foreachindex.jl @@ -1,14 +1,16 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function _forindices_global!(f, indices) +function _forindices_global!(f, indices, ::Val{N}) where N # Calculate global index - N = @groupsize()[1] - iblock = @index(Group, Linear) - ithread = @index(Local, Linear) + iblock = KI.get_group_id().x + ithread = KI.get_local_id().x i = ithread + (iblock - 0x1) * N + # i = get_global_id().x + if i <= length(indices) f(indices[i]) end + nothing end @@ -21,9 +23,8 @@ function _forindices_gpu( ) # GPU implementation @argcheck block_size > 0 - blocks = (length(indices) + block_size - 1) ÷ block_size - _forindices_global!(backend, block_size)(f, indices, ndrange=(block_size * blocks,)) - nothing + blocks = max((length(indices) + block_size - 1) ÷ block_size, 1) + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _forindices_global!(f, indices, Val(block_size)) end diff --git a/src/predicates.jl b/src/predicates.jl index cfd63f8..e8576b4 100644 --- a/src/predicates.jl +++ b/src/predicates.jl @@ -6,26 +6,27 @@ Base.@kwdef struct MapReduce{T <: Union{Nothing, AbstractArray}} <: PredicatesAl end -@kernel cpu=false inbounds=true function _any_global!(out, pred, @Const(v)) - temp = @localmem Int8 (1,) - i = @index(Global, Linear) +function _any_global!(out, pred, 
v) + temp = KI.localmemory(Int8, 1) + i = KI.get_global_id().x # Technically this is a race, but it doesn't matter as all threads would write the same value. # For example, CUDA F4.2 says "If a non-atomic instruction executed by a warp writes to the # same location in global memory for more than one of the threads of the warp, only one thread # performs a write and which thread does it is undefined." temp[0x1] = 0x0 - @synchronize() + KI.barrier() - # The ndrange check already protects us from out of bounds access - if pred(v[i]) + # The last workgroup may be partial: threads past the end of `v` must not read it + if i <= length(v) && pred(v[i]) temp[0x1] = 0x1 end - @synchronize() + KI.barrier() if temp[0x1] != 0x0 out[0x1] = 0x1 end + nothing end @@ -127,7 +128,9 @@ function _any_impl( # CUDA). If not cooperative, we need to do a mapreduce if alg === ConcurrentWrite() out = KernelAbstractions.zeros(backend, Int8, 1) - _any_global!(backend, block_size)(out, pred, v, ndrange=length(v)) + workgroupsize = max(min(length(v), block_size), 1) + numworkgroups = max(cld(length(v), workgroupsize), 1) + KI.@kernel backend numworkgroups workgroupsize _any_global!(out, pred, v) outh = @allowscalar(out[1]) return outh == 0 ? false : true else @@ -261,7 +264,9 @@ function _all_impl( # CUDA). If not cooperative, we need to do a mapreduce if alg === ConcurrentWrite() out = KernelAbstractions.zeros(backend, Int8, 1) - _any_global!(backend, block_size)(out, (!pred), v, ndrange=length(v)) + workgroupsize = max(min(length(v), block_size), 1) + numworkgroups = max(cld(length(v), workgroupsize), 1) + KI.@kernel backend numworkgroups workgroupsize _any_global!(out, (!pred), v) outh = @allowscalar(out[1]) return outh == 0 ? 
true : false else diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl index 39e7c41..bb0a225 100644 --- a/src/reduce/mapreduce_1d_gpu.jl +++ b/src/reduce/mapreduce_1d_gpu.jl @@ -1,7 +1,6 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_block!(@Const(src), dst, f, op, neutral) - - @uniform N = @groupsize()[1] - sdata = @localmem eltype(dst) (N,) +function _mapreduce_block!(src, dst, f, op, neutral, ::Val{N}) where N + @inbounds begin + sdata = KI.localmemory(eltype(dst), N) len = length(src) @@ -11,8 +10,8 @@ # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. # Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 i = ithread + iblock * (N * 0x2) if i >= len @@ -23,9 +22,9 @@ sdata[ithread + 0x1] = op(f(src[i + 0x1]), f(src[i + N + 0x1])) end - @synchronize() + KI.barrier() - @inline reduce_group!(@context, op, sdata, N, ithread) + @inline reduce_group!(op, sdata, N, ithread) # Code below would work on NVidia GPUs with warp size of 32, but create race conditions and # return incorrect results on Intel Graphics. It would be useful to have a way to statically @@ -43,6 +42,8 @@ if ithread == 0x0 dst[iblock + 0x1] = sdata[0x1] end + end + nothing end @@ -91,8 +92,7 @@ function mapreduce_1d_gpu( src_view = @view src[1:end] dst_view = @view dst[1:blocks] - kernel! 
= _mapreduce_block!(backend, block_size) - kernel!(src_view, dst_view, f, op, neutral, ndrange=(block_size * blocks,)) + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_block!(src_view, dst_view, f, op, neutral, Val(block_size)) # As long as we still have blocks to process, swap between the src and dst pointers at # the beginning of the first and second halves of dst @@ -110,7 +110,7 @@ function mapreduce_1d_gpu( blocks = (len + num_per_block - 1) ÷ num_per_block # Each block produces one reduced value - kernel!(p1, p2, identity, op, neutral, ndrange=(block_size * blocks,)) + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_block!(p1, p2, identity, op, neutral, Val(block_size)) len = blocks if len < switch_below diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl index 231d0dc..71fad0d 100644 --- a/src/reduce/mapreduce_nd.jl +++ b/src/reduce/mapreduce_nd.jl @@ -131,18 +131,14 @@ function mapreduce_nd( # while the other dimensions are processed in parallel, independently if dst_size >= src_sizes[dims] blocks = (dst_size + block_size - 1) ÷ block_size - kernel1! = _mapreduce_nd_by_thread!(backend, block_size) - kernel1!( - src, dst, f, op, init, dims, - ndrange=(block_size * blocks,), + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_thread!( + src, dst, f, op, init, dims, Val(block_size) ) else # One block per output element blocks = dst_size - kernel2! 
= _mapreduce_nd_by_block!(backend, block_size) - kernel2!( - src, dst, f, op, init, neutral, dims, - ndrange=(block_size * blocks,), + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_block!( + src, dst, f, op, init, neutral, dims, Val(block_size) ) end end @@ -193,11 +189,12 @@ function _mapreduce_nd_cpu_sections!( end -@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_thread!( - @Const(src), dst, +function _mapreduce_nd_by_thread!( + src, dst, f, op, init, dims, -) + ::Val{N} +) where N # One thread per output element, when there are more outer elements than in the reduced dim # e.g. reduce(+, rand(3, 1000), dims=1) => only 3 elements in the reduced dim src_sizes = size(src) @@ -210,16 +207,14 @@ end ndims = length(src_sizes) - N = @groupsize()[1] - # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero # indexing). Internal calculations will be done using zero indexing except when actually # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. # Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 # Each thread handles one output element tid = ithread + iblock * N @@ -259,15 +254,16 @@ end end dst[tid + 0x1] = res end + nothing end -@kernel inbounds=true cpu=false unsafe_indices=true function _mapreduce_nd_by_block!( - @Const(src), dst, +function _mapreduce_nd_by_block!( + src, dst, f, op, init, neutral, - dims, -) + dims, ::Val{N} +) where N # One block per output element, when there are more elements in the reduced dim than in outer # e.g. 
reduce(+, rand(3, 1000), dims=2) => only 3 elements in outer dimensions src_sizes = size(src) @@ -280,8 +276,7 @@ end ndims = length(src_sizes) - @uniform N = @groupsize()[1] - sdata = @localmem eltype(dst) (N,) + sdata = KI.localmemory(eltype(dst), N) # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero @@ -289,8 +284,8 @@ end # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. # Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 # Each block handles one output element - thus, iblock ∈ [0, output_size) @@ -330,11 +325,12 @@ end # Store partial result in shared memory; now we are down to a single block to reduce within sdata[ithread + 0x1] = partial - @synchronize() + KI.barrier() - @inline reduce_group!(@context, op, sdata, N, ithread) + @inline reduce_group!(op, sdata, N, ithread) if ithread == 0x0 dst[iblock + 0x1] = op(init, sdata[0x1]) end + nothing end diff --git a/src/reduce/utilities.jl b/src/reduce/utilities.jl index 48f387e..ad099a5 100644 --- a/src/reduce/utilities.jl +++ b/src/reduce/utilities.jl @@ -44,59 +44,59 @@ function _mapreduce_nd_apply_init!( end end -@inline function reduce_group!(@context, op, sdata, N, ithread) +@inline function reduce_group!(op, sdata, N, ithread) if N >= 512u16 if ithread < 256u16 sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 256u16 + 0x1]) end - @synchronize() + KI.barrier() end if N >= 256u16 if ithread < 128u16 sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 128u16 + 0x1]) end - @synchronize() + KI.barrier() end if N >= 128u16 if ithread < 64u16 sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 64u16 + 0x1]) end - @synchronize() + KI.barrier() end if N >= 64u16 
if ithread < 32u16 sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 32u16 + 0x1]) end - @synchronize() + KI.barrier() end if N >= 32u16 if ithread < 16u16 sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 16u16 + 0x1]) end - @synchronize() + KI.barrier() end if N >= 16u16 if ithread < 8u16 sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 8u16 + 0x1]) end - @synchronize() + KI.barrier() end if N >= 8u16 if ithread < 4u16 sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 4u16 + 0x1]) end - @synchronize() + KI.barrier() end if N >= 4u16 if ithread < 2u16 sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 2u16 + 0x1]) end - @synchronize() + KI.barrier() end if N >= 2u16 if ithread < 1u16 sdata[ithread + 0x1] = op(sdata[ithread + 0x1], sdata[ithread + 1u16 + 0x1]) end - @synchronize() + KI.barrier() end end diff --git a/src/sort/merge_sort.jl b/src/sort/merge_sort.jl index 5fb7b20..f348542 100644 --- a/src/sort/merge_sort.jl +++ b/src/sort/merge_sort.jl @@ -1,7 +1,6 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function _merge_sort_block!(vec, comp) +function _merge_sort_block!(vec, comp, ::Val{N}) where N - @uniform N = @groupsize()[1] - s_buf = @localmem eltype(vec) (N * 0x2,) + s_buf = KI.localmemory(eltype(vec), N * 0x2) T = eltype(vec) I = typeof(N) @@ -13,8 +12,8 @@ # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. 
# Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 i = ithread + iblock * N * 0x2 i < len && (s_buf[ithread + 0x1] = vec[i + 0x1]) @@ -22,7 +21,7 @@ i = ithread + N + iblock * N * 0x2 i < len && (s_buf[ithread + N + 0x1] = vec[i + 0x1]) - @synchronize() + KI.barrier() half_size_group = typeof(ithread)(1) size_group = typeof(ithread)(2) @@ -56,12 +55,12 @@ pos2 = ithread % half_size_group + _upper_bound_s0(s_buf, v2, lo, hi, comp) - lo end - @synchronize() + KI.barrier() pos1 != typemax(I) && (s_buf[gid * size_group + pos1 + 0x1] = v1) pos2 != typemax(I) && (s_buf[gid * size_group + pos2 + 0x1] = v2) - @synchronize() + KI.barrier() half_size_group = half_size_group << 0x1 size_group = size_group << 0x1 @@ -72,14 +71,13 @@ i = ithread + N + iblock * N * 0x2 i < len && (vec[i + 0x1] = s_buf[ithread + N + 0x1]) + nothing end -@kernel inbounds=true cpu=false unsafe_indices=true function _merge_sort_global!( - @Const(vec_in), vec_out, comp, half_size_group, -) +function _merge_sort_global!(vec_in, vec_out, comp, half_size_group, ::Val{N}) where N + @inbounds begin len = length(vec_in) - N = @groupsize()[1] # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero @@ -87,8 +85,8 @@ end # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. 
# Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 idx = ithread + iblock * N size_group = half_size_group * 0x2 @@ -120,6 +118,8 @@ end vec_out[pos_out + 0x1] = vec_in[pos_in + 0x1] end end + end + nothing end @@ -160,7 +160,7 @@ function merge_sort!( # Block level blocks = (length(v) + block_size * 2 - 1) ÷ (block_size * 2) - _merge_sort_block!(backend, block_size)(v, comp, ndrange=(block_size * blocks,)) + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _merge_sort_block!(v, comp, Val(block_size)) # Global level half_size_group = Int32(block_size * 2) @@ -170,12 +170,12 @@ function merge_sort!( p1 = v p2 = isnothing(temp) ? similar(v) : temp - kernel! = _merge_sort_global!(backend, block_size) + kernel! = KI.@kernel backend launch = false _merge_sort_global!(p1, p2, comp, half_size_group, Val(block_size)) niter = 0 while len > half_size_group blocks = ((len + half_size_group - 1) ÷ half_size_group + 1) ÷ 2 * (half_size_group ÷ block_size) - kernel!(p1, p2, comp, half_size_group, ndrange=(block_size * blocks,)) + kernel!(p1, p2, comp, half_size_group, Val(block_size); workgroupsize=block_size, numworkgroups = blocks) half_size_group = half_size_group << 1; size_group = size_group << 1; diff --git a/src/sort/merge_sort_by_key.jl b/src/sort/merge_sort_by_key.jl index f6de5f3..418e30f 100644 --- a/src/sort/merge_sort_by_key.jl +++ b/src/sort/merge_sort_by_key.jl @@ -1,8 +1,7 @@ -@kernel inbounds=true cpu=false unsafe_indices=true function _merge_sort_by_key_block!(keys, values, comp) +function _merge_sort_by_key_block!(keys, values, comp, ::Val{N}) where N - @uniform N = @groupsize()[1] - s_keys = @localmem eltype(keys) (N * 0x2,) - s_values = @localmem eltype(values) (N * 0x2,) + s_keys = KI.localmemory(eltype(keys), N * 0x2) + s_values = KI.localmemory(eltype(values), N * 0x2) I = typeof(N) len = 
length(keys) @@ -13,8 +12,8 @@ # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. # Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 i = ithread + iblock * N * 0x2 if i < len @@ -28,7 +27,7 @@ s_values[ithread + N + 0x1] = values[i + 0x1] end - @synchronize() + KI.barrier() half_size_group = typeof(ithread)(1) size_group = typeof(ithread)(2) @@ -66,7 +65,7 @@ pos2 = ithread % half_size_group + _upper_bound_s0(s_keys, k2, lo, hi, comp) - lo end - @synchronize() + KI.barrier() if pos1 != typemax(I) s_keys[gid * size_group + pos1 + 0x1] = k1 @@ -77,7 +76,7 @@ s_values[gid * size_group + pos2 + 0x1] = v2 end - @synchronize() + KI.barrier() half_size_group = half_size_group << 0x1 size_group = size_group << 0x1 @@ -94,17 +93,18 @@ keys[i + 0x1] = s_keys[ithread + N + 0x1] values[i + 0x1] = s_values[ithread + N + 0x1] end + nothing end -@kernel inbounds=true cpu=false unsafe_indices=true function _merge_sort_by_key_global!( - @Const(keys_in), keys_out, - @Const(values_in), values_out, +function _merge_sort_by_key_global!( + keys_in, keys_out, + values_in, values_out, comp, half_size_group, -) + ::Val{N} +) where N len = length(keys_in) - N = @groupsize()[1] # NOTE: for many index calculations in this library, computation using zero-indexing leads to # fewer operations (also code is transpiled to CUDA / ROCm / oneAPI / Metal code which do zero @@ -112,8 +112,8 @@ end # accessing memory. As with C, the lower bound is inclusive, the upper bound exclusive. 
# Group (block) and local (thread) indices - iblock = @index(Group, Linear) - 0x1 - ithread = @index(Local, Linear) - 0x1 + iblock = KI.get_group_id().x - 0x1 + ithread = KI.get_local_id().x - 0x1 idx = ithread + iblock * N size_group = half_size_group * 0x2 @@ -150,6 +150,7 @@ end values_out[pos_out + 0x1] = values_in[pos_in + 0x1] end end + nothing end @@ -201,7 +202,7 @@ function merge_sort_by_key!( # Block level blocks = (length(keys) + block_size * 2 - 1) ÷ (block_size * 2) - _merge_sort_by_key_block!(backend, block_size)(keys, values, comp, ndrange=(block_size * blocks,)) + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _merge_sort_by_key_block!(keys, values, comp, Val(block_size)) # Global level half_size_group = Int32(block_size * 2) @@ -214,12 +215,12 @@ function merge_sort_by_key!( pv1 = values pv2 = isnothing(temp_values) ? similar(values) : temp_values - kernel! = _merge_sort_by_key_global!(backend, block_size) + kernel! = KI.@kernel backend launch = false _merge_sort_by_key_global!(pk1, pk2, pv1, pv2, comp, half_size_group, Val(block_size)) niter = 0 while len > half_size_group blocks = ((len + half_size_group - 1) ÷ half_size_group + 1) ÷ 2 * (half_size_group ÷ block_size) - kernel!(pk1, pk2, pv1, pv2, comp, half_size_group, ndrange=(block_size * blocks,)) + kernel!(pk1, pk2, pv1, pv2, comp, half_size_group, Val(block_size); workgroupsize=block_size, numworkgroups=blocks) half_size_group = half_size_group << 1; size_group = size_group << 1; diff --git a/src/sort/merge_sortperm.jl b/src/sort/merge_sortperm.jl index 6b97061..bff4593 100644 --- a/src/sort/merge_sortperm.jl +++ b/src/sort/merge_sortperm.jl @@ -133,7 +133,7 @@ function merge_sortperm_lowmem!( # Block level blocks = (length(ix) + block_size * 2 - 1) ÷ (block_size * 2) - _merge_sort_block!(backend, block_size)(ix, comp, ndrange=(block_size * blocks,)) + KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _merge_sort_block!(ix, comp, Val(block_size)) # Global 
level half_size_group = Int32(block_size * 2) @@ -143,12 +143,12 @@ function merge_sortperm_lowmem!( p1 = ix p2 = isnothing(temp) ? similar(ix) : temp - kernel! = _merge_sort_global!(backend, block_size) + kernel! = KI.@kernel backend launch = false _merge_sort_global!(p1, p2, comp, half_size_group, Val(block_size)) niter = 0 while len > half_size_group blocks = ((len + half_size_group - 1) ÷ half_size_group + 1) ÷ 2 * (half_size_group ÷ block_size) - kernel!(p1, p2, comp, half_size_group, ndrange=(block_size * blocks,)) + kernel!(p1, p2, comp, half_size_group, Val(block_size); workgroupsize=block_size, numworkgroups=blocks) half_size_group = half_size_group << 1; size_group = size_group << 1; From d8bdc95642dd67e6fdaec47593c72b4d0b81accb Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 14 Nov 2025 12:19:16 -0400 Subject: [PATCH 02/12] [Temp (maybe)] benchmarks improvements --- benchmark/benchmark_graphs_nb.jl | 513 +++++++++++++++++-------------- benchmark/runbenchmarks.jl | 11 +- 2 files changed, 293 insertions(+), 231 deletions(-) diff --git a/benchmark/benchmark_graphs_nb.jl b/benchmark/benchmark_graphs_nb.jl index 5beb8b6..6ee481a 100644 --- a/benchmark/benchmark_graphs_nb.jl +++ b/benchmark/benchmark_graphs_nb.jl @@ -1,5 +1,5 @@ ### A Pluto.jl notebook ### -# v0.20.13 +# v0.20.21 using Markdown using InteractiveUtils @@ -27,6 +27,12 @@ md""" """ end +# ╔═╡ 8601e5de-180c-45b5-b0c6-1f8d807df6d0 + + +# ╔═╡ e27f7b92-79f2-4351-bbc5-46d6e5a9fd67 + + # ╔═╡ 0f77fb5f-e894-43e4-94f5-4ed93af7ba9b begin function plot_benches(df, cat, t; ylabel="Time (ns)") @@ -53,11 +59,35 @@ begin push!(df, (Category=vals[1], T=vals[2],Bench=vals[3],Time=res[2]["time"])) end end - benchresults = JSON.parsefile("benchmarkresults.json") - benchres_df = getbenches(benchresults) - benchres_df.alg .= [startswith(x, "base") ? 
Symbol("1Base") : Symbol("2AccK") for x in benchres_df.Bench] - benchres_df.test .= [x[6:end] for x in benchres_df.Bench] - sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) + make_res_df(file) = make_res_df(x -> startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK"), file) + function make_res_df(alg_f, file) + benchresults = JSON.parsefile(file) + benchres_df = getbenches(benchresults) + benchres_df.alg .= [alg_f(x) for x in benchres_df.Bench] + benchres_df.test .= [x[6:end] for x in benchres_df.Bench] + + sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) + return benchres_df + end +end + +# ╔═╡ 002b7672-9431-4510-ba89-84be098a2f9f +# begin +# benchresultspre = JSON.parsefile("benchmarkresultsstd.json") +# benchrespre_df = getbenches(benchresultspre) +# benchrespre_df.alg .= [startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK") for x in benchrespre_df.Bench] +# benchrespre_df.test .= [x[6:end] for x in benchrespre_df.Bench] + +# benchrespre_df.alg .= [x == Symbol("1Base") ? Symbol("2Base") : Symbol("4AccK") for x in benchrespre_df.alg] + +# sort!(benchrespre_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) +# end + +# ╔═╡ b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2 +begin + benchrespre_df = make_res_df(x->startswith(x, "base") ? Symbol("1BasePre") : Symbol("4AccKPre"), "benchmarkresultspre.json") + benchres_df = make_res_df(x->startswith(x, "base") ? Symbol("2Base") : Symbol("5AccK"), "benchmarkresultsnew.json") + benchresi32_df = make_res_df(x->startswith(x, "base") ? 
Symbol("3BaseI32") : Symbol("6AccKI32"), "benchmarkresults.json") end # ╔═╡ d4accca6-f650-453c-bb75-a8e4cac568c1 @@ -73,6 +103,38 @@ Type: $(@bind typ Select(unique(benchres_df.T);)) # ╔═╡ 3cd5fd0a-6f16-4cb3-87ba-43b86224b81c plot_benches(benchres_df, cat, typ) +# ╔═╡ cde9391a-44dd-49ee-8730-4b9ad58c3d90 +plot_benches([benchres_df;benchrespre_df;benchresi32_df], cat, typ) + +# ╔═╡ 4681accf-eaed-47e0-9d26-ab968df83c8a +# begin +# make_res_df(file) = make_res_df(x -> startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK"), file) +# function make_res_df(alg_f, file) +# benchresults = JSON.parsefile("benchmarkresultsstd.json") +# benchres_df = getbenches(benchresults) +# benchres_df.alg .= [alg_f(x) for x in benchres_df.Bench] +# benchres_df.test .= [x[6:end] for x in benchres_df.Bench] + +# sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) +# return benchres_df +# end +# end + +# ╔═╡ 7a3a4783-8424-488d-8122-1d65680703ac +# begin +# benchresultspre = JSON.parsefile("benchmarkresultsstd.json") +# benchrespre_df = getbenches(benchresultspre) +# benchrespre_df.alg .= [startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK") for x in benchrespre_df.Bench] +# benchrespre_df.test .= [x[6:end] for x in benchrespre_df.Bench] + +# benchrespre_df.alg .= [x == Symbol("1Base") ? 
Symbol("2Base") : Symbol("4AccK") for x in benchrespre_df.alg] + +# sort!(benchrespre_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) +# end + +# ╔═╡ f3ac43c7-f6d0-4aa6-9d43-c66907de4fa0 +benchresboth_df = sort!([benchres_df;benchrespre_df], [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) + # ╔═╡ 00000000-0000-0000-0000-000000000001 PLUTO_PROJECT_TOML_CONTENTS = """ [deps] @@ -82,19 +144,19 @@ PlutoUI = "7f904dfe-b85e-4ff6-b463-dae2292396a8" StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" [compat] -DataFrames = "~1.7.0" +DataFrames = "~1.8.1" JSON = "~0.21.4" -PlutoUI = "~0.7.68" -StatsPlots = "~0.15.7" +PlutoUI = "~0.7.71" +StatsPlots = "~0.15.8" """ # ╔═╡ 00000000-0000-0000-0000-000000000002 PLUTO_MANIFEST_TOML_CONTENTS = """ # This file is machine-generated - editing it directly is not advised -julia_version = "1.11.6" +julia_version = "1.12.1" manifest_format = "2.0" -project_hash = "52f7847a79e6136f8251449142cb9cc15e7eaf99" +project_hash = "baee9e8f260630c1154d35b552a51b54a1819d5e" [[deps.AbstractFFTs]] deps = ["LinearAlgebra"] @@ -115,9 +177,9 @@ version = "1.3.2" [[deps.Adapt]] deps = ["LinearAlgebra", "Requires"] -git-tree-sha1 = "f7817e2e585aa6d924fd714df1e2a84be7896c60" +git-tree-sha1 = "7e35fca2bdfba44d797c53dfe63a51fabf39bfc0" uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" -version = "4.3.0" +version = "4.4.0" weakdeps = ["SparseArrays", "StaticArrays"] [deps.Adapt.extensions] @@ -179,9 +241,9 @@ version = "1.18.5+0" [[deps.ChainRulesCore]] deps = ["Compat", "LinearAlgebra"] -git-tree-sha1 = "06ee8d1aa558d2833aa799f6f0b31b30cada405f" +git-tree-sha1 = "e4c6a16e77171a5f5e25e9646617ab1c276c5607" uuid = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" -version = "1.25.2" +version = "1.26.0" weakdeps = ["SparseArrays"] [deps.ChainRulesCore.extensions] @@ -201,9 +263,9 @@ version = "0.7.8" [[deps.ColorSchemes]] deps = ["ColorTypes", "ColorVectorSpace", "Colors", "FixedPointNumbers", "PrecompileTools", "Random"] 
-git-tree-sha1 = "403f2d8e209681fcbd9468a8514efff3ea08452e" +git-tree-sha1 = "b0fd3f56fa442f81e0a47815c92245acfaaa4e34" uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4" -version = "3.29.0" +version = "3.31.0" [[deps.ColorTypes]] deps = ["FixedPointNumbers", "Random"] @@ -233,9 +295,9 @@ version = "0.13.1" [[deps.Compat]] deps = ["TOML", "UUIDs"] -git-tree-sha1 = "8ae8d32e09f0dcf42a36b90d4e17f5dd2e4c4215" +git-tree-sha1 = "9d8a54ce4b17aa5bdce0ea5c34bc5e7c340d16ad" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "4.16.0" +version = "4.18.1" weakdeps = ["Dates", "LinearAlgebra"] [deps.Compat.extensions] @@ -244,7 +306,7 @@ weakdeps = ["Dates", "LinearAlgebra"] [[deps.CompilerSupportLibraries_jll]] deps = ["Artifacts", "Libdl"] uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "1.1.1+0" +version = "1.3.0+1" [[deps.ConcurrentUtilities]] deps = ["Serialization", "Sockets"] @@ -269,15 +331,15 @@ version = "1.16.0" [[deps.DataFrames]] deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] -git-tree-sha1 = "fb61b4812c49343d7ef0b533ba982c46021938a6" +git-tree-sha1 = "d8928e9169ff76c6281f39a659f9bca3a573f24c" uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" -version = "1.7.0" +version = "1.8.1" [[deps.DataStructures]] -deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "4e1fe97fdaed23e9dc21d4d664bea76b65fc50a0" +deps = ["OrderedCollections"] +git-tree-sha1 = "e357641bb3e0638d353c4b29ea0e40ea644066a6" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.22" +version = "0.19.3" [[deps.DataValueInterfaces]] git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" @@ -319,9 +381,9 @@ version = "1.11.0" [[deps.Distributions]] deps = ["AliasTables", 
"FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SpecialFunctions", "Statistics", "StatsAPI", "StatsBase", "StatsFuns"] -git-tree-sha1 = "3e6d038b77f22791b8e3472b7c633acea1ecac06" +git-tree-sha1 = "3bc002af51045ca3b47d2e1787d6ce02e68b943a" uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" -version = "0.25.120" +version = "0.25.122" [deps.Distributions.extensions] DistributionsChainRulesCoreExt = "ChainRulesCore" @@ -357,27 +419,27 @@ version = "0.1.11" [[deps.Expat_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "d55dffd9ae73ff72f1c0482454dcf2ec6c6c4a63" +git-tree-sha1 = "27af30de8b5445644e8ffe3bcb0d72049c089cf1" uuid = "2e619515-83b5-522b-bb60-26c02a35a201" -version = "2.6.5+0" +version = "2.7.3+0" [[deps.FFMPEG]] deps = ["FFMPEG_jll"] -git-tree-sha1 = "53ebe7511fa11d33bec688a9178fac4e49eeee00" +git-tree-sha1 = "95ecf07c2eea562b5adbd0696af6db62c0f52560" uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" -version = "0.4.2" +version = "0.4.5" [[deps.FFMPEG_jll]] deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "JLLWrappers", "LAME_jll", "Libdl", "Ogg_jll", "OpenSSL_jll", "Opus_jll", "PCRE2_jll", "Zlib_jll", "libaom_jll", "libass_jll", "libfdk_aac_jll", "libvorbis_jll", "x264_jll", "x265_jll"] -git-tree-sha1 = "466d45dc38e15794ec7d5d63ec03d776a9aff36e" +git-tree-sha1 = "ccc81ba5e42497f4e76553a5545665eed577a663" uuid = "b22a6f82-2f65-5046-a5b2-351ab43fb4e5" -version = "4.4.4+1" +version = "8.0.0+0" [[deps.FFTW]] -deps = ["AbstractFFTs", "FFTW_jll", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"] -git-tree-sha1 = "797762812ed063b9b94f6cc7742bc8883bb5e69e" +deps = ["AbstractFFTs", "FFTW_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Preferences", "Reexport"] +git-tree-sha1 = "97f08406df914023af55ade2f843c39e99c5d969" uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" -version = "1.9.0" +version = "1.10.0" [[deps.FFTW_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -391,9 +453,9 @@ version = "1.11.0" 
[[deps.FillArrays]] deps = ["LinearAlgebra"] -git-tree-sha1 = "6a70198746448456524cb442b8af316927ff3e1a" +git-tree-sha1 = "5bfcd42851cf2f1b303f51525a54dc5e98d408a3" uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" -version = "1.13.0" +version = "1.15.0" weakdeps = ["PDMats", "SparseArrays", "Statistics"] [deps.FillArrays.extensions] @@ -409,9 +471,9 @@ version = "0.8.5" [[deps.Fontconfig_jll]] deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", "JLLWrappers", "Libdl", "Libuuid_jll", "Zlib_jll"] -git-tree-sha1 = "301b5d5d731a0654825f1f2e906990f7141a106b" +git-tree-sha1 = "f85dac9a96a01087df6e3a749840015a0ca3817d" uuid = "a3f928ae-7b40-5064-980b-68af3947d34b" -version = "2.16.0+0" +version = "2.17.1+0" [[deps.Format]] git-tree-sha1 = "9c68794ef81b08086aeb32eeaf33531668d5f5fc" @@ -443,15 +505,15 @@ version = "3.4.0+2" [[deps.GR]] deps = ["Artifacts", "Base64", "DelimitedFiles", "Downloads", "GR_jll", "HTTP", "JSON", "Libdl", "LinearAlgebra", "Preferences", "Printf", "Qt6Wayland_jll", "Random", "Serialization", "Sockets", "TOML", "Tar", "Test", "p7zip_jll"] -git-tree-sha1 = "1828eb7275491981fa5f1752a5e126e8f26f8741" +git-tree-sha1 = "f52c27dd921390146624f3aab95f4e8614ad6531" uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" -version = "0.73.17" +version = "0.73.18" [[deps.GR_jll]] deps = ["Artifacts", "Bzip2_jll", "Cairo_jll", "FFMPEG_jll", "Fontconfig_jll", "FreeType2_jll", "GLFW_jll", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Libtiff_jll", "Pixman_jll", "Qt6Base_jll", "Zlib_jll", "libpng_jll"] -git-tree-sha1 = "27299071cc29e409488ada41ec7643e0ab19091f" +git-tree-sha1 = "4b0406b866ea9fdbaf1148bc9c0b887e59f9af68" uuid = "d2c73de3-f751-5644-a686-071e5b155ba9" -version = "0.73.17+0" +version = "0.73.18+0" [[deps.GettextRuntime_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Libiconv_jll"] @@ -459,11 +521,17 @@ git-tree-sha1 = "45288942190db7c5f760f59c04495064eedf9340" uuid = "b0724c58-0f36-5564-988d-3bb0596ebc4a" version = 
"0.22.4+0" +[[deps.Ghostscript_jll]] +deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Zlib_jll"] +git-tree-sha1 = "38044a04637976140074d0b0621c1edf0eb531fd" +uuid = "61579ee1-b43e-5ca0-a5da-69d92c66a64b" +version = "9.55.1+0" + [[deps.Glib_jll]] deps = ["Artifacts", "GettextRuntime_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Libiconv_jll", "Libmount_jll", "PCRE2_jll", "Zlib_jll"] -git-tree-sha1 = "35fbd0cefb04a516104b8e183ce0df11b70a3f1a" +git-tree-sha1 = "50c11ffab2a3d50192a228c313f05b5b5dc5acb2" uuid = "7746bdde-850d-59dc-9ae8-88ece973131d" -version = "2.84.3+0" +version = "2.86.0+0" [[deps.Graphite2_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -478,9 +546,9 @@ version = "1.0.2" [[deps.HTTP]] deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "PrecompileTools", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] -git-tree-sha1 = "ed5e9c58612c4e081aecdb6e1a479e18462e041e" +git-tree-sha1 = "5e6fe50ae7f23d171f44e311c2960294aaa0beb5" uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" -version = "1.10.17" +version = "1.10.19" [[deps.HarfBuzz_jll]] deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll"] @@ -513,9 +581,9 @@ uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" version = "0.2.5" [[deps.InlineStrings]] -git-tree-sha1 = "6a9fde685a7ac1eb3495f8e812c5a7c3711c2d5e" +git-tree-sha1 = "8f3d257792a522b4601c24a577954b0a8cd7334d" uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" -version = "1.4.3" +version = "1.4.5" [deps.InlineStrings.extensions] ArrowTypesExt = "ArrowTypes" @@ -527,9 +595,9 @@ version = "1.4.3" [[deps.IntelOpenMP_jll]] deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl"] -git-tree-sha1 = "0f14a5456bdc6b9731a5682f439a672750a09e48" +git-tree-sha1 = "ec1debd61c300961f98064cfb21287613ad7f303" uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0" -version 
= "2025.0.4+0" +version = "2025.2.0+0" [[deps.InteractiveUtils]] deps = ["Markdown"] @@ -537,24 +605,28 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" version = "1.11.0" [[deps.Interpolations]] -deps = ["Adapt", "AxisAlgorithms", "ChainRulesCore", "LinearAlgebra", "OffsetArrays", "Random", "Ratios", "Requires", "SharedArrays", "SparseArrays", "StaticArrays", "WoodburyMatrices"] -git-tree-sha1 = "88a101217d7cb38a7b481ccd50d21876e1d1b0e0" +deps = ["Adapt", "AxisAlgorithms", "ChainRulesCore", "LinearAlgebra", "OffsetArrays", "Random", "Ratios", "SharedArrays", "SparseArrays", "StaticArrays", "WoodburyMatrices"] +git-tree-sha1 = "65d505fa4c0d7072990d659ef3fc086eb6da8208" uuid = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" -version = "0.15.1" -weakdeps = ["Unitful"] +version = "0.16.2" [deps.Interpolations.extensions] + InterpolationsForwardDiffExt = "ForwardDiff" InterpolationsUnitfulExt = "Unitful" + [deps.Interpolations.weakdeps] + ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" + Unitful = "1986cc42-f94f-5a68-af5c-568840ba703d" + [[deps.InvertedIndices]] git-tree-sha1 = "6da3c4316095de0f5ee2ebd875df8721e7e0bdbe" uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" version = "1.3.1" [[deps.IrrationalConstants]] -git-tree-sha1 = "e2222959fbc6c19554dc15174c81bf7bf3aa691c" +git-tree-sha1 = "b2d91fe939cae05960e760110b328288867b5758" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" -version = "0.2.4" +version = "0.2.6" [[deps.IteratorInterfaceExtensions]] git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" @@ -569,9 +641,9 @@ version = "0.1.11" [[deps.JLLWrappers]] deps = ["Artifacts", "Preferences"] -git-tree-sha1 = "a007feb38b422fbdab534406aeca1b86823cb4d6" +git-tree-sha1 = "0533e564aae234aff59ab625543145446d8b6ec2" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.7.0" +version = "1.7.1" [[deps.JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] @@ -581,21 +653,26 @@ version = "0.21.4" [[deps.JpegTurbo_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] 
-git-tree-sha1 = "eac1206917768cb54957c65a615460d87b455fc1" +git-tree-sha1 = "4255f0032eafd6451d707a51d5f0248b8a165e4d" uuid = "aacddb02-875f-59d6-b918-886e6ef4fbf8" -version = "3.1.1+0" +version = "3.1.3+0" + +[[deps.JuliaSyntaxHighlighting]] +deps = ["StyledStrings"] +uuid = "ac6e5ff7-fb65-4e79-a425-ec3bc9c03011" +version = "1.12.0" [[deps.KernelDensity]] deps = ["Distributions", "DocStringExtensions", "FFTW", "Interpolations", "StatsBase"] -git-tree-sha1 = "7d703202e65efa1369de1279c162b915e245eed1" +git-tree-sha1 = "ba51324b894edaf1df3ab16e2cc6bc3280a2f1a7" uuid = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" -version = "0.6.9" +version = "0.6.10" [[deps.LAME_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "170b660facf5df5de098d866564877e119141cbd" +git-tree-sha1 = "059aabebaa7c82ccb853dd4a0ee9d17796f7e1bc" uuid = "c1c5ebd0-6772-5130-a774-d5fcae4a789d" -version = "3.100.2+0" +version = "3.100.3+0" [[deps.LERC_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -621,10 +698,10 @@ uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" version = "1.4.0" [[deps.Latexify]] -deps = ["Format", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "OrderedCollections", "Requires"] -git-tree-sha1 = "4f34eaabe49ecb3fb0d58d6015e32fd31a733199" +deps = ["Format", "Ghostscript_jll", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "OrderedCollections", "Requires"] +git-tree-sha1 = "44f93c47f9cd6c7e431f2f2091fcba8f01cd7e8f" uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316" -version = "0.16.8" +version = "0.16.10" [deps.Latexify.extensions] DataFramesExt = "DataFrames" @@ -649,24 +726,24 @@ uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" version = "0.6.4" [[deps.LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "OpenSSL_jll", "Zlib_jll", "nghttp2_jll"] uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" -version = "8.6.0+0" +version = "8.11.1+1" [[deps.LibGit2]] 
-deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"] +deps = ["LibGit2_jll", "NetworkOptions", "Printf", "SHA"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" version = "1.11.0" [[deps.LibGit2_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "OpenSSL_jll"] uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5" -version = "1.7.2+0" +version = "1.9.0+0" [[deps.LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +deps = ["Artifacts", "Libdl", "OpenSSL_jll"] uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" -version = "1.11.0+1" +version = "1.11.3+1" [[deps.Libdl]] uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" @@ -692,26 +769,26 @@ version = "1.18.0+0" [[deps.Libmount_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "a31572773ac1b745e0343fe5e2c8ddda7a37e997" +git-tree-sha1 = "3acf07f130a76f87c041cfb2ff7d7284ca67b072" uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" -version = "2.41.0+0" +version = "2.41.2+0" [[deps.Libtiff_jll]] deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "LERC_jll", "Libdl", "XZ_jll", "Zlib_jll", "Zstd_jll"] -git-tree-sha1 = "4ab7581296671007fc33f07a721631b8855f4b1d" +git-tree-sha1 = "f04133fe05eff1667d2054c53d59f9122383fe05" uuid = "89763e89-9b03-5906-acba-b20f662cd828" -version = "4.7.1+0" +version = "4.7.2+0" [[deps.Libuuid_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "321ccef73a96ba828cd51f2ab5b9f917fa73945a" +git-tree-sha1 = "2a7a12fc0a4e7fb773450d17975322aa77142106" uuid = "38a345b3-de98-5d2b-a5d3-14cd9215e700" -version = "2.41.0+0" +version = "2.41.2+0" [[deps.LinearAlgebra]] deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -version = "1.11.0" +version = "1.12.0" [[deps.LogExpFunctions]] deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] @@ -735,9 +812,9 @@ version = "1.11.0" [[deps.LoggingExtras]] deps = ["Dates", "Logging"] -git-tree-sha1 = 
"f02b56007b064fbfddb4c9cd60161b6dd0f40df3" +git-tree-sha1 = "f00544d95982ea270145636c181ceda21c4e2575" uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36" -version = "1.1.0" +version = "1.2.0" [[deps.MIMEs]] git-tree-sha1 = "c64d943587f7187e751162b3b84445bbbd79f691" @@ -746,9 +823,9 @@ version = "1.1.0" [[deps.MKL_jll]] deps = ["Artifacts", "IntelOpenMP_jll", "JLLWrappers", "LazyArtifacts", "Libdl", "oneTBB_jll"] -git-tree-sha1 = "5de60bc6cb3899cd318d80d627560fae2e2d99ae" +git-tree-sha1 = "282cadc186e7b2ae0eeadbd7a4dffed4196ae2aa" uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" -version = "2025.0.1+1" +version = "2025.2.0+0" [[deps.MacroTools]] git-tree-sha1 = "1e0228a030642014fe5cfe68c2c0a818f9e3f522" @@ -756,7 +833,7 @@ uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" version = "0.5.16" [[deps.Markdown]] -deps = ["Base64"] +deps = ["Base64", "JuliaSyntaxHighlighting", "StyledStrings"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" version = "1.11.0" @@ -767,14 +844,15 @@ uuid = "739be429-bea8-5141-9913-cc70e7f3736d" version = "1.1.9" [[deps.MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "3cce3511ca2c6f87b19c34ffc623417ed2798cbd" uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.28.6+0" +version = "2.28.10+0" [[deps.Measures]] -git-tree-sha1 = "c13304c81eec1ed3af7fc20e75fb6b26092a1102" +git-tree-sha1 = "b513cedd20d9c914783d8ad83d08120702bf2c77" uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e" -version = "0.3.2" +version = "0.3.3" [[deps.Missings]] deps = ["DataAPI"] @@ -788,7 +866,7 @@ version = "1.11.0" [[deps.MozillaCACerts_jll]] uuid = "14a3606d-f60d-562e-9121-12d972cd8159" -version = "2023.12.12" +version = "2025.5.20" [[deps.MultivariateStats]] deps = ["Arpack", "Distributions", "LinearAlgebra", "SparseArrays", "Statistics", "StatsAPI", "StatsBase"] @@ -804,13 +882,13 @@ version = "1.1.3" [[deps.NearestNeighbors]] deps = ["Distances", "StaticArrays"] -git-tree-sha1 = 
"8a3271d8309285f4db73b4f662b1b290c715e85e" +git-tree-sha1 = "ca7e18198a166a1f3eb92a3650d53d94ed8ca8a1" uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce" -version = "0.4.21" +version = "0.4.22" [[deps.NetworkOptions]] uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" -version = "1.2.0" +version = "1.3.0" [[deps.Observables]] git-tree-sha1 = "7438a59546cf62428fc9d1bc94729146d37a7225" @@ -827,32 +905,31 @@ weakdeps = ["Adapt"] OffsetArraysAdaptExt = "Adapt" [[deps.Ogg_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "887579a3eb005446d514ab7aeac5d1d027658b8f" +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "b6aa4566bb7ae78498a5e68943863fa8b5231b59" uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051" -version = "1.3.5+1" +version = "1.3.6+0" [[deps.OpenBLAS_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.27+1" +version = "0.3.29+0" [[deps.OpenLibm_jll]] deps = ["Artifacts", "Libdl"] uuid = "05823500-19ac-5b8b-9628-191a04bc5112" -version = "0.8.5+0" +version = "0.8.7+0" [[deps.OpenSSL]] -deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"] -git-tree-sha1 = "f1a7e086c677df53e064e0fdd2c9d0b0833e3f6e" +deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "NetworkOptions", "OpenSSL_jll", "Sockets"] +git-tree-sha1 = "386b47442468acfb1add94bf2d85365dea10cbab" uuid = "4d8831e6-92b7-49fb-bdf8-b643e874388c" -version = "1.5.0" +version = "1.6.0" [[deps.OpenSSL_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "9216a80ff3682833ac4b733caa8c00390620ba5d" +deps = ["Artifacts", "Libdl"] uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" -version = "3.5.0+0" +version = "3.5.1+0" [[deps.OpenSpecFun_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl"] @@ -862,9 +939,9 @@ version = "0.5.6+0" [[deps.Opus_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "6703a85cb3781bd5909d48730a67205f3f31a575" 
+git-tree-sha1 = "c392fc5dd032381919e3b22dd32d6443760ce7ea" uuid = "91d4177d-7536-5919-b921-800302f37372" -version = "1.3.3+0" +version = "1.5.2+0" [[deps.OrderedCollections]] git-tree-sha1 = "05868e21324cede2207c6f0f466b4bfef6d5e7ee" @@ -874,19 +951,23 @@ version = "1.8.1" [[deps.PCRE2_jll]] deps = ["Artifacts", "Libdl"] uuid = "efcefdf7-47ab-520b-bdef-62a2eaa19f15" -version = "10.42.0+1" +version = "10.44.0+1" [[deps.PDMats]] deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] -git-tree-sha1 = "f07c06228a1c670ae4c87d1276b92c7c597fdda0" +git-tree-sha1 = "d922b4d80d1e12c658da7785e754f4796cc1d60d" uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" -version = "0.11.35" +version = "0.11.36" +weakdeps = ["StatsBase"] + + [deps.PDMats.extensions] + StatsBaseExt = "StatsBase" [[deps.Pango_jll]] deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "FriBidi_jll", "Glib_jll", "HarfBuzz_jll", "JLLWrappers", "Libdl"] -git-tree-sha1 = "275a9a6d85dc86c24d03d1837a0010226a96f540" +git-tree-sha1 = "0662b083e11420952f2e62e17eddae7fc07d5997" uuid = "36c8627f-9965-5494-a995-c6b170f724f3" -version = "1.56.3+0" +version = "1.57.0+0" [[deps.Parsers]] deps = ["Dates", "PrecompileTools", "UUIDs"] @@ -903,7 +984,7 @@ version = "0.44.2+0" [[deps.Pkg]] deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "Random", "SHA", "TOML", "Tar", "UUIDs", "p7zip_jll"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.11.0" +version = "1.12.0" weakdeps = ["REPL"] [deps.Pkg.extensions] @@ -917,15 +998,15 @@ version = "3.3.0" [[deps.PlotUtils]] deps = ["ColorSchemes", "Colors", "Dates", "PrecompileTools", "Printf", "Random", "Reexport", "StableRNGs", "Statistics"] -git-tree-sha1 = "3ca9a356cd2e113c420f2c13bea19f8d3fb1cb18" +git-tree-sha1 = "26ca162858917496748aad52bb5d3be4d26a228a" uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" -version = "1.4.3" +version = "1.4.4" [[deps.Plots]] -deps = ["Base64", "Contour", "Dates", 
"Downloads", "FFMPEG", "FixedPointNumbers", "GR", "JLFzf", "JSON", "LaTeXStrings", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "PrecompileTools", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "RelocatableFolders", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "TOML", "UUIDs", "UnicodeFun", "UnitfulLatexify", "Unzip"] -git-tree-sha1 = "28ea788b78009c695eb0d637587c81d26bdf0e36" +deps = ["Base64", "Contour", "Dates", "Downloads", "FFMPEG", "FixedPointNumbers", "GR", "JLFzf", "JSON", "LaTeXStrings", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "Pkg", "PlotThemes", "PlotUtils", "PrecompileTools", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "RelocatableFolders", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "TOML", "UUIDs", "UnicodeFun", "Unzip"] +git-tree-sha1 = "12ce661880f8e309569074a61d3767e5756a199f" uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" -version = "1.40.14" +version = "1.41.1" [deps.Plots.extensions] FileIOExt = "FileIO" @@ -943,9 +1024,9 @@ version = "1.40.14" [[deps.PlutoUI]] deps = ["AbstractPlutoDingetjes", "Base64", "ColorTypes", "Dates", "Downloads", "FixedPointNumbers", "Hyperscript", "HypertextLiteral", "IOCapture", "InteractiveUtils", "JSON", "Logging", "MIMEs", "Markdown", "Random", "Reexport", "URIs", "UUIDs"] -git-tree-sha1 = "ec9e63bd098c50e4ad28e7cb95ca7a4860603298" +git-tree-sha1 = "8329a3a4f75e178c11c1ce2342778bcbbbfa7e3c" uuid = "7f904dfe-b85e-4ff6-b463-dae2292396a8" -version = "0.7.68" +version = "0.7.71" [[deps.PooledArrays]] deps = ["DataAPI", "Future"] @@ -955,21 +1036,21 @@ version = "1.4.3" [[deps.PrecompileTools]] deps = ["Preferences"] -git-tree-sha1 = "5aa36f7049a63a1528fe8f7c3f2113413ffd4e1f" +git-tree-sha1 = "07a921781cab75691315adc645096ed5e370cb77" uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" -version = "1.2.1" +version = "1.3.3" [[deps.Preferences]] deps = 
["TOML"] -git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" +git-tree-sha1 = "0f27480397253da18fe2c12a4ba4eb9eb208bf3d" uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.4.3" +version = "1.5.0" [[deps.PrettyTables]] -deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "Reexport", "StringManipulation", "Tables"] -git-tree-sha1 = "1101cd475833706e4d0e7b122218257178f48f34" +deps = ["Crayons", "LaTeXStrings", "Markdown", "PrecompileTools", "Printf", "REPL", "Reexport", "StringManipulation", "Tables"] +git-tree-sha1 = "6b8e2f0bae3f678811678065c09571c1619da219" uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" -version = "2.4.0" +version = "3.1.0" [[deps.Printf]] deps = ["Unicode"] @@ -983,9 +1064,9 @@ version = "1.3.0" [[deps.Qt6Base_jll]] deps = ["Artifacts", "CompilerSupportLibraries_jll", "Fontconfig_jll", "Glib_jll", "JLLWrappers", "Libdl", "Libglvnd_jll", "OpenSSL_jll", "Vulkan_Loader_jll", "Xorg_libSM_jll", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Xorg_libxcb_jll", "Xorg_xcb_util_cursor_jll", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_keysyms_jll", "Xorg_xcb_util_renderutil_jll", "Xorg_xcb_util_wm_jll", "Zlib_jll", "libinput_jll", "xkbcommon_jll"] -git-tree-sha1 = "eb38d376097f47316fe089fc62cb7c6d85383a52" +git-tree-sha1 = "34f7e5d2861083ec7596af8b8c092531facf2192" uuid = "c0090381-4147-56d7-9ebc-da0b1113ec56" -version = "6.8.2+1" +version = "6.8.2+2" [[deps.Qt6Declarative_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Qt6Base_jll", "Qt6ShaderTools_jll"] @@ -1001,9 +1082,9 @@ version = "6.8.2+1" [[deps.Qt6Wayland_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Qt6Base_jll", "Qt6Declarative_jll"] -git-tree-sha1 = "e1d5e16d0f65762396f9ca4644a5f4ddab8d452b" +git-tree-sha1 = "8f528b0851b5b7025032818eb5abbeb8a736f853" uuid = "e99dba38-086e-5de3-a5b1-6e4c66e897c3" -version = "6.8.2+1" +version = "6.8.2+2" [[deps.QuadGK]] deps = ["DataStructures", "LinearAlgebra"] @@ -1018,7 +1099,7 @@ version = "2.11.2" Enzyme = 
"7da242da-08ed-463a-9acd-ee780be4f1d9" [[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "StyledStrings", "Unicode"] +deps = ["InteractiveUtils", "JuliaSyntaxHighlighting", "Markdown", "Sockets", "StyledStrings", "Unicode"] uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" version = "1.11.0" @@ -1068,9 +1149,9 @@ version = "1.3.1" [[deps.Rmath]] deps = ["Random", "Rmath_jll"] -git-tree-sha1 = "852bd0f55565a9e973fcfee83a84413270224dc4" +git-tree-sha1 = "5b3d50eb374cea306873b371d3f8d3915a018f0b" uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" -version = "0.8.0" +version = "0.9.0" [[deps.Rmath_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -1120,20 +1201,20 @@ version = "1.11.0" [[deps.SortingAlgorithms]] deps = ["DataStructures"] -git-tree-sha1 = "66e0a8e672a0bdfca2c3f5937efb8538b9ddc085" +git-tree-sha1 = "64d974c2e6fdf07f8155b5b2ca2ffa9069b608d9" uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" -version = "1.2.1" +version = "1.2.2" [[deps.SparseArrays]] deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" -version = "1.11.0" +version = "1.12.0" [[deps.SpecialFunctions]] deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] -git-tree-sha1 = "41852b8679f78c8d8961eeadc8f62cef861a52e3" +git-tree-sha1 = "f2685b435df2613e25fc10ad8c26dddb8640f547" uuid = "276daf66-3868-5448-9aa4-cd146d93841b" -version = "2.5.1" +version = "2.6.1" weakdeps = ["ChainRulesCore"] [deps.SpecialFunctions.extensions] @@ -1141,15 +1222,15 @@ weakdeps = ["ChainRulesCore"] [[deps.StableRNGs]] deps = ["Random"] -git-tree-sha1 = "95af145932c2ed859b63329952ce8d633719f091" +git-tree-sha1 = "4f96c596b8c8258cc7d3b19797854d368f243ddc" uuid = "860ef19b-820b-49d6-a774-d7a799459cd3" -version = "1.0.3" +version = "1.0.4" [[deps.StaticArrays]] deps = ["LinearAlgebra", "PrecompileTools", "Random", "StaticArraysCore"] -git-tree-sha1 = "0feb6b9031bd5c51f9072393eb5ab3efd31bf9e4" +git-tree-sha1 = 
"b8693004b385c842357406e3af647701fe783f98" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.9.13" +version = "1.9.15" weakdeps = ["ChainRulesCore", "Statistics"] [deps.StaticArrays.extensions] @@ -1157,9 +1238,9 @@ weakdeps = ["ChainRulesCore", "Statistics"] StaticArraysStatisticsExt = "Statistics" [[deps.StaticArraysCore]] -git-tree-sha1 = "192954ef1208c7019899fbf8049e717f92959682" +git-tree-sha1 = "6ab403037779dae8c514bad259f32a447262455a" uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" -version = "1.4.3" +version = "1.4.4" [[deps.Statistics]] deps = ["LinearAlgebra"] @@ -1179,15 +1260,15 @@ version = "1.7.1" [[deps.StatsBase]] deps = ["AliasTables", "DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] -git-tree-sha1 = "b81c5035922cc89c2d9523afc6c54be512411466" +git-tree-sha1 = "064b532283c97daae49e544bb9cb413c26511f8c" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" -version = "0.34.5" +version = "0.34.8" [[deps.StatsFuns]] deps = ["HypergeometricFunctions", "IrrationalConstants", "LogExpFunctions", "Reexport", "Rmath", "SpecialFunctions"] -git-tree-sha1 = "8e45cecc66f3b42633b8ce14d431e8e57a3e242e" +git-tree-sha1 = "91f091a8716a6bb38417a6e6f274602a19aaa685" uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" -version = "1.5.0" +version = "1.5.2" [deps.StatsFuns.extensions] StatsFunsChainRulesCoreExt = "ChainRulesCore" @@ -1199,9 +1280,9 @@ version = "1.5.0" [[deps.StatsPlots]] deps = ["AbstractFFTs", "Clustering", "DataStructures", "Distributions", "Interpolations", "KernelDensity", "LinearAlgebra", "MultivariateStats", "NaNMath", "Observables", "Plots", "RecipesBase", "RecipesPipeline", "Reexport", "StatsBase", "TableOperations", "Tables", "Widgets"] -git-tree-sha1 = "3b1dcbf62e469a67f6733ae493401e53d92ff543" +git-tree-sha1 = "88cf3587711d9ad0a55722d339a013c4c56c5bbc" uuid = "f3b207a7-027a-5e70-b257-86293d7955fd" -version = "0.15.7" +version = "0.15.8" 
[[deps.StringManipulation]] deps = ["PrecompileTools"] @@ -1220,7 +1301,7 @@ uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" [[deps.SuiteSparse_jll]] deps = ["Artifacts", "Libdl", "libblastrampoline_jll"] uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" -version = "7.7.0+0" +version = "7.8.3+2" [[deps.TOML]] deps = ["Dates"] @@ -1267,14 +1348,14 @@ uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" version = "0.11.3" [[deps.Tricks]] -git-tree-sha1 = "6cae795a5a9313bbb4f60683f7263318fc7d1505" +git-tree-sha1 = "311349fd1c93a31f783f977a71e8b062a57d4101" uuid = "410a4b4d-49e4-4fbc-ab6d-cb71b17b3775" -version = "0.1.10" +version = "0.1.13" [[deps.URIs]] -git-tree-sha1 = "24c1c558881564e2217dcf7840a8b2e10caeb0f9" +git-tree-sha1 = "bef26fb046d031353ef97a82e3fdb6afe7f21b1a" uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" -version = "1.6.0" +version = "1.6.1" [[deps.UUIDs]] deps = ["Random", "SHA"] @@ -1291,30 +1372,6 @@ git-tree-sha1 = "53915e50200959667e78a92a418594b428dffddf" uuid = "1cfade01-22cf-5700-b092-accc4b62d6e1" version = "0.4.1" -[[deps.Unitful]] -deps = ["Dates", "LinearAlgebra", "Random"] -git-tree-sha1 = "d2282232f8a4d71f79e85dc4dd45e5b12a6297fb" -uuid = "1986cc42-f94f-5a68-af5c-568840ba703d" -version = "1.23.1" - - [deps.Unitful.extensions] - ConstructionBaseUnitfulExt = "ConstructionBase" - ForwardDiffExt = "ForwardDiff" - InverseFunctionsUnitfulExt = "InverseFunctions" - PrintfExt = "Printf" - - [deps.Unitful.weakdeps] - ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" - ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" - InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" - Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.UnitfulLatexify]] -deps = ["LaTeXStrings", "Latexify", "Unitful"] -git-tree-sha1 = "af305cc62419f9bd61b6644d19170a4d258c7967" -uuid = "45397f5d-5981-4c77-b2b3-fc36d6e9b728" -version = "1.7.0" - [[deps.Unzip]] git-tree-sha1 = "ca0969166a028236229f63514992fc073799bb78" uuid = "41fe7b60-77ed-43a1-b4f0-825fd5a5650d" @@ 
-1327,16 +1384,10 @@ uuid = "a44049a8-05dd-5a78-86c9-5fde0876e88c" version = "1.3.243+0" [[deps.Wayland_jll]] -deps = ["Artifacts", "EpollShim_jll", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll", "XML2_jll"] -git-tree-sha1 = "53ab3e9c94f4343c68d5905565be63002e13ec8c" +deps = ["Artifacts", "EpollShim_jll", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll"] +git-tree-sha1 = "96478df35bbc2f3e1e791bc7a3d0eeee559e60e9" uuid = "a2964d1f-97da-50d4-b82a-358c7fce9d89" -version = "1.23.1+1" - -[[deps.Wayland_protocols_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "54b8a029ac145ebe8299463447fd1590b2b1d92f" -uuid = "2381bf8a-dfd0-557d-9999-79630e7b1b91" -version = "1.44.0+0" +version = "1.24.0+0" [[deps.Widgets]] deps = ["Colors", "Dates", "Observables", "OrderedCollections"] @@ -1350,12 +1401,6 @@ git-tree-sha1 = "c1a7aa6219628fcd757dede0ca95e245c5cd9511" uuid = "efce3f68-66dc-5838-9240-27a6d6f5f9b6" version = "1.0.0" -[[deps.XML2_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"] -git-tree-sha1 = "b8b243e47228b4a3877f1dd6aee0c5d56db7fcf4" -uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" -version = "2.13.6+1" - [[deps.XZ_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] git-tree-sha1 = "fee71455b0aaa3440dfdd54a9a36ccef829be7d4" @@ -1406,9 +1451,9 @@ version = "1.3.7+0" [[deps.Xorg_libXfixes_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libX11_jll"] -git-tree-sha1 = "9caba99d38404b285db8801d5c45ef4f4f425a6d" +git-tree-sha1 = "75e00946e43621e09d431d9b95818ee751e6b2ef" uuid = "d091e8ba-531a-589c-9de9-94069b037ed8" -version = "6.0.1+0" +version = "6.0.2+0" [[deps.Xorg_libXi_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libXext_jll", "Xorg_libXfixes_jll"] @@ -1448,9 +1493,9 @@ version = "1.1.3+0" [[deps.Xorg_xcb_util_cursor_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_jll", "Xorg_xcb_util_renderutil_jll"] -git-tree-sha1 = 
"c5bf2dad6a03dfef57ea0a170a1fe493601603f2" +git-tree-sha1 = "9750dc53819eba4e9a20be42349a6d3b86c7cdf8" uuid = "e920d4aa-a673-5f3a-b3d7-f755a4d47c43" -version = "0.1.5+0" +version = "0.1.6+0" [[deps.Xorg_xcb_util_image_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_xcb_util_jll"] @@ -1503,7 +1548,7 @@ version = "1.6.0+0" [[deps.Zlib_jll]] deps = ["Libdl"] uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.13+1" +version = "1.3.1+2" [[deps.Zstd_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -1525,20 +1570,20 @@ version = "0.61.1+0" [[deps.libaom_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "522c1df09d05a71785765d19c9524661234738e9" +git-tree-sha1 = "371cc681c00a3ccc3fbc5c0fb91f58ba9bec1ecf" uuid = "a4ae2306-e953-59d6-aa16-d00cac43593b" -version = "3.11.0+0" +version = "3.13.1+0" [[deps.libass_jll]] deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "HarfBuzz_jll", "JLLWrappers", "Libdl", "Zlib_jll"] -git-tree-sha1 = "e17c115d55c5fbb7e52ebedb427a0dca79d4484e" +git-tree-sha1 = "125eedcb0a4a0bba65b657251ce1d27c8714e9d6" uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0" -version = "0.15.2+0" +version = "0.17.4+0" [[deps.libblastrampoline_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.11.0+0" +version = "5.15.0+0" [[deps.libdecor_jll]] deps = ["Artifacts", "Dbus_jll", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pango_jll", "Wayland_jll", "xkbcommon_jll"] @@ -1554,9 +1599,9 @@ version = "1.13.4+0" [[deps.libfdk_aac_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "8a22cf860a7d27e4f3498a0fe0811a7957badb38" +git-tree-sha1 = "646634dd19587a56ee2f1199563ec056c5f228df" uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280" -version = "2.0.3+0" +version = "2.0.4+0" [[deps.libinput_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "eudev_jll", "libevdev_jll", "mtdev_jll"] @@ -1566,15 +1611,15 @@ version = "1.28.1+0" [[deps.libpng_jll]] deps = ["Artifacts", 
"JLLWrappers", "Libdl", "Zlib_jll"] -git-tree-sha1 = "cd155272a3738da6db765745b89e466fa64d0830" +git-tree-sha1 = "07b6a107d926093898e82b3b1db657ebe33134ec" uuid = "b53b4c65-9356-5827-b1ea-8c7a1a84506f" -version = "1.6.49+0" +version = "1.6.50+0" [[deps.libvorbis_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Ogg_jll", "Pkg"] -git-tree-sha1 = "490376214c4721cdaca654041f635213c6165cb3" +deps = ["Artifacts", "JLLWrappers", "Libdl", "Ogg_jll"] +git-tree-sha1 = "11e1772e7f3cc987e9d3de991dd4f6b2602663a5" uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" -version = "1.3.7+2" +version = "1.3.8+0" [[deps.mtdev_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl"] @@ -1585,43 +1630,51 @@ version = "1.1.7+0" [[deps.nghttp2_jll]] deps = ["Artifacts", "Libdl"] uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" -version = "1.59.0+0" +version = "1.64.0+1" [[deps.oneTBB_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl"] -git-tree-sha1 = "d5a767a3bb77135a99e433afe0eb14cd7f6914c3" +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl"] +git-tree-sha1 = "1350188a69a6e46f799d3945beef36435ed7262f" uuid = "1317d2d5-d96f-522e-a858-c73665f53c3e" -version = "2022.0.0+0" +version = "2022.0.0+1" [[deps.p7zip_jll]] deps = ["Artifacts", "Libdl"] uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "17.4.0+2" +version = "17.5.0+2" [[deps.x264_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "4fea590b89e6ec504593146bf8b988b2c00922b2" +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "14cc7083fc6dff3cc44f2bc435ee96d06ed79aa7" uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a" -version = "2021.5.5+0" +version = "10164.0.1+0" [[deps.x265_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] -git-tree-sha1 = "ee567a171cce03570d77ad3a43e90218e38937a9" +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "e7b67590c14d487e734dcb925924c5dc43ec85f3" uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76" -version = "3.5.0+0" +version = "4.1.0+0" 
[[deps.xkbcommon_jll]] -deps = ["Artifacts", "JLLWrappers", "Libdl", "Wayland_jll", "Wayland_protocols_jll", "Xorg_libxcb_jll", "Xorg_xkeyboard_config_jll"] -git-tree-sha1 = "c950ae0a3577aec97bfccf3381f66666bc416729" +deps = ["Artifacts", "JLLWrappers", "Libdl", "Xorg_libxcb_jll", "Xorg_xkeyboard_config_jll"] +git-tree-sha1 = "fbf139bce07a534df0e699dbb5f5cc9346f95cc1" uuid = "d8fb68d0-12a3-5cfd-a85a-d49703b185fd" -version = "1.8.1+0" +version = "1.9.2+0" """ # ╔═╡ Cell order: # ╟─7259dbee-52ac-11f0-3192-dd97323b274a # ╟─d4accca6-f650-453c-bb75-a8e4cac568c1 # ╟─51dbf656-5846-440e-aff6-ea7d57e62e5c -# ╟─3cd5fd0a-6f16-4cb3-87ba-43b86224b81c -# ╟─0f77fb5f-e894-43e4-94f5-4ed93af7ba9b +# ╠═3cd5fd0a-6f16-4cb3-87ba-43b86224b81c +# ╠═cde9391a-44dd-49ee-8730-4b9ad58c3d90 +# ╠═8601e5de-180c-45b5-b0c6-1f8d807df6d0 +# ╠═e27f7b92-79f2-4351-bbc5-46d6e5a9fd67 +# ╠═0f77fb5f-e894-43e4-94f5-4ed93af7ba9b +# ╠═002b7672-9431-4510-ba89-84be098a2f9f +# ╠═b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2 +# ╠═4681accf-eaed-47e0-9d26-ab968df83c8a +# ╠═7a3a4783-8424-488d-8122-1d65680703ac +# ╠═f3ac43c7-f6d0-4aa6-9d43-c66907de4fa0 # ╟─00000000-0000-0000-0000-000000000001 # ╟─00000000-0000-0000-0000-000000000002 diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl index 93736bb..8a8dd98 100644 --- a/benchmark/runbenchmarks.jl +++ b/benchmark/runbenchmarks.jl @@ -119,7 +119,15 @@ end @info "Preparing benchmarks" warmup(SUITE; verbose=false) -tune!(SUITE) + +if isfile("params.json") + @info "Loading params" + loadparams!(SUITE, BenchmarkTools.load("params.json")[1], :evals, :samples); +else + @info "Tuning suite" + tune!(SUITE) + BenchmarkTools.save("params.json", params(SUITE)); +end reclaim_mem() @@ -127,6 +135,7 @@ reclaim_mem() results = run(SUITE, verbose=true) BenchmarkTools.save("benchmarkresults.json", median(results)) +BenchmarkTools.save("benchmarkresultsstd.json", std(results)) # save plots for each file/datatype # for l1 in keys(results) From 
59fa29e8e31897e19493b82b1971f2ae080ff2f1 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:36:39 -0400 Subject: [PATCH 03/12] Extra shape tests --- benchmark/accumulate_nd.jl | 20 +++++++++++--------- benchmark/mapreduce_nd.jl | 23 +++++++++++------------ 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/benchmark/accumulate_nd.jl b/benchmark/accumulate_nd.jl index e8747db..24f253e 100644 --- a/benchmark/accumulate_nd.jl +++ b/benchmark/accumulate_nd.jl @@ -12,17 +12,19 @@ for T in [UInt32, Int64, Float32] local randrange = T == Float32 ? T : T(1):T(100) - _group["base_dims=1"] = @benchmarkable @sb(Base.accumulate(+, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["acck_dims=1"] = @benchmarkable @sb(AK.accumulate(+, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + for (suff, (n1, n2)) in (("L", (3, 1_000_000)), ("", (512, 1000))) + _group["base_dims=1$suff"] = @benchmarkable @sb(Base.accumulate(+, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["acck_dims=1$suff"] = @benchmarkable @sb(AK.accumulate(+, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["base_dims=2"] = @benchmarkable @sb(Base.accumulate(+, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["acck_dims=2"] = @benchmarkable @sb(AK.accumulate(+, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["base_dims=2$suff"] = @benchmarkable @sb(Base.accumulate(+, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["acck_dims=2$suff"] = @benchmarkable @sb(AK.accumulate(+, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - T == Float32 || continue + T == Float32 || continue - _group["base_sincos_dims=1"] = @benchmarkable @sb(Base.accumulate(acc_f, v, init=$T(0), 
dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["acck_sincos_dims=1"] = @benchmarkable @sb(AK.accumulate(acc_f, v, init=$T(0), neutral=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["base_sincos_dims=1$suff"] = @benchmarkable @sb(Base.accumulate(acc_f, v, init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["acck_sincos_dims=1$suff"] = @benchmarkable @sb(AK.accumulate(acc_f, v, init=$T(0), neutral=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["base_sincos_dims=2"] = @benchmarkable @sb(Base.accumulate(acc_f, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["acck_sincos_dims=2"] = @benchmarkable @sb(AK.accumulate(acc_f, v, init=$T(0), neutral=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["base_sincos_dims=2$suff"] = @benchmarkable @sb(Base.accumulate(acc_f, v, init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["acck_sincos_dims=2$suff"] = @benchmarkable @sb(AK.accumulate(acc_f, v, init=$T(0), neutral=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + end end diff --git a/benchmark/mapreduce_nd.jl b/benchmark/mapreduce_nd.jl index 26578be..2aef7c1 100644 --- a/benchmark/mapreduce_nd.jl +++ b/benchmark/mapreduce_nd.jl @@ -1,24 +1,23 @@ group = addgroup!(SUITE, "mapreduce_nd") -n1 = 3 -n2 = 1_000_000 - for T in [UInt32, Int64, Float32] local _group = addgroup!(group, "$T") local randrange = T == Float32 ? 
T : T(1):T(100) - _group["base_dims=1"] = @benchmarkable @sb(Base.reduce(+, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["acck_dims=1"] = @benchmarkable @sb(AK.reduce(+, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + for (suff, (n1, n2)) in (("L", (3, 1_000_000)), ("", (512, 1000))) + _group["base_dims=1$(suff)"] = @benchmarkable @sb(Base.reduce(+, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["acck_dims=1$(suff)"] = @benchmarkable @sb(AK.reduce(+, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["base_dims=2"] = @benchmarkable @sb(Base.reduce(+, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["acck_dims=2"] = @benchmarkable @sb(AK.reduce(+, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["base_dims=2$(suff)"] = @benchmarkable @sb(Base.reduce(+, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["acck_dims=2$(suff)"] = @benchmarkable @sb(AK.reduce(+, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - T == Float32 || continue + T == Float32 || continue - _group["base_dims=1_sin"] = @benchmarkable @sb(Base.mapreduce(sin, +, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["acck_dims=1_sin"] = @benchmarkable @sb(AK.mapreduce(sin, +, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["base_dims=1$(suff)_sin"] = @benchmarkable @sb(Base.mapreduce(sin, +, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["acck_dims=1$(suff)_sin"] = @benchmarkable @sb(AK.mapreduce(sin, +, v; init=$T(0), dims=1)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) - _group["base_dims=2_sin"] = @benchmarkable @sb(Base.mapreduce(sin, +, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, 
n1, n2))) - _group["acck_dims=2_sin"] = @benchmarkable @sb(AK.mapreduce(sin, +, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["base_dims=2$(suff)_sin"] = @benchmarkable @sb(Base.mapreduce(sin, +, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + _group["acck_dims=2$(suff)_sin"] = @benchmarkable @sb(AK.mapreduce(sin, +, v; init=$T(0), dims=2)) setup=(v = ArrayType(rand(rng, $randrange, n1, n2))) + end end From 3321852e3f4e26b369812f3918fee31993279109 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:37:14 -0400 Subject: [PATCH 04/12] Support specifying benchmark output file --- benchmark/runbenchmarks.jl | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/benchmark/runbenchmarks.jl b/benchmark/runbenchmarks.jl index 8a8dd98..4e59dc5 100644 --- a/benchmark/runbenchmarks.jl +++ b/benchmark/runbenchmarks.jl @@ -10,6 +10,7 @@ rng = StableRNG(123) # parse command line args BACKENDS = ["--CUDA", "--oneAPI", "--AMDGPU", "--Metal", "--OpenCL", "--CPU"] b_opt_idx = in.(ARGS, Ref(BACKENDS)) +out_opt_idx = findall(x -> endswith(x, ".json",), ARGS) if !@isdefined(backend_arg) backend_arg = if sum(b_opt_idx) == 0 @@ -22,7 +23,9 @@ if !@isdefined(backend_arg) end backend_arg in BACKENDS || throw(ArgumentError("\"$backend_arg\" is not a valid backend.")) -other_args = ARGS[.!b_opt_idx] +other_args_idx = copy(b_opt_idx) +other_args_idx[out_opt_idx] .= true +other_args = ARGS[.!other_args_idx] # other_args = ["accumulate_1"] bench_to_include = isempty(other_args) ? 
nothing : other_args @@ -134,8 +137,15 @@ reclaim_mem() @info "Running benchmarks" results = run(SUITE, verbose=true) -BenchmarkTools.save("benchmarkresults.json", median(results)) -BenchmarkTools.save("benchmarkresultsstd.json", std(results)) + +result_file = if isempty((out_opt_idx)) + "benchmarkresults" +else + first(splitext(ARGS[first(out_opt_idx)])) +end + +BenchmarkTools.save("$(result_file).json", median(results)) +BenchmarkTools.save("$(result_file)std.json", std(results)) # save plots for each file/datatype # for l1 in keys(results) From 73acfe21b4821c07098829f302c79bcab3a6b95b Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:56:14 -0400 Subject: [PATCH 05/12] Show standard error and more improvements --- benchmark/benchmark_graphs_nb.jl | 102 ++++++++++--------------------- 1 file changed, 33 insertions(+), 69 deletions(-) diff --git a/benchmark/benchmark_graphs_nb.jl b/benchmark/benchmark_graphs_nb.jl index 6ee481a..4007a91 100644 --- a/benchmark/benchmark_graphs_nb.jl +++ b/benchmark/benchmark_graphs_nb.jl @@ -27,18 +27,13 @@ md""" """ end -# ╔═╡ 8601e5de-180c-45b5-b0c6-1f8d807df6d0 - - -# ╔═╡ e27f7b92-79f2-4351-bbc5-46d6e5a9fd67 - - # ╔═╡ 0f77fb5f-e894-43e4-94f5-4ed93af7ba9b begin - function plot_benches(df, cat, t; ylabel="Time (ns)") + function plot_benches(df, cat, t; ylabel="Time (ns)", kwargs...) df = filter(x -> isequal(cat, x.Category), df) df = filter(x -> isequal(t, x.T), df) - groupedbar(df.Time; group=df.alg, ylabel, title="$cat/$t", xticks=(1:length(unique(df.test)),unique(df.test)), xtickfontsize=6, xrotation = 30) + yerror = "Std" in names(df) ? df.Std : nothing + groupedbar(df.Time; group=df.alg, ylabel, yerror, title="$cat/$t", xticks=(1:length(unique(df.test)),unique(df.test)), xtickfontsize=6, xrotation = 30, kwargs...) end function getbenches(res) _res = res[2][1] @@ -60,34 +55,42 @@ begin end end make_res_df(file) = make_res_df(x -> startswith(x, "base") ? 
Symbol("1Base") : Symbol("2AccK"), file) - function make_res_df(alg_f, file) + function _make_res_df(alg_f, file) benchresults = JSON.parsefile(file) benchres_df = getbenches(benchresults) benchres_df.alg .= [alg_f(x) for x in benchres_df.Bench] benchres_df.test .= [x[6:end] for x in benchres_df.Bench] - - sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) return benchres_df end + function make_res_df(alg_f, file, filestd=true) + benchres_df = _make_res_df(alg_f, file) + + final_df = if filestd + sbenchres_df = _make_res_df(alg_f, "$(first(splitext(file)))std.json") + rename!(sbenchres_df, :Time => :Std) + + innerjoin(benchres_df, sbenchres_df; on=[:Category, :T, :Bench, :alg, :test]) + else + benchres_df + end + final_df = sort!(final_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) + + remove_trail(x) = first(split(x, "_")) + final_df.Category .= remove_trail.(final_df.Category) + + sort!(final_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) + return final_df + end end -# ╔═╡ 002b7672-9431-4510-ba89-84be098a2f9f -# begin -# benchresultspre = JSON.parsefile("benchmarkresultsstd.json") -# benchrespre_df = getbenches(benchresultspre) -# benchrespre_df.alg .= [startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK") for x in benchrespre_df.Bench] -# benchrespre_df.test .= [x[6:end] for x in benchrespre_df.Bench] - -# benchrespre_df.alg .= [x == Symbol("1Base") ? Symbol("2Base") : Symbol("4AccK") for x in benchrespre_df.alg] - -# sort!(benchrespre_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) -# end - # ╔═╡ b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2 begin - benchrespre_df = make_res_df(x->startswith(x, "base") ? Symbol("1BasePre") : Symbol("4AccKPre"), "benchmarkresultspre.json") - benchres_df = make_res_df(x->startswith(x, "base") ? 
Symbol("2Base") : Symbol("5AccK"), "benchmarkresultsnew.json") - benchresi32_df = make_res_df(x->startswith(x, "base") ? Symbol("3BaseI32") : Symbol("6AccKI32"), "benchmarkresults.json") + benchreska09_df = make_res_df(x->startswith(x, "base") ? Symbol("1Baseka0.9") : Symbol("5AccKka0.9"), "benchmarkresultska0.9.json") + benchreska10_df = make_res_df(x->startswith(x, "base") ? Symbol("2Baseka0.10") : Symbol("6AccKka0.10"), "benchmarkresultska0.10.json") + benchreski10_df = make_res_df(x->startswith(x, "base") ? Symbol("3Baseki0.10") : Symbol("7AccKki0.10"), "benchmarkresultski0.10.json") + benchreski10heur_df = make_res_df(x->startswith(x, "base") ? Symbol("4Baseki0.10heur") : Symbol("8AccKkiheur0.10"), "benchmarkresultski0.10heur.json") + benchres_df = [benchreska09_df;benchreska10_df;benchreski10_df;benchreski10heur_df] + benchres_df = benchres_df[.!occursin.(Ref("sin"), benchres_df.Bench), :] end # ╔═╡ d4accca6-f650-453c-bb75-a8e4cac568c1 @@ -100,40 +103,8 @@ md" Type: $(@bind typ Select(unique(benchres_df.T);)) " -# ╔═╡ 3cd5fd0a-6f16-4cb3-87ba-43b86224b81c -plot_benches(benchres_df, cat, typ) - # ╔═╡ cde9391a-44dd-49ee-8730-4b9ad58c3d90 -plot_benches([benchres_df;benchrespre_df;benchresi32_df], cat, typ) - -# ╔═╡ 4681accf-eaed-47e0-9d26-ab968df83c8a -# begin -# make_res_df(file) = make_res_df(x -> startswith(x, "base") ? Symbol("1Base") : Symbol("2AccK"), file) -# function make_res_df(alg_f, file) -# benchresults = JSON.parsefile("benchmarkresultsstd.json") -# benchres_df = getbenches(benchresults) -# benchres_df.alg .= [alg_f(x) for x in benchres_df.Bench] -# benchres_df.test .= [x[6:end] for x in benchres_df.Bench] - -# sort!(benchres_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) -# return benchres_df -# end -# end - -# ╔═╡ 7a3a4783-8424-488d-8122-1d65680703ac -# begin -# benchresultspre = JSON.parsefile("benchmarkresultsstd.json") -# benchrespre_df = getbenches(benchresultspre) -# benchrespre_df.alg .= [startswith(x, "base") ? 
Symbol("1Base") : Symbol("2AccK") for x in benchrespre_df.Bench] -# benchrespre_df.test .= [x[6:end] for x in benchrespre_df.Bench] - -# benchrespre_df.alg .= [x == Symbol("1Base") ? Symbol("2Base") : Symbol("4AccK") for x in benchrespre_df.alg] - -# sort!(benchrespre_df, [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) -# end - -# ╔═╡ f3ac43c7-f6d0-4aa6-9d43-c66907de4fa0 -benchresboth_df = sort!([benchres_df;benchrespre_df], [:Category, :T, order(:Bench, by=length), :test, order(:alg, rev=true)]) +plot_benches(benchres_df, cat, typ) # ╔═╡ 00000000-0000-0000-0000-000000000001 PLUTO_PROJECT_TOML_CONTENTS = """ @@ -1666,15 +1637,8 @@ version = "1.9.2+0" # ╟─7259dbee-52ac-11f0-3192-dd97323b274a # ╟─d4accca6-f650-453c-bb75-a8e4cac568c1 # ╟─51dbf656-5846-440e-aff6-ea7d57e62e5c -# ╠═3cd5fd0a-6f16-4cb3-87ba-43b86224b81c -# ╠═cde9391a-44dd-49ee-8730-4b9ad58c3d90 -# ╠═8601e5de-180c-45b5-b0c6-1f8d807df6d0 -# ╠═e27f7b92-79f2-4351-bbc5-46d6e5a9fd67 -# ╠═0f77fb5f-e894-43e4-94f5-4ed93af7ba9b -# ╠═002b7672-9431-4510-ba89-84be098a2f9f -# ╠═b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2 -# ╠═4681accf-eaed-47e0-9d26-ab968df83c8a -# ╠═7a3a4783-8424-488d-8122-1d65680703ac -# ╠═f3ac43c7-f6d0-4aa6-9d43-c66907de4fa0 +# ╟─cde9391a-44dd-49ee-8730-4b9ad58c3d90 +# ╟─0f77fb5f-e894-43e4-94f5-4ed93af7ba9b +# ╟─b6c92a3f-2b4d-4ebf-82fc-09b1f21531d2 # ╟─00000000-0000-0000-0000-000000000001 # ╟─00000000-0000-0000-0000-000000000002 From 8dce5fa5654996e2150903e693de631f69586349 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:57:25 -0400 Subject: [PATCH 06/12] Implement algorithm selection heuristic --- src/accumulate/accumulate_nd.jl | 17 +++++++++-------- src/reduce/mapreduce_nd.jl | 16 +++++++++------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl index d045948..77458d0 100644 --- a/src/accumulate/accumulate_nd.jl +++ 
b/src/accumulate/accumulate_nd.jl @@ -38,17 +38,18 @@ function accumulate_nd!( if !use_gpu_algorithm(backend, prefer_threads) _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems) else - # On GPUs we have two parallelisation approaches, based on which dimension has more elements: - # - If the dimension we are accumulating along has more elements than the "outer" dimensions, - # (e.g. accumulate(+, rand(3, 1000), dims=2)), we use a block of threads per outer - # dimension - thus, a block of threads reduces the dims axis - # - If the other dimensions have more elements (e.g. reduce(+, rand(3, 1000), dims=1)), we - # use a single thread per outer dimension - thus, a thread reduces the dims axis - # sequentially, while the other dimensions are processed in parallel, independently + # On GPUs we have two parallelisation approaches, based on destination dimension and current hardware: + # - If the other dimensions have more elements than the product of the device's compute units and + # maximum number of threads , we use a single thread per outer dimension - thus, a thread reduces + # the dims axis sequentially, while the other dimensions are processed in parallel, independently + # - If the dimension we are accumulating along has more elements, we use a block of threads per outer + # element - thus, a block of threads reduces the dims axis length_dims = vsizes[dims] length_outer = length(v) ÷ length_dims - if length_outer >= length_dims + serial_threshold = KI.max_work_group_size(backend) * KI.multiprocessor_count(backend) + + if length_outer >= serial_threshold # One thread per outer dimension blocks = (length_outer + block_size - 1) ÷ block_size KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_thread!( diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl index 71fad0d..da00582 100644 --- a/src/reduce/mapreduce_nd.jl +++ b/src/reduce/mapreduce_nd.jl @@ -123,13 +123,15 @@ function mapreduce_nd( 
min_elems=min_elems, ) else - # On GPUs we have two parallelisation approaches, based on which dimension has more elements: - # - If the dimension we are reducing has more elements, (e.g. reduce(+, rand(3, 1000), dims=2)), - # we use a block of threads per dst element - thus, a block of threads reduces the dims axis - # - If the other dimensions have more elements (e.g. reduce(+, rand(3, 1000), dims=1)), we - # use a single thread per dst element - thus, a thread reduces the dims axis sequentially, - # while the other dimensions are processed in parallel, independently - if dst_size >= src_sizes[dims] + # On GPUs we have two parallelisation approaches, based on destination dimension and current hardware: + # - If the other dimensions have more elements than the product of the device's compute units and + # maximum number of threads , we use a single thread per dst element - thus, a thread reduces + # the dims axis sequentially, while the other dimensions are processed in parallel, independently + # - If the dimension we are reducing has more elements, we use a block of threads per dst + # element - thus, a block of threads reduces the dims axis + by_thread_threshold = KI.max_work_group_size(backend) * KI.multiprocessor_count(backend) + + if dst_size >= by_thread_threshold blocks = (dst_size + block_size - 1) ÷ block_size KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_thread!( src, dst, f, op, init, dims, Val(block_size) From e8bc09cee2fae5f1f379bd10bbd75b63b8bb6d36 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Sun, 16 Nov 2025 15:27:53 -0400 Subject: [PATCH 07/12] Only check `block_size` argument when on GPU --- src/accumulate/accumulate_nd.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl index 77458d0..a078f12 100644 --- a/src/accumulate/accumulate_nd.jl +++ 
b/src/accumulate/accumulate_nd.jl @@ -13,10 +13,6 @@ function accumulate_nd!( # GPU settings block_size::Int, ) - # Correctness checks - @argcheck block_size > 0 - @argcheck ispow2(block_size) - # Degenerate cases begin; order of priority matters # Invalid dims @@ -38,6 +34,10 @@ function accumulate_nd!( if !use_gpu_algorithm(backend, prefer_threads) _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems) else + # Correctness checks + @argcheck block_size > 0 + @argcheck ispow2(block_size) + # On GPUs we have two parallelisation approaches, based on destination dimension and current hardware: # - If the other dimensions have more elements than the product of the device's compute units and # maximum number of threads , we use a single thread per outer dimension - thus, a thread reduces From e12445ab0b4817bc104164883d044618a8abb1a5 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Sun, 16 Nov 2025 16:39:53 -0400 Subject: [PATCH 08/12] jyf --- src/AcceleratedKernels.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/AcceleratedKernels.jl b/src/AcceleratedKernels.jl index d4655de..5f14229 100644 --- a/src/AcceleratedKernels.jl +++ b/src/AcceleratedKernels.jl @@ -14,6 +14,7 @@ module AcceleratedKernels using ArgCheck: @argcheck using GPUArraysCore: AnyGPUArray, @allowscalar using KernelAbstractions +import KernelAbstractions.KernelIntrinsics as KI import UnsafeAtomics From dcb01d9af68bee711a112c96e24290362c543a46 Mon Sep 17 00:00:00 2001 From: Christian <28689358+christiangnrd@users.noreply.github.com> Date: Sun, 16 Nov 2025 16:42:47 -0400 Subject: [PATCH 09/12] Progress --- benchmark/Project.toml | 1 + src/accumulate/accumulate.jl | 8 ++--- src/accumulate/accumulate_1d_cpu.jl | 2 +- src/accumulate/accumulate_1d_gpu.jl | 4 +-- src/accumulate/accumulate_nd.jl | 36 +++++++++++++-------- src/foreachindex.jl | 31 +++++++++--------- src/reduce/mapreduce_1d_cpu.jl | 2 +- 
src/reduce/mapreduce_1d_gpu.jl | 49 +++++++++++++++++++---------- src/reduce/mapreduce_nd.jl | 34 +++++++++++++------- src/reduce/reduce.jl | 6 ++-- src/reduce/utilities.jl | 2 +- src/utils.jl | 4 +++ 12 files changed, 110 insertions(+), 69 deletions(-) diff --git a/benchmark/Project.toml b/benchmark/Project.toml index a441514..1b8be73 100644 --- a/benchmark/Project.toml +++ b/benchmark/Project.toml @@ -2,6 +2,7 @@ AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1" BenchmarkPlots = "ab8c0f59-4072-4e0d-8f91-a91e1495eb26" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" diff --git a/src/accumulate/accumulate.jl b/src/accumulate/accumulate.jl index e60be03..f9858c3 100644 --- a/src/accumulate/accumulate.jl +++ b/src/accumulate/accumulate.jl @@ -43,7 +43,7 @@ include("accumulate_nd.jl") alg::AccumulateAlgorithm=ScanPrefixes(), # GPU settings - block_size::Int=256, + block_size::Union{Nothing, Int}=nothing, temp::Union{Nothing, AbstractArray}=nothing, temp_flags::Union{Nothing, AbstractArray}=nothing, ) @@ -63,7 +63,7 @@ include("accumulate_nd.jl") alg::AccumulateAlgorithm=ScanPrefixes(), # GPU settings - block_size::Int=256, + block_size::Union{Nothing, Int}=nothing, temp::Union{Nothing, AbstractArray}=nothing, temp_flags::Union{Nothing, AbstractArray}=nothing, ) @@ -162,7 +162,7 @@ function _accumulate_impl!( prefer_threads::Bool=true, # GPU settings - block_size::Int=256, + block_size::Union{Nothing, Int}=nothing, temp::Union{Nothing, AbstractArray}=nothing, temp_flags::Union{Nothing, AbstractArray}=nothing, ) @@ -209,7 +209,7 @@ end alg::AccumulateAlgorithm=ScanPrefixes(), # GPU settings - block_size::Int=256, + block_size::Union{Nothing, Int}=nothing, temp::Union{Nothing, AbstractArray}=nothing, temp_flags::Union{Nothing, AbstractArray}=nothing, ) 
diff --git a/src/accumulate/accumulate_1d_cpu.jl b/src/accumulate/accumulate_1d_cpu.jl index 9f45ada..3d7b98c 100644 --- a/src/accumulate/accumulate_1d_cpu.jl +++ b/src/accumulate/accumulate_1d_cpu.jl @@ -9,7 +9,7 @@ function accumulate_1d_cpu!( min_elems::Int, # GPU settings - not used - block_size::Int, + block_size::Union{Nothing, Int}, temp::Union{Nothing, AbstractArray}, temp_flags::Union{Nothing, AbstractArray}, ) diff --git a/src/accumulate/accumulate_1d_gpu.jl b/src/accumulate/accumulate_1d_gpu.jl index e1dc0f8..2fb226b 100644 --- a/src/accumulate/accumulate_1d_gpu.jl +++ b/src/accumulate/accumulate_1d_gpu.jl @@ -271,7 +271,7 @@ function accumulate_1d_gpu!( min_elems::Int, # GPU settings - block_size::Int, + block_size::Union{Nothing, Int}=256, temp::Union{Nothing, AbstractArray}, temp_flags::Union{Nothing, AbstractArray}, ) @@ -326,7 +326,7 @@ function accumulate_1d_gpu!( min_elems::Int, # GPU settings - block_size::Int, + block_size::Union{Nothing, Int}=256, temp::Union{Nothing, AbstractArray}, temp_flags::Union{Nothing, AbstractArray}, ) diff --git a/src/accumulate/accumulate_nd.jl b/src/accumulate/accumulate_nd.jl index a078f12..2cc70e5 100644 --- a/src/accumulate/accumulate_nd.jl +++ b/src/accumulate/accumulate_nd.jl @@ -11,7 +11,7 @@ function accumulate_nd!( prefer_threads::Bool=true, # GPU settings - block_size::Int, + block_size::Union{Nothing, Int}, ) # Degenerate cases begin; order of priority matters @@ -35,9 +35,10 @@ function accumulate_nd!( _accumulate_nd_cpu_sections!(op, v; init, dims, inclusive, max_tasks, min_elems) else # Correctness checks - @argcheck block_size > 0 - @argcheck ispow2(block_size) - + max_block_size = get_max_block_size(backend, block_size) + @argcheck max_block_size > 0 + @argcheck ispow2(max_block_size) + # On GPUs we have two parallelisation approaches, based on destination dimension and current hardware: # - If the other dimensions have more elements than the product of the device's compute units and # maximum number 
of threads , we use a single thread per outer dimension - thus, a thread reduces @@ -50,16 +51,25 @@ function accumulate_nd!( serial_threshold = KI.max_work_group_size(backend) * KI.multiprocessor_count(backend) if length_outer >= serial_threshold - # One thread per outer dimension - blocks = (length_outer + block_size - 1) ÷ block_size - KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_thread!( - v, op, init, dims, inclusive, Val(block_size) + kernel = KI.@kernel backend launch = false _accumulate_nd_by_thread!( + v, op, init, dims, inclusive + ) + workgroupsize = block_size_pow_2(kernel, block_size) + numworkgroups = (length_outer + workgroupsize - 1) ÷ workgroupsize + kernel( + v, op, init, dims, inclusive; workgroupsize, numworkgroups ) else + kernel = KI.@kernel backend launch = false _accumulate_nd_by_block!( + v, op, init, neutral, dims, inclusive, Val(max_block_size) + ) + + workgroupsize = block_size_pow_2(kernel, block_size) + # One block per outer dimension - blocks = length_outer - KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _accumulate_nd_by_block!( - v, op, init, neutral, dims, inclusive, Val(block_size) + numworkgroups = length_outer + kernel( + v, op, init, neutral, dims, inclusive, Val(max_block_size); workgroupsize, numworkgroups ) end end @@ -120,8 +130,7 @@ end function _accumulate_nd_by_thread!( v, op, init, dims, inclusive, - ::Val{block_size} -) where block_size +) @inbounds begin # One thread per outer dimension element, when there are more outer elements than in the # reduced dim e.g. 
accumulate(+, rand(3, 1000), dims=1) => only 3 elements in the accumulated @@ -142,6 +151,7 @@ function _accumulate_nd_by_thread!( # Group (block) and local (thread) indices iblock = KI.get_group_id().x - 0x1 ithread = KI.get_local_id().x - 0x1 + block_size = KI.get_local_size().x # Each thread handles one outer element tid = ithread + iblock * block_size diff --git a/src/foreachindex.jl b/src/foreachindex.jl index b6409b6..2963568 100644 --- a/src/foreachindex.jl +++ b/src/foreachindex.jl @@ -1,11 +1,5 @@ -function _forindices_global!(f, indices, ::Val{N}) where N - - # Calculate global index - iblock = KI.get_group_id().x - ithread = KI.get_local_id().x - i = ithread + (iblock - 0x1) * N - # i = get_global_id().x - +function _forindices_global!(f, indices) + i = KI.get_global_id().x if i <= length(indices) f(indices[i]) @@ -19,12 +13,17 @@ function _forindices_gpu( indices, backend::Backend; - block_size::Int=256, + block_size::Union{Nothing, Int}=nothing, ) # GPU implementation - @argcheck block_size > 0 - blocks = max((length(indices) + block_size - 1) ÷ block_size, 1) - KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _forindices_global!(f, indices, Val(block_size)) + max_block_size = get_max_block_size(backend, block_size) + @argcheck max_block_size > 0 + kernel = KI.@kernel backend launch = false _forindices_global!(f, indices) + + workgroupsize = isnothing(block_size) ? KI.kernel_max_work_group_size(kernel) : block_size + numworkgroups = max((length(indices) + workgroupsize - 1) ÷ workgroupsize, 1) + + kernel(f, indices; workgroupsize, numworkgroups) end @@ -50,7 +49,7 @@ end min_elems=1, # GPU settings - block_size=256, + block_size::Union{Nothing, Int}=nothing, ) Parallelised `for` loop over the indices of an iterable. 
@@ -129,7 +128,7 @@ function foreachindex( prefer_threads::Bool=true, # GPU settings - block_size=256, + block_size::Union{Nothing, Int}=nothing, ) if use_gpu_algorithm(backend, prefer_threads) _forindices_gpu(f, eachindex(itr), backend; block_size) @@ -148,7 +147,7 @@ end min_elems=1, # GPU settings - block_size=256, + block_size::Union{Nothing, Int}=nothing, ) Parallelised `for` loop over the indices along axis `dims` of an iterable. @@ -223,7 +222,7 @@ function foraxes( prefer_threads::Bool=true, # GPU settings - block_size=256, + block_size::Union{Nothing, Int}=nothing, ) if isnothing(dims) return foreachindex( diff --git a/src/reduce/mapreduce_1d_cpu.jl b/src/reduce/mapreduce_1d_cpu.jl index 95a93f2..43233ca 100644 --- a/src/reduce/mapreduce_1d_cpu.jl +++ b/src/reduce/mapreduce_1d_cpu.jl @@ -8,7 +8,7 @@ function mapreduce_1d_cpu( min_elems::Int, # GPU settings - ignored here - block_size::Int, + block_size::Union{Nothing, Int}, temp::Union{Nothing, AbstractArray}, switch_below::Int, ) diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl index bb0a225..906676f 100644 --- a/src/reduce/mapreduce_1d_gpu.jl +++ b/src/reduce/mapreduce_1d_gpu.jl @@ -1,6 +1,7 @@ function _mapreduce_block!(src, dst, f, op, neutral, ::Val{N}) where N @inbounds begin sdata = KI.localmemory(eltype(dst), N) + N_actual = KI.get_local_size().x len = length(src) @@ -13,18 +14,18 @@ function _mapreduce_block!(src, dst, f, op, neutral, ::Val{N}) where N iblock = KI.get_group_id().x - 0x1 ithread = KI.get_local_id().x - 0x1 - i = ithread + iblock * (N * 0x2) + i = ithread + iblock * (N_actual * 0x2) if i >= len sdata[ithread + 0x1] = neutral - elseif i + N >= len + elseif i + N_actual >= len sdata[ithread + 0x1] = f(src[i + 0x1]) else - sdata[ithread + 0x1] = op(f(src[i + 0x1]), f(src[i + N + 0x1])) + sdata[ithread + 0x1] = op(f(src[i + 0x1]), f(src[i + N_actual + 0x1])) end KI.barrier() - @inline reduce_group!(op, sdata, N, ithread) + @inline reduce_group!(op, sdata, 
N_actual, ithread) # Code below would work on NVidia GPUs with warp size of 32, but create race conditions and # return incorrect results on Intel Graphics. It would be useful to have a way to statically @@ -57,11 +58,13 @@ function mapreduce_1d_gpu( min_elems::Int, # GPU settings - block_size::Int, + block_size::Union{Nothing, Int}, temp::Union{Nothing, AbstractArray}, switch_below::Int, ) - @argcheck 1 <= block_size <= 1024 + min_block_size = 16 + max_block_size = min(1024, get_max_block_size(backend, block_size)) + @argcheck 1 <= max_block_size <= 1024 @argcheck switch_below >= 0 # Degenerate cases @@ -74,29 +77,37 @@ function mapreduce_1d_gpu( end # Each thread will handle two elements - num_per_block = 2 * block_size - blocks = (len + num_per_block - 1) ÷ num_per_block + # max_num_per_block = 2 * max_block_size + min_num_per_block = 2 * min_block_size + max_blocks = (len + min_num_per_block - 1) ÷ min_num_per_block if !isnothing(temp) @argcheck get_backend(temp) === backend @argcheck eltype(temp) === typeof(init) - @argcheck length(temp) >= blocks * 2 + @argcheck length(temp) >= max_blocks * 2 dst = temp else # Figure out type for destination dst_type = typeof(init) - dst = KernelAbstractions.allocate(backend, dst_type, blocks * 2) + dst = KernelAbstractions.allocate(backend, dst_type, max_blocks * 2) end # Later the kernel will be compiled for views anyways, so use same types src_view = @view src[1:end] - dst_view = @view dst[1:blocks] + dst_view = @view dst[1:max_blocks] - KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_block!(src_view, dst_view, f, op, neutral, Val(block_size)) + kernel = KI.@kernel backend launch = false _mapreduce_block!(src_view, dst_view, f, op, neutral, Val(max_block_size)) + + workgroupsize = block_size_pow_2(kernel, block_size) + numworkgroups = (len + workgroupsize - 1) ÷ workgroupsize + + dst_view = @view dst[1:numworkgroups] + + kernel(src_view, dst_view, f, op, neutral, Val(max_block_size); 
numworkgroups, workgroupsize) # As long as we still have blocks to process, swap between the src and dst pointers at # the beginning of the first and second halves of dst - len = blocks + len = numworkgroups if len < switch_below h_src = Vector(@view(dst[1:len])) return Base.reduce(op, h_src; init) @@ -104,14 +115,18 @@ function mapreduce_1d_gpu( # Now all src elements have been passed through f; just do final reduction, no map needed p1 = @view dst[1:len] - p2 = @view dst[blocks + 1:end] + p2 = @view dst[numworkgroups + 1:end] while len > 1 - blocks = (len + num_per_block - 1) ÷ num_per_block + kernel = KI.@kernel backend launch = false _mapreduce_block!(p1, p2, identity, op, neutral, Val(max_block_size)) + + workgroupsize = block_size_pow_2(kernel, block_size) + numworkgroups = (len + workgroupsize - 1) ÷ workgroupsize + + kernel(p1, p2, identity, op, neutral, Val(max_block_size); numworkgroups, workgroupsize) # Each block produces one reduced value - KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_block!(p1, p2, identity, op, neutral, Val(block_size)) - len = blocks + len = numworkgroups if len < switch_below h_src = Vector(@view(p2[1:len])) diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl index da00582..e74eb28 100644 --- a/src/reduce/mapreduce_nd.jl +++ b/src/reduce/mapreduce_nd.jl @@ -10,10 +10,12 @@ function mapreduce_nd( prefer_threads::Bool=true, # GPU settings - block_size::Int, + block_size::Union{Nothing, Int}, temp::Union{Nothing, AbstractArray}, ) - @argcheck 1 <= block_size <= 1024 + + max_block_size = min(1024, get_max_block_size(backend, block_size)) + @argcheck 1 <= max_block_size <= 1024 # Degenerate cases begin; order of priority matters @@ -132,15 +134,25 @@ function mapreduce_nd( by_thread_threshold = KI.max_work_group_size(backend) * KI.multiprocessor_count(backend) if dst_size >= by_thread_threshold - blocks = (dst_size + block_size - 1) ÷ block_size - KI.@kernel backend 
workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_thread!( - src, dst, f, op, init, dims, Val(block_size) + kernel = KI.@kernel backend launch = false _mapreduce_nd_by_thread!( + src, dst, f, op, init, dims + ) + workgroupsize = isnothing(block_size) ? KI.kernel_max_work_group_size(kernel) : block_size + numworkgroups = (dst_size + workgroupsize - 1) ÷ workgroupsize + kernel( + src, dst, f, op, init, dims; workgroupsize, numworkgroups ) else + kernel = KI.@kernel backend launch = false _mapreduce_nd_by_block!( + src, dst, f, op, init, neutral, dims, Val(max_block_size) + ) + + workgroupsize = isnothing(block_size) ? KI.kernel_max_work_group_size(kernel) : block_size + # One block per output element - blocks = dst_size - KI.@kernel backend workgroupsize=block_size numworkgroups=blocks _mapreduce_nd_by_block!( - src, dst, f, op, init, neutral, dims, Val(block_size) + numworkgroups = dst_size + kernel( + src, dst, f, op, init, neutral, dims, Val(max_block_size); workgroupsize, numworkgroups ) end end @@ -194,9 +206,8 @@ end function _mapreduce_nd_by_thread!( src, dst, f, op, - init, dims, - ::Val{N} -) where N + init, dims +) # One thread per output element, when there are more outer elements than in the reduced dim # e.g. 
reduce(+, rand(3, 1000), dims=1) => only 3 elements in the reduced dim src_sizes = size(src) @@ -217,6 +228,7 @@ function _mapreduce_nd_by_thread!( # Group (block) and local (thread) indices iblock = KI.get_group_id().x - 0x1 ithread = KI.get_local_id().x - 0x1 + N = KI.get_local_size().x # Each thread handles one output element tid = ithread + iblock * N diff --git a/src/reduce/reduce.jl b/src/reduce/reduce.jl index 230ded1..4b33526 100644 --- a/src/reduce/reduce.jl +++ b/src/reduce/reduce.jl @@ -17,7 +17,7 @@ include("mapreduce_nd.jl") min_elems::Int=1, # GPU settings - block_size::Int=256, + block_size::Union{Nothing, Int}=nothing, temp::Union{Nothing, AbstractArray}=nothing, switch_below::Int=0, ) @@ -100,7 +100,7 @@ end min_elems::Int=1, # GPU settings - block_size::Int=256, + block_size::Union{Nothing, Int}=nothing, temp::Union{Nothing, AbstractArray}=nothing, switch_below::Int=0, ) @@ -178,7 +178,7 @@ function _mapreduce_impl( prefer_threads::Bool=true, # GPU settings - block_size::Int=256, + block_size::Union{Nothing, Int}=nothing, temp::Union{Nothing, AbstractArray}=nothing, switch_below::Int=0, ) diff --git a/src/reduce/utilities.jl b/src/reduce/utilities.jl index ad099a5..035fabc 100644 --- a/src/reduce/utilities.jl +++ b/src/reduce/utilities.jl @@ -37,7 +37,7 @@ function _mapreduce_nd_apply_init!( init, max_tasks=Threads.nthreads(), min_elems=1, - block_size=256, + block_size=nothing, ) foreachindex(dst, backend; max_tasks, min_elems, block_size) do i dst[i] = op(init, f(src[i])) diff --git a/src/utils.jl b/src/utils.jl index d644120..9c405ce 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -8,6 +8,10 @@ const CPU_BACKEND = get_backend([]) return backend != CPU_BACKEND || !prefer_threads end +@inline get_max_block_size(backend, block_size) = isnothing(block_size) ? KI.max_work_group_size(backend) : block_size +@inline block_size_pow_2(kernel, block_size) = isnothing(block_size) ? 
2^floor(Int, log2(KI.kernel_max_work_group_size(kernel))) : block_size + + """ struct TypeWrap{T} end TypeWrap(T) = TypeWrap{T}() From a4610f7354afb3fd60267308ddb438cc9d1f8559 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 19 Nov 2025 20:56:53 -0400 Subject: [PATCH 10/12] Fix test on Metal --- test/accumulate.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/accumulate.jl b/test/accumulate.jl index 5022e38..41baa25 100644 --- a/test/accumulate.jl +++ b/test/accumulate.jl @@ -175,7 +175,7 @@ end init = rand(-1000:1000) s = AK.accumulate(+, v; prefer_threads, init=Float32(init), dims) sh = Array(s) - @test all(sh .≈ accumulate(+, vh; init=Float32(init), dims)) + @test sh ≈ accumulate(+, vh; init=Float32(init), dims) end end From 4ba365c0f7f848896565fe49a6461436166c20e5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 19 Nov 2025 20:57:26 -0400 Subject: [PATCH 11/12] Fix? 
--- src/reduce/mapreduce_1d_gpu.jl | 8 ++++++-- src/reduce/mapreduce_nd.jl | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/reduce/mapreduce_1d_gpu.jl b/src/reduce/mapreduce_1d_gpu.jl index 906676f..411a094 100644 --- a/src/reduce/mapreduce_1d_gpu.jl +++ b/src/reduce/mapreduce_1d_gpu.jl @@ -99,7 +99,9 @@ function mapreduce_1d_gpu( kernel = KI.@kernel backend launch = false _mapreduce_block!(src_view, dst_view, f, op, neutral, Val(max_block_size)) workgroupsize = block_size_pow_2(kernel, block_size) - numworkgroups = (len + workgroupsize - 1) ÷ workgroupsize + + num_per_block = 2 * workgroupsize + numworkgroups = (len + num_per_block - 1) ÷ num_per_block dst_view = @view dst[1:numworkgroups] @@ -121,7 +123,9 @@ function mapreduce_1d_gpu( kernel = KI.@kernel backend launch = false _mapreduce_block!(p1, p2, identity, op, neutral, Val(max_block_size)) workgroupsize = block_size_pow_2(kernel, block_size) - numworkgroups = (len + workgroupsize - 1) ÷ workgroupsize + + num_per_block = 2 * workgroupsize + numworkgroups = (len + num_per_block - 1) ÷ num_per_block kernel(p1, p2, identity, op, neutral, Val(max_block_size); numworkgroups, workgroupsize) diff --git a/src/reduce/mapreduce_nd.jl b/src/reduce/mapreduce_nd.jl index e74eb28..f45e4ce 100644 --- a/src/reduce/mapreduce_nd.jl +++ b/src/reduce/mapreduce_nd.jl @@ -137,7 +137,7 @@ function mapreduce_nd( kernel = KI.@kernel backend launch = false _mapreduce_nd_by_thread!( src, dst, f, op, init, dims ) - workgroupsize = isnothing(block_size) ? KI.kernel_max_work_group_size(kernel) : block_size + workgroupsize = block_size_pow_2(kernel, block_size) numworkgroups = (dst_size + workgroupsize - 1) ÷ workgroupsize kernel( src, dst, f, op, init, dims; workgroupsize, numworkgroups @@ -147,7 +147,7 @@ function mapreduce_nd( src, dst, f, op, init, neutral, dims, Val(max_block_size) ) - workgroupsize = isnothing(block_size) ? 
KI.kernel_max_work_group_size(kernel) : block_size + workgroupsize = block_size_pow_2(kernel, block_size) # One block per output element numworkgroups = dst_size From 1272f37a0a16fcaf0702c6b09c359bcc404297e5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Wed, 19 Nov 2025 20:56:41 -0400 Subject: [PATCH 12/12] TEMP --- test/reduce.jl | 272 +++++++++++++++++++++++------------------------ test/runtests.jl | 22 ++-- 2 files changed, 147 insertions(+), 147 deletions(-) diff --git a/test/reduce.jl b/test/reduce.jl index 9fe8b5c..4bcd5ab 100644 --- a/test/reduce.jl +++ b/test/reduce.jl @@ -20,7 +20,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) end # Fuzzy correctness testing - for _ in 1:1000 + for _ in 1:1#0 num_elems = rand(1:100_000) v = array_from_host(rand(Int32, num_elems)) s = redmin(v) @@ -28,7 +28,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) @test s == minimum(vh) end - for _ in 1:1000 + for _ in 1:1#0 num_elems = rand(1:100_000) v = array_from_host(rand(UInt32, num_elems)) s = redmin(v) @@ -36,7 +36,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) @test s == minimum(vh) end - for _ in 1:1000 + for _ in 1:1#0 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) s = redmin(v) @@ -56,7 +56,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) end # Fuzzy correctness testing - for _ in 1:1000 + for _ in 1:1#0 num_elems = rand(1:100_000) v = array_from_host(rand(1:100, num_elems), Int32) s = redsum(v) @@ -64,7 +64,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) @test s == sum(vh) end - for _ in 1:1000 + for _ in 1:1#0 num_elems = rand(1:100_000) v = array_from_host(rand(1:100, num_elems), UInt32) s = redsum(v) @@ -72,7 +72,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) @test s == sum(vh) end - for _ in 1:1000 + for _ in 1:1#0 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) s = redsum(v) @@ -81,7 +81,7 @@ Base.zero(::Type{Point}) = 
Point(0.0f0, 0.0f0) end # Allowing N-dimensional arrays, still reduced as 1D - for _ in 1:100 + for _ in 1:1 n1 = rand(1:100) n2 = rand(1:100) n3 = rand(1:100) @@ -92,7 +92,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) end # Ensuring that the init value is respected - for _ in 1:100 + for _ in 1:1 num_elems = rand(1:100_000) v = array_from_host(rand(Int32(1):Int32(100), num_elems)) s = AK.reduce(+, v; prefer_threads, init=Int32(10)) @@ -101,7 +101,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) end # Testing with switch_below - i.e. finishing on the CPU - for _ in 1:100 + for _ in 1:1 num_elems = rand(1:100_000) v = array_from_host(rand(1:100, num_elems), Int32) switch_below = rand(1:100) @@ -112,7 +112,7 @@ Base.zero(::Type{Point}) = Point(0.0f0, 0.0f0) end # Test with unmaterialised ranges - for _ in 1:100 + for _ in 1:1 num_elems = rand(1:1000) v = 1:num_elems s = AK.reduce(+, v, BACKEND; prefer_threads, init=Int32(0)) @@ -168,7 +168,7 @@ end end # Fuzzy correctness testing - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:100) n2 = rand(1:100) @@ -181,7 +181,7 @@ end end end - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:100) n2 = rand(1:100) @@ -194,7 +194,7 @@ end end end - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:100) n2 = rand(1:100) @@ -208,7 +208,7 @@ end end # Ensuring that the init value is respected - for _ in 1:100 + for _ in 1:1 for dims in 1:4 n1 = rand(1:100) n2 = rand(1:100) @@ -255,114 +255,114 @@ end end -@testset "mapreduce_1d" begin - Random.seed!(0) - - function minbox(s) - # Extract coordinates into tuple and reduce to find dimensionwise minima - AK.mapreduce( - p -> (p.x, p.y), - (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? 
a[2] : b[2]), - s; - prefer_threads, - init=(typemax(Float32), typemax(Float32)), - neutral=(typemax(Float32), typemax(Float32)), - ) - end - - function minbox_base(s) - # Extract coordinates into tuple and reduce to find dimensionwise minima - Base.mapreduce( - p -> (p.x, p.y), - (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]), - s; - init=(typemax(Float32), typemax(Float32)), - ) - end - - # Fuzzy correctness testing - for _ in 1:1000 - num_elems = rand(1:100_000) - v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:num_elems]) - mgpu = minbox(v) - - vh = Array(v) - mcpu = minbox(vh) - mbase = minbox_base(vh) - - @test typeof(mgpu) === typeof(mcpu) === typeof(mbase) - @test mgpu[1] ≈ mcpu[1] ≈ mbase[1] - @test mgpu[2] ≈ mcpu[2] ≈ mbase[2] - end - - # Allowing N-dimensional arrays, still reduced as 1D - for _ in 1:100 - n1 = rand(1:100) - n2 = rand(1:100) - n3 = rand(1:100) - - v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:n1, _ in 1:n2, _ in 1:n3]) - mgpu = minbox(v) - - vh = Array(v) - mcpu = minbox(vh) - mbase = minbox_base(vh) - - @test typeof(mgpu) === typeof(mcpu) === typeof(mbase) - @test mgpu[1] ≈ mcpu[1] ≈ mbase[1] - @test mgpu[2] ≈ mcpu[2] ≈ mbase[2] - end - - # Ensuring that the init value is respected - for _ in 1:100 - num_elems = rand(1:100_000) - v = array_from_host(rand(Int32(1):Int32(100), num_elems)) - s = AK.mapreduce(abs, +, v; prefer_threads, init=Int32(10)) - vh = Array(v) - @test s == sum(vh) + 10 - end - - # Testing with switch_below - i.e. 
finishing on the CPU - for _ in 1:100 - num_elems = rand(1:100_000) - v = array_from_host(rand(-100:-1, num_elems), Int32) - switch_below = rand(1:100) - init = rand(1:100) - s = AK.mapreduce(abs, +, v; prefer_threads, switch_below=switch_below, init=Int32(init)) - vh = Array(v) - @test s == mapreduce(abs, +, vh; init) - end - - # Test with unmaterialised ranges - for _ in 1:100 - num_elems = rand(1:1000) - v = 1:num_elems - s = AK.mapreduce(abs, +, v, BACKEND; prefer_threads, init=Int32(0)) - vh = Array(v) - @test s == mapreduce(abs, +, vh) - end - - # Testing different settings, enforcing change of type between f and op - f(s, temp) = AK.mapreduce( - p -> (p.x, p.y), - (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]), - s; - prefer_threads, - init=(typemax(Float32), typemax(Float32)), - neutral=(typemax(Float32), typemax(Float32)), - block_size=64, - temp=temp, - switch_below=50, - max_tasks=10, - min_elems=100, - ) - v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:10_042]) - temp = similar(v, Tuple{Float32, Float32}) - f(v, temp) - - # Test that undefined kwargs are not accepted - @test_throws MethodError AK.mapreduce(-, +, v; prefer_threads, init=10, bad=:kwarg) -end +# @testset "mapreduce_1d" begin +# Random.seed!(0) + +# function minbox(s) +# # Extract coordinates into tuple and reduce to find dimensionwise minima +# AK.mapreduce( +# p -> (p.x, p.y), +# (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? a[2] : b[2]), +# s; +# prefer_threads, +# init=(typemax(Float32), typemax(Float32)), +# neutral=(typemax(Float32), typemax(Float32)), +# ) +# end + +# function minbox_base(s) +# # Extract coordinates into tuple and reduce to find dimensionwise minima +# Base.mapreduce( +# p -> (p.x, p.y), +# (a, b) -> (a[1] < b[1] ? a[1] : b[1], a[2] < b[2] ? 
a[2] : b[2]), +# s; +# init=(typemax(Float32), typemax(Float32)), +# ) +# end + +# # Fuzzy correctness testing +# for _ in 1:1#0 +# num_elems = rand(1:100_000) +# v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:num_elems]) +# mgpu = minbox(v) + +# vh = Array(v) +# mcpu = minbox(vh) +# mbase = minbox_base(vh) + +# @test typeof(mgpu) === typeof(mcpu) === typeof(mbase) +# @test mgpu[1] ≈ mcpu[1] ≈ mbase[1] +# @test mgpu[2] ≈ mcpu[2] ≈ mbase[2] +# end + +# # Allowing N-dimensional arrays, still reduced as 1D +# for _ in 1:1 +# n1 = rand(1:100) +# n2 = rand(1:100) +# n3 = rand(1:100) + +# v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:n1, _ in 1:n2, _ in 1:n3]) +# mgpu = minbox(v) + +# vh = Array(v) +# mcpu = minbox(vh) +# mbase = minbox_base(vh) + +# @test typeof(mgpu) === typeof(mcpu) === typeof(mbase) +# @test mgpu[1] ≈ mcpu[1] ≈ mbase[1] +# @test mgpu[2] ≈ mcpu[2] ≈ mbase[2] +# end + +# # Ensuring that the init value is respected +# for _ in 1:1 +# num_elems = rand(1:100_000) +# v = array_from_host(rand(Int32(1):Int32(100), num_elems)) +# s = AK.mapreduce(abs, +, v; prefer_threads, init=Int32(10)) +# vh = Array(v) +# @test s == sum(vh) + 10 +# end + +# # Testing with switch_below - i.e. finishing on the CPU +# for _ in 1:1 +# num_elems = rand(1:100_000) +# v = array_from_host(rand(-100:-1, num_elems), Int32) +# switch_below = rand(1:100) +# init = rand(1:100) +# s = AK.mapreduce(identity, +, v; prefer_threads, switch_below=switch_below, init=Int32(init), block_size=256) +# vh = Array(v) +# @test s == mapreduce(identity, +, vh; init) +# end + +# # Test with unmaterialised ranges +# for _ in 1:1 +# num_elems = rand(1:1000) +# v = 1:num_elems +# s = AK.mapreduce(abs, +, v, BACKEND; prefer_threads, init=Int32(0)) +# vh = Array(v) +# @test s == mapreduce(abs, +, vh) +# end + +# # Testing different settings, enforcing change of type between f and op +# f(s, temp) = AK.mapreduce( +# p -> (p.x, p.y), +# (a, b) -> (a[1] < b[1] ? 
a[1] : b[1], a[2] < b[2] ? a[2] : b[2]), +# s; +# prefer_threads, +# init=(typemax(Float32), typemax(Float32)), +# neutral=(typemax(Float32), typemax(Float32)), +# block_size=64, +# temp=temp, +# switch_below=50, +# max_tasks=10, +# min_elems=100, +# ) +# v = array_from_host([Point(rand(Float32), rand(Float32)) for _ in 1:10_042]) +# temp = similar(v, Tuple{Float32, Float32}) +# f(v, temp) + +# # Test that undefined kwargs are not accepted +# @test_throws MethodError AK.mapreduce(-, +, v; prefer_threads, init=10, bad=:kwarg) +# end @testset "mapreduce_nd" begin @@ -385,7 +385,7 @@ end end # Fuzzy correctness testing - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:100) n2 = rand(1:100) @@ -423,7 +423,7 @@ end end # Fuzzy correctness testing - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:100) n2 = rand(1:100) @@ -444,7 +444,7 @@ end end # Ensuring that the init value is respected - for _ in 1:100 + for _ in 1:1 for dims in 1:4 n1 = rand(1:100) n2 = rand(1:100) @@ -500,13 +500,13 @@ end @test AK.sum(v; prefer_threads) == sum(Array(v)) # Fuzzy testing - for _ in 1:100 + for _ in 1:1 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) @test AK.sum(v; prefer_threads) ≈ sum(Array(v)) end - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:100) n2 = rand(1:100) @@ -545,13 +545,13 @@ end @test AK.prod(v; prefer_threads) == prod(Array(v)) # Fuzzy testing - for _ in 1:100 + for _ in 1:1 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) @test AK.prod(v; prefer_threads) ≈ prod(Array(v)) end - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:10) n2 = rand(1:10) @@ -590,13 +590,13 @@ end @test AK.minimum(v; prefer_threads) == minimum(Array(v)) # Fuzzy testing - for _ in 1:100 + for _ in 1:1 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) @test AK.minimum(v; prefer_threads) == minimum(Array(v)) end - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:100) n2 = 
rand(1:100) @@ -635,13 +635,13 @@ end @test AK.maximum(v; prefer_threads) == maximum(Array(v)) # Fuzzy testing - for _ in 1:100 + for _ in 1:1 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) @test AK.maximum(v; prefer_threads) == maximum(Array(v)) end - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:100) n2 = rand(1:100) @@ -680,13 +680,13 @@ end @test AK.count(x->x>50, v; prefer_threads) == count(x->x>50, Array(v)) # Fuzzy testing - for _ in 1:100 + for _ in 1:1 num_elems = rand(1:100_000) v = array_from_host(rand(Float32, num_elems)) @test AK.count(x->x>0.5, v; prefer_threads) == count(x->x>0.5, Array(v)) end - for _ in 1:100 + for _ in 1:1 for dims in 1:3 n1 = rand(1:100) n2 = rand(1:100) @@ -706,7 +706,7 @@ end end # Counting booleans directly - for _ in 1:100 + for _ in 1:1 num_elems = rand(1:100_000) v = array_from_host(rand(Bool, num_elems)) @test AK.count(v; prefer_threads) == count(Array(v)) diff --git a/test/runtests.jl b/test/runtests.jl index 716fd8e..1544bb3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -31,7 +31,7 @@ elseif "--AMDGPU" in ARGS const BACKEND = ROCBackend() TEST_DL[] = true elseif "--Metal" in ARGS - Pkg.add("Metal") + # Pkg.add("Metal") using Metal Metal.versioninfo() const BACKEND = MetalBackend() @@ -61,16 +61,16 @@ function array_from_host(backend, h_arr::AbstractArray, dtype=nothing) d_arr end -@testset "Aqua" begin - using Aqua - Aqua.test_all(AK) -end +# @testset "Aqua" begin +# using Aqua +# Aqua.test_all(AK) +# end -include("partition.jl") -include("looping.jl") -include("map.jl") -include("sort.jl") +# include("partition.jl") +# include("looping.jl") +# include("map.jl") +# include("sort.jl") include("reduce.jl") include("accumulate.jl") -include("predicates.jl") -include("binarysearch.jl") +# include("predicates.jl") +# include("binarysearch.jl")