@amontoison (Member) commented Oct 7, 2025

  • Support CuSparseMatrixBSR in the generic sparse mv!, added with CUDA 13.0.1.
  • I also removed some dead code for CUDA < 12.0.
  • We should not call the "preprocess" routines for mv! and mm!, because we have no high-level way to reuse the buffer / descriptor: we just do more work for nothing (see the sketch below).
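
For illustration, a minimal sketch of why the preprocessing step cannot pay off under the current design (the sizes and loop are made up; mv! is CUDA.jl's wrapper, and cusparseSpMV_preprocess is the raw cuSPARSE routine it would have called):

```julia
using CUDA, CUDA.CUSPARSE, SparseArrays

A = CuSparseMatrixCSR(sprand(Float32, 1_000, 1_000, 0.01))
x = CUDA.rand(Float32, 1_000)
y = CUDA.zeros(Float32, 1_000)

# Each mv! call rebuilds the sparse/dense descriptors and the work buffer
# from scratch, so any analysis cached by cusparseSpMV_preprocess would be
# thrown away right after the call; preprocessing only amortizes when the
# same descriptors and buffer survive across many SpMV invocations.
for _ in 1:100
    CUSPARSE.mv!('N', 1.0f0, A, x, 0.0f0, y, 'O')
end
```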

@amontoison requested a review from @kshyatt on October 7, 2025 20:09
github-actions bot (Contributor) commented Oct 7, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Suggested changes:
diff --git a/lib/cusparse/generic.jl b/lib/cusparse/generic.jl
index 3b044351f..aee4386ea 100644
--- a/lib/cusparse/generic.jl
+++ b/lib/cusparse/generic.jl
@@ -230,7 +230,8 @@ function vv!(transx::SparseChar, X::CuSparseVector{T}, Y::DenseCuVector{T}, inde
     return result[]
 end
 
-function mv!(transa::SparseChar, alpha::Number, A::CuSparseMatrix{TA}, X::DenseCuVector{T},
+function mv!(
+        transa::SparseChar, alpha::Number, A::CuSparseMatrix{TA}, X::DenseCuVector{T},
              beta::Number, Y::DenseCuVector{T}, index::SparseChar, algo::cusparseSpMVAlg_t=CUSPARSE_SPMV_ALG_DEFAULT) where {TA, T}
 
     (A isa CuSparseMatrixBSR) && (CUSPARSE.version() < v"12.6.3") && throw(ErrorException("This operation is not supported by the current CUDA version."))
diff --git a/lib/cusparse/level2.jl b/lib/cusparse/level2.jl
index 37a9bd686..afccca8b6 100644
--- a/lib/cusparse/level2.jl
+++ b/lib/cusparse/level2.jl
@@ -8,13 +8,15 @@ for (fname,elty) in ((:cusparseSbsrmv, :Float32),
                      (:cusparseCbsrmv, :ComplexF32),
                      (:cusparseZbsrmv, :ComplexF64))
     @eval begin
-        function mv2!(transa::SparseChar,
-                      alpha::Number,
-                      A::CuSparseMatrixBSR{$elty},
-                      X::CuVector{$elty},
-                      beta::Number,
-                      Y::CuVector{$elty},
-                      index::SparseChar)
+        function mv2!(
+                transa::SparseChar,
+                alpha::Number,
+                A::CuSparseMatrixBSR{$elty},
+                X::CuVector{$elty},
+                beta::Number,
+                Y::CuVector{$elty},
+                index::SparseChar
+            )
 
             # Support transa = 'C' for real matrices
             transa = $elty <: Real && transa == 'C' ? 'T' : transa
diff --git a/test/libraries/cusparse.jl b/test/libraries/cusparse.jl
index ecc2dcd6b..5e448da3b 100644
--- a/test/libraries/cusparse.jl
+++ b/test/libraries/cusparse.jl
@@ -757,7 +757,8 @@ end
         alpha = rand(elty)
         beta = rand(elty)
         @testset "$(typeof(d_A))" for d_A in [CuSparseMatrixCSR(A),
-                                              CuSparseMatrixCSC(A)]
+                CuSparseMatrixCSC(A),
+            ]
             d_x = CuArray(x)
             d_y = CuArray(y)
             @test_throws DimensionMismatch CUSPARSE.mv!('T',alpha,d_A,d_x,beta,d_y,'O')
@@ -770,9 +771,9 @@ end
         @testset "$(typeof(d_A))" for d_A in [CuSparseMatrixBSR(A, blockdim)]
             d_x = CuArray(x)
             d_y = CuArray(y)
-            @test_throws DimensionMismatch CUSPARSE.mv2!('T',alpha,d_A,d_x,beta,d_y,'O')
-            @test_throws DimensionMismatch CUSPARSE.mv2!('N',alpha,d_A,d_y,beta,d_x,'O')
-            CUSPARSE.mv2!('N',alpha,d_A,d_x,beta,d_y,'O')
+            @test_throws DimensionMismatch CUSPARSE.mv2!('T', alpha, d_A, d_x, beta, d_y, 'O')
+            @test_throws DimensionMismatch CUSPARSE.mv2!('N', alpha, d_A, d_y, beta, d_x, 'O')
+            CUSPARSE.mv2!('N', alpha, d_A, d_x, beta, d_y, 'O')
             h_z = collect(d_y)
             z = alpha * A * x + beta * y
             @test z ≈ h_z
diff --git a/test/libraries/cusparse/generic.jl b/test/libraries/cusparse/generic.jl
index 7843fd40b..b729c92b9 100644
--- a/test/libraries/cusparse/generic.jl
+++ b/test/libraries/cusparse/generic.jl
@@ -32,7 +32,9 @@ SPMV_ALGOS = Dict(CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT],
                                         CUSPARSE.CUSPARSE_SPMV_CSR_ALG1,
                                         CUSPARSE.CUSPARSE_SPMV_CSR_ALG2],
                   CuSparseMatrixCOO => [CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT,
-                                        CUSPARSE.CUSPARSE_SPMV_COO_ALG1])
+        CUSPARSE.CUSPARSE_SPMV_COO_ALG1,
+    ]
+)
 
 SPMM_ALGOS = Dict(CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT],
                   CuSparseMatrixCSR => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT,
@@ -41,9 +43,11 @@ SPMM_ALGOS = Dict(CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT],
                                         CUSPARSE.CUSPARSE_SPMM_CSR_ALG3],
                   CuSparseMatrixCOO => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT,
                                         CUSPARSE.CUSPARSE_SPMM_COO_ALG1,
-                                        CUSPARSE.CUSPARSE_SPMM_COO_ALG2,
+        CUSPARSE.CUSPARSE_SPMM_COO_ALG2,
                                         CUSPARSE.CUSPARSE_SPMM_COO_ALG3,
-                                        CUSPARSE.CUSPARSE_SPMM_COO_ALG4])
+        CUSPARSE.CUSPARSE_SPMM_COO_ALG4,
+    ]
+)
 
 if CUSPARSE.version() >= v"12.1.3"
     push!(SPMV_ALGOS[CuSparseMatrixCOO], CUSPARSE.CUSPARSE_SPMV_COO_ALG2)
@@ -55,8 +59,10 @@ if CUSPARSE.version() >= v"12.5.1"
 end
 
 if CUSPARSE.version() >= v"12.6.3"
-    SPMV_ALGOS[CuSparseMatrixBSR] = [CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT,
-                                     CUSPARSE.CUSPARSE_SPMV_BSR_ALG1]
+    SPMV_ALGOS[CuSparseMatrixBSR] = [
+        CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT,
+        CUSPARSE.CUSPARSE_SPMV_BSR_ALG1,
+    ]
 end
 
 for SparseMatrixType in keys(SPMV_ALGOS)
@@ -67,7 +73,7 @@ for SparseMatrixType in keys(SPMV_ALGOS)
                 A = sprand(T, 20, 10, 0.1)
                 B = transa == 'N' ? rand(T, 10) : rand(T, 20)
                 C = transa == 'N' ? rand(T, 20) : rand(T, 10)
-                dA = SparseMatrixType == CuSparseMatrixBSR ? SparseMatrixType(A,1) : SparseMatrixType(A)
+                dA = SparseMatrixType == CuSparseMatrixBSR ? SparseMatrixType(A, 1) : SparseMatrixType(A)
                 dB = CuArray(B)
                 dC = CuArray(C)
 
@@ -313,14 +319,20 @@ end
     @test Z ≈ collect(dY)
 end
 
-SPGEMM_ALGOS = Dict(CuSparseMatrixCSR => [CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG1,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG2,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG3],
-                    CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG1,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG2,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG3])
+SPGEMM_ALGOS = Dict(
+    CuSparseMatrixCSR => [
+        CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG1,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG2,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG3,
+    ],
+    CuSparseMatrixCSC => [
+        CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG1,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG2,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG3,
+    ]
+)
 # Algorithms CUSPARSE.CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC and
 # CUSPARSE.CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC are dedicated to the cusparseSpGEMMreuse routine.
 
@@ -406,9 +418,9 @@ for SparseMatrixType in keys(SDDMM_ALGOS)
                     mB = transb == 'N' ? 10 : 35
                     nB = transb == 'N' ? 35 : 10
 
-                    A = rand(T,mA,nA)
-                    B = rand(T,mB,nB)
-                    C = sprand(T,25,35,0.3)
+                    A = rand(T, mA, nA)
+                    B = rand(T, mB, nB)
+                    C = sprand(T, 25, 35, 0.3)
 
                     spyC = copy(C)
                     spyC.nzval .= one(T)
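
For reference, a minimal usage sketch of what this PR enables, modeled on the test changes above (requires CUSPARSE >= 12.6.3; the sizes and block dimension are illustrative):

```julia
using CUDA, CUDA.CUSPARSE, SparseArrays

# Generic SpMV now also accepts BSR storage (previously CSR/CSC/COO only).
T = Float32
A = sprand(T, 20, 20, 0.1)
x, y = rand(T, 20), rand(T, 20)
dA = CuSparseMatrixBSR(A, 1)          # block dimension 1
dx, dy = CuArray(x), CuArray(y)
alpha, beta = rand(T), rand(T)

CUSPARSE.mv!('N', alpha, dA, dx, beta, dy, 'O')  # dy = alpha * A * dx + beta * dy
@assert collect(dy) ≈ alpha * A * x + beta * y
```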

@amontoison added the "cuda libraries" label on Oct 7, 2025
@kshyatt (Member) commented Oct 8, 2025

CI failures are related

@kshyatt force-pushed the am/cusparse_bsr_mv branch from e488f51 to 949b9f1 on December 5, 2025 14:29
github-actions bot (Contributor) left a comment

CUDA.jl Benchmarks

| Benchmark suite | Current: 949b9f1 | Previous: 1af91be | Ratio |
|---|---|---|---|
| latency/precompile | 55490039074.5 ns | 55207707384.5 ns | 1.01 |
| latency/ttfp | 7796891791 ns | 7803466984 ns | 1.00 |
| latency/import | 4116374553.5 ns | 4119333235.5 ns | 1.00 |
| integration/volumerhs | 9622149.5 ns | 9616867 ns | 1.00 |
| integration/byval/slices=1 | 147475 ns | 147131 ns | 1.00 |
| integration/byval/slices=3 | 426384 ns | 426158 ns | 1.00 |
| integration/byval/reference | 145442 ns | 145358 ns | 1.00 |
| integration/byval/slices=2 | 286847 ns | 286555 ns | 1.00 |
| integration/cudadevrt | 103900 ns | 103753 ns | 1.00 |
| kernel/indexing | 14553 ns | 14494 ns | 1.00 |
| kernel/indexing_checked | 15310 ns | 15153 ns | 1.01 |
| kernel/occupancy | 681.0955414012739 ns | 670.8662420382166 ns | 1.02 |
| kernel/launch | 2285.6666666666665 ns | 2220.4444444444443 ns | 1.03 |
| kernel/rand | 15279.5 ns | 15661 ns | 0.98 |
| array/reverse/1d | 20402 ns | 20102.5 ns | 1.01 |
| array/reverse/2dL_inplace | 67174 ns | 67011 ns | 1.00 |
| array/reverse/1dL | 70558 ns | 70372 ns | 1.00 |
| array/reverse/2d | 22385 ns | 22007 ns | 1.02 |
| array/reverse/1d_inplace | 9921 ns | 9891 ns | 1.00 |
| array/reverse/2d_inplace | 13745 ns | 13546 ns | 1.01 |
| array/reverse/2dL | 74458 ns | 73927.5 ns | 1.01 |
| array/reverse/1dL_inplace | 67045 ns | 67056 ns | 1.00 |
| array/copy | 20946.5 ns | 20954 ns | 1.00 |
| array/iteration/findall/int | 158922 ns | 158738.5 ns | 1.00 |
| array/iteration/findall/bool | 140711 ns | 140481.5 ns | 1.00 |
| array/iteration/findfirst/int | 161583 ns | 161535 ns | 1.00 |
| array/iteration/findfirst/bool | 162766 ns | 162298 ns | 1.00 |
| array/iteration/scalar | 75067 ns | 73003.5 ns | 1.03 |
| array/iteration/logical | 219916.5 ns | 218149.5 ns | 1.01 |
| array/iteration/findmin/1d | 51106 ns | 53245 ns | 0.96 |
| array/iteration/findmin/2d | 96796 ns | 96825.5 ns | 1.00 |
| array/reductions/reduce/Int64/1d | 44165 ns | 43671 ns | 1.01 |
| array/reductions/reduce/Int64/dims=1 | 45382 ns | 44843.5 ns | 1.01 |
| array/reductions/reduce/Int64/dims=2 | 62194 ns | 61899 ns | 1.00 |
| array/reductions/reduce/Int64/dims=1L | 89589 ns | 89258 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 88530 ns | 88446 ns | 1.00 |
| array/reductions/reduce/Float32/1d | 38770 ns | 37742 ns | 1.03 |
| array/reductions/reduce/Float32/dims=1 | 42616.5 ns | 42436 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2 | 60796 ns | 60098 ns | 1.01 |
| array/reductions/reduce/Float32/dims=1L | 52811 ns | 52602 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2L | 72890 ns | 72179 ns | 1.01 |
| array/reductions/mapreduce/Int64/1d | 44275 ns | 43634 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=1 | 45089 ns | 46988 ns | 0.96 |
| array/reductions/mapreduce/Int64/dims=2 | 62455 ns | 61675 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=1L | 89664 ns | 89086 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2L | 88900 ns | 88150 ns | 1.01 |
| array/reductions/mapreduce/Float32/1d | 38160 ns | 36978 ns | 1.03 |
| array/reductions/mapreduce/Float32/dims=1 | 44935 ns | 48419.5 ns | 0.93 |
| array/reductions/mapreduce/Float32/dims=2 | 60584.5 ns | 60111 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=1L | 53236 ns | 52878 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=2L | 72526 ns | 72374.5 ns | 1.00 |
| array/broadcast | 20600 ns | 20123 ns | 1.02 |
| array/copyto!/gpu_to_gpu | 11544 ns | 13003 ns | 0.89 |
| array/copyto!/cpu_to_gpu | 216971 ns | 217546 ns | 1.00 |
| array/copyto!/gpu_to_cpu | 286227 ns | 285690 ns | 1.00 |
| array/accumulate/Int64/1d | 125453 ns | 124863 ns | 1.00 |
| array/accumulate/Int64/dims=1 | 84123 ns | 83917 ns | 1.00 |
| array/accumulate/Int64/dims=2 | 158707 ns | 158224 ns | 1.00 |
| array/accumulate/Int64/dims=1L | 1721075.5 ns | 1710808 ns | 1.01 |
| array/accumulate/Int64/dims=2L | 968366.5 ns | 966620 ns | 1.00 |
| array/accumulate/Float32/1d | 110148 ns | 109551.5 ns | 1.01 |
| array/accumulate/Float32/dims=1 | 81147 ns | 80701.5 ns | 1.01 |
| array/accumulate/Float32/dims=2 | 147843.5 ns | 148055.5 ns | 1.00 |
| array/accumulate/Float32/dims=1L | 1629124 ns | 1619581.5 ns | 1.01 |
| array/accumulate/Float32/dims=2L | 701850.5 ns | 698770 ns | 1.00 |
| array/construct | 1272.2 ns | 1306.1 ns | 0.97 |
| array/random/randn/Float32 | 50099.5 ns | 45766 ns | 1.09 |
| array/random/randn!/Float32 | 25474 ns | 25261 ns | 1.01 |
| array/random/rand!/Int64 | 27875 ns | 27478 ns | 1.01 |
| array/random/rand!/Float32 | 9045.666666666666 ns | 8968 ns | 1.01 |
| array/random/rand/Int64 | 30599 ns | 30173.5 ns | 1.01 |
| array/random/rand/Float32 | 13504.5 ns | 13273 ns | 1.02 |
| array/permutedims/4d | 56228.5 ns | 56320.5 ns | 1.00 |
| array/permutedims/2d | 54845 ns | 54500 ns | 1.01 |
| array/permutedims/3d | 55747 ns | 55121.5 ns | 1.01 |
| array/sorting/1d | 2777979 ns | 2758806 ns | 1.01 |
| array/sorting/by | 3357231.5 ns | 3345943 ns | 1.00 |
| array/sorting/2d | 1086142.5 ns | 1082452 ns | 1.00 |
| cuda/synchronization/stream/auto | 1027.2727272727273 ns | 1022.3076923076923 ns | 1.00 |
| cuda/synchronization/stream/nonblocking | 7193.2 ns | 7461.6 ns | 0.96 |
| cuda/synchronization/stream/blocking | 797.7777777777778 ns | 805.6082474226804 ns | 0.99 |
| cuda/synchronization/context/auto | 1171.6 ns | 1182.4 ns | 0.99 |
| cuda/synchronization/context/nonblocking | 7298.299999999999 ns | 7352 ns | 0.99 |
| cuda/synchronization/context/blocking | 882.3461538461538 ns | 901.8522727272727 ns | 0.98 |

This comment was automatically generated by a workflow using github-action-benchmark.
