@amontoison (Member) commented Oct 7, 2025

  • Support CuSparseMatrixBSR in the generic sparse mv!, added with CUDA 13.0.1.
  • I also removed some dead code for CUDA < 12.0.
  • We should not call the "preprocess" routines for mv! and mm!, because we have no high-level way to reuse the buffer / descriptor: we just do more work for nothing (see the sketch below).
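
For illustration, a minimal sketch of why the preprocessing step cannot pay off under the current design (the sizes and loop are made up; mv! is CUDA.jl's wrapper, and cusparseSpMV_preprocess is the raw cuSPARSE routine it would have called):

```julia
using CUDA, CUDA.CUSPARSE, SparseArrays

A = CuSparseMatrixCSR(sprand(Float32, 1_000, 1_000, 0.01))
x = CUDA.rand(Float32, 1_000)
y = CUDA.zeros(Float32, 1_000)

# Each mv! call rebuilds the sparse/dense descriptors and the work buffer
# from scratch, so any analysis cached by cusparseSpMV_preprocess would be
# thrown away right after the call; preprocessing only amortizes when the
# same descriptors and buffer survive across many SpMV invocations.
for _ in 1:100
    CUSPARSE.mv!('N', 1.0f0, A, x, 0.0f0, y, 'O')
end
```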

@amontoison requested a review from @kshyatt on October 7, 2025 20:09
github-actions bot (Contributor) commented Oct 7, 2025

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Suggested changes:
diff --git a/lib/cusparse/generic.jl b/lib/cusparse/generic.jl
index 3b044351f..aee4386ea 100644
--- a/lib/cusparse/generic.jl
+++ b/lib/cusparse/generic.jl
@@ -230,7 +230,8 @@ function vv!(transx::SparseChar, X::CuSparseVector{T}, Y::DenseCuVector{T}, inde
     return result[]
 end
 
-function mv!(transa::SparseChar, alpha::Number, A::CuSparseMatrix{TA}, X::DenseCuVector{T},
+function mv!(
+        transa::SparseChar, alpha::Number, A::CuSparseMatrix{TA}, X::DenseCuVector{T},
              beta::Number, Y::DenseCuVector{T}, index::SparseChar, algo::cusparseSpMVAlg_t=CUSPARSE_SPMV_ALG_DEFAULT) where {TA, T}
 
     (A isa CuSparseMatrixBSR) && (CUSPARSE.version() < v"12.6.3") && throw(ErrorException("This operation is not supported by the current CUDA version."))
diff --git a/lib/cusparse/level2.jl b/lib/cusparse/level2.jl
index 37a9bd686..afccca8b6 100644
--- a/lib/cusparse/level2.jl
+++ b/lib/cusparse/level2.jl
@@ -8,13 +8,15 @@ for (fname,elty) in ((:cusparseSbsrmv, :Float32),
                      (:cusparseCbsrmv, :ComplexF32),
                      (:cusparseZbsrmv, :ComplexF64))
     @eval begin
-        function mv2!(transa::SparseChar,
-                      alpha::Number,
-                      A::CuSparseMatrixBSR{$elty},
-                      X::CuVector{$elty},
-                      beta::Number,
-                      Y::CuVector{$elty},
-                      index::SparseChar)
+        function mv2!(
+                transa::SparseChar,
+                alpha::Number,
+                A::CuSparseMatrixBSR{$elty},
+                X::CuVector{$elty},
+                beta::Number,
+                Y::CuVector{$elty},
+                index::SparseChar
+            )
 
             # Support transa = 'C' for real matrices
             transa = $elty <: Real && transa == 'C' ? 'T' : transa
diff --git a/test/libraries/cusparse.jl b/test/libraries/cusparse.jl
index ecc2dcd6b..5e448da3b 100644
--- a/test/libraries/cusparse.jl
+++ b/test/libraries/cusparse.jl
@@ -757,7 +757,8 @@ end
         alpha = rand(elty)
         beta = rand(elty)
         @testset "$(typeof(d_A))" for d_A in [CuSparseMatrixCSR(A),
-                                              CuSparseMatrixCSC(A)]
+                CuSparseMatrixCSC(A),
+            ]
             d_x = CuArray(x)
             d_y = CuArray(y)
             @test_throws DimensionMismatch CUSPARSE.mv!('T',alpha,d_A,d_x,beta,d_y,'O')
@@ -770,9 +771,9 @@ end
         @testset "$(typeof(d_A))" for d_A in [CuSparseMatrixBSR(A, blockdim)]
             d_x = CuArray(x)
             d_y = CuArray(y)
-            @test_throws DimensionMismatch CUSPARSE.mv2!('T',alpha,d_A,d_x,beta,d_y,'O')
-            @test_throws DimensionMismatch CUSPARSE.mv2!('N',alpha,d_A,d_y,beta,d_x,'O')
-            CUSPARSE.mv2!('N',alpha,d_A,d_x,beta,d_y,'O')
+            @test_throws DimensionMismatch CUSPARSE.mv2!('T', alpha, d_A, d_x, beta, d_y, 'O')
+            @test_throws DimensionMismatch CUSPARSE.mv2!('N', alpha, d_A, d_y, beta, d_x, 'O')
+            CUSPARSE.mv2!('N', alpha, d_A, d_x, beta, d_y, 'O')
             h_z = collect(d_y)
             z = alpha * A * x + beta * y
             @test z ≈ h_z
diff --git a/test/libraries/cusparse/generic.jl b/test/libraries/cusparse/generic.jl
index 7843fd40b..b729c92b9 100644
--- a/test/libraries/cusparse/generic.jl
+++ b/test/libraries/cusparse/generic.jl
@@ -32,7 +32,9 @@ SPMV_ALGOS = Dict(CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT],
                                         CUSPARSE.CUSPARSE_SPMV_CSR_ALG1,
                                         CUSPARSE.CUSPARSE_SPMV_CSR_ALG2],
                   CuSparseMatrixCOO => [CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT,
-                                        CUSPARSE.CUSPARSE_SPMV_COO_ALG1])
+        CUSPARSE.CUSPARSE_SPMV_COO_ALG1,
+    ]
+)
 
 SPMM_ALGOS = Dict(CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT],
                   CuSparseMatrixCSR => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT,
@@ -41,9 +43,11 @@ SPMM_ALGOS = Dict(CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT],
                                         CUSPARSE.CUSPARSE_SPMM_CSR_ALG3],
                   CuSparseMatrixCOO => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT,
                                         CUSPARSE.CUSPARSE_SPMM_COO_ALG1,
-                                        CUSPARSE.CUSPARSE_SPMM_COO_ALG2,
+        CUSPARSE.CUSPARSE_SPMM_COO_ALG2,
                                         CUSPARSE.CUSPARSE_SPMM_COO_ALG3,
-                                        CUSPARSE.CUSPARSE_SPMM_COO_ALG4])
+        CUSPARSE.CUSPARSE_SPMM_COO_ALG4,
+    ]
+)
 
 if CUSPARSE.version() >= v"12.1.3"
     push!(SPMV_ALGOS[CuSparseMatrixCOO], CUSPARSE.CUSPARSE_SPMV_COO_ALG2)
@@ -55,8 +59,10 @@ if CUSPARSE.version() >= v"12.5.1"
 end
 
 if CUSPARSE.version() >= v"12.6.3"
-    SPMV_ALGOS[CuSparseMatrixBSR] = [CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT,
-                                     CUSPARSE.CUSPARSE_SPMV_BSR_ALG1]
+    SPMV_ALGOS[CuSparseMatrixBSR] = [
+        CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT,
+        CUSPARSE.CUSPARSE_SPMV_BSR_ALG1,
+    ]
 end
 
 for SparseMatrixType in keys(SPMV_ALGOS)
@@ -67,7 +73,7 @@ for SparseMatrixType in keys(SPMV_ALGOS)
                 A = sprand(T, 20, 10, 0.1)
                 B = transa == 'N' ? rand(T, 10) : rand(T, 20)
                 C = transa == 'N' ? rand(T, 20) : rand(T, 10)
-                dA = SparseMatrixType == CuSparseMatrixBSR ? SparseMatrixType(A,1) : SparseMatrixType(A)
+                dA = SparseMatrixType == CuSparseMatrixBSR ? SparseMatrixType(A, 1) : SparseMatrixType(A)
                 dB = CuArray(B)
                 dC = CuArray(C)
 
@@ -313,14 +319,20 @@ end
     @test Z ≈ collect(dY)
 end
 
-SPGEMM_ALGOS = Dict(CuSparseMatrixCSR => [CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG1,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG2,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG3],
-                    CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG1,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG2,
-                                          CUSPARSE.CUSPARSE_SPGEMM_ALG3])
+SPGEMM_ALGOS = Dict(
+    CuSparseMatrixCSR => [
+        CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG1,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG2,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG3,
+    ],
+    CuSparseMatrixCSC => [
+        CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG1,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG2,
+        CUSPARSE.CUSPARSE_SPGEMM_ALG3,
+    ]
+)
 # Algorithms CUSPARSE.CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC and
 # CUSPARSE.CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC are dedicated to the cusparseSpGEMMreuse routine.
 
@@ -406,9 +418,9 @@ for SparseMatrixType in keys(SDDMM_ALGOS)
                     mB = transb == 'N' ? 10 : 35
                     nB = transb == 'N' ? 35 : 10
 
-                    A = rand(T,mA,nA)
-                    B = rand(T,mB,nB)
-                    C = sprand(T,25,35,0.3)
+                    A = rand(T, mA, nA)
+                    B = rand(T, mB, nB)
+                    C = sprand(T, 25, 35, 0.3)
 
                     spyC = copy(C)
                     spyC.nzval .= one(T)
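
For reference, a minimal usage sketch of what this PR enables, modeled on the test changes above (requires CUSPARSE >= 12.6.3; the sizes and block dimension are illustrative):

```julia
using CUDA, CUDA.CUSPARSE, SparseArrays

# Generic SpMV now also accepts BSR storage (previously CSR/CSC/COO only).
T = Float32
A = sprand(T, 20, 20, 0.1)
x, y = rand(T, 20), rand(T, 20)
dA = CuSparseMatrixBSR(A, 1)          # block dimension 1
dx, dy = CuArray(x), CuArray(y)
alpha, beta = rand(T), rand(T)

CUSPARSE.mv!('N', alpha, dA, dx, beta, dy, 'O')  # dy = alpha * A * dx + beta * dy
@assert collect(dy) ≈ alpha * A * x + beta * y
```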

@amontoison added the "cuda libraries" label on Oct 7, 2025
@kshyatt (Member) commented Oct 8, 2025

CI failures are related

@kshyatt force-pushed the am/cusparse_bsr_mv branch from e488f51 to 949b9f1 on December 5, 2025 14:29
github-actions bot (Contributor) left a comment

CUDA.jl Benchmarks

| Benchmark suite | Current: 949b9f1 | Previous: 1af91be | Ratio |
|---|---|---|---|
| latency/precompile | 55490039074.5 ns | 55207707384.5 ns | 1.01 |
| latency/ttfp | 7796891791 ns | 7803466984 ns | 1.00 |
| latency/import | 4116374553.5 ns | 4119333235.5 ns | 1.00 |
| integration/volumerhs | 9622149.5 ns | 9616867 ns | 1.00 |
| integration/byval/slices=1 | 147475 ns | 147131 ns | 1.00 |
| integration/byval/slices=3 | 426384 ns | 426158 ns | 1.00 |
| integration/byval/reference | 145442 ns | 145358 ns | 1.00 |
| integration/byval/slices=2 | 286847 ns | 286555 ns | 1.00 |
| integration/cudadevrt | 103900 ns | 103753 ns | 1.00 |
| kernel/indexing | 14553 ns | 14494 ns | 1.00 |
| kernel/indexing_checked | 15310 ns | 15153 ns | 1.01 |
| kernel/occupancy | 681.0955414012739 ns | 670.8662420382166 ns | 1.02 |
| kernel/launch | 2285.6666666666665 ns | 2220.4444444444443 ns | 1.03 |
| kernel/rand | 15279.5 ns | 15661 ns | 0.98 |
| array/reverse/1d | 20402 ns | 20102.5 ns | 1.01 |
| array/reverse/2dL_inplace | 67174 ns | 67011 ns | 1.00 |
| array/reverse/1dL | 70558 ns | 70372 ns | 1.00 |
| array/reverse/2d | 22385 ns | 22007 ns | 1.02 |
| array/reverse/1d_inplace | 9921 ns | 9891 ns | 1.00 |
| array/reverse/2d_inplace | 13745 ns | 13546 ns | 1.01 |
| array/reverse/2dL | 74458 ns | 73927.5 ns | 1.01 |
| array/reverse/1dL_inplace | 67045 ns | 67056 ns | 1.00 |
| array/copy | 20946.5 ns | 20954 ns | 1.00 |
| array/iteration/findall/int | 158922 ns | 158738.5 ns | 1.00 |
| array/iteration/findall/bool | 140711 ns | 140481.5 ns | 1.00 |
| array/iteration/findfirst/int | 161583 ns | 161535 ns | 1.00 |
| array/iteration/findfirst/bool | 162766 ns | 162298 ns | 1.00 |
| array/iteration/scalar | 75067 ns | 73003.5 ns | 1.03 |
| array/iteration/logical | 219916.5 ns | 218149.5 ns | 1.01 |
| array/iteration/findmin/1d | 51106 ns | 53245 ns | 0.96 |
| array/iteration/findmin/2d | 96796 ns | 96825.5 ns | 1.00 |
| array/reductions/reduce/Int64/1d | 44165 ns | 43671 ns | 1.01 |
| array/reductions/reduce/Int64/dims=1 | 45382 ns | 44843.5 ns | 1.01 |
| array/reductions/reduce/Int64/dims=2 | 62194 ns | 61899 ns | 1.00 |
| array/reductions/reduce/Int64/dims=1L | 89589 ns | 89258 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 88530 ns | 88446 ns | 1.00 |
| array/reductions/reduce/Float32/1d | 38770 ns | 37742 ns | 1.03 |
| array/reductions/reduce/Float32/dims=1 | 42616.5 ns | 42436 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2 | 60796 ns | 60098 ns | 1.01 |
| array/reductions/reduce/Float32/dims=1L | 52811 ns | 52602 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2L | 72890 ns | 72179 ns | 1.01 |
| array/reductions/mapreduce/Int64/1d | 44275 ns | 43634 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=1 | 45089 ns | 46988 ns | 0.96 |
| array/reductions/mapreduce/Int64/dims=2 | 62455 ns | 61675 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=1L | 89664 ns | 89086 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2L | 88900 ns | 88150 ns | 1.01 |
| array/reductions/mapreduce/Float32/1d | 38160 ns | 36978 ns | 1.03 |
| array/reductions/mapreduce/Float32/dims=1 | 44935 ns | 48419.5 ns | 0.93 |
| array/reductions/mapreduce/Float32/dims=2 | 60584.5 ns | 60111 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=1L | 53236 ns | 52878 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=2L | 72526 ns | 72374.5 ns | 1.00 |
| array/broadcast | 20600 ns | 20123 ns | 1.02 |
| array/copyto!/gpu_to_gpu | 11544 ns | 13003 ns | 0.89 |
| array/copyto!/cpu_to_gpu | 216971 ns | 217546 ns | 1.00 |
| array/copyto!/gpu_to_cpu | 286227 ns | 285690 ns | 1.00 |
| array/accumulate/Int64/1d | 125453 ns | 124863 ns | 1.00 |
| array/accumulate/Int64/dims=1 | 84123 ns | 83917 ns | 1.00 |
| array/accumulate/Int64/dims=2 | 158707 ns | 158224 ns | 1.00 |
| array/accumulate/Int64/dims=1L | 1721075.5 ns | 1710808 ns | 1.01 |
| array/accumulate/Int64/dims=2L | 968366.5 ns | 966620 ns | 1.00 |
| array/accumulate/Float32/1d | 110148 ns | 109551.5 ns | 1.01 |
| array/accumulate/Float32/dims=1 | 81147 ns | 80701.5 ns | 1.01 |
| array/accumulate/Float32/dims=2 | 147843.5 ns | 148055.5 ns | 1.00 |
| array/accumulate/Float32/dims=1L | 1629124 ns | 1619581.5 ns | 1.01 |
| array/accumulate/Float32/dims=2L | 701850.5 ns | 698770 ns | 1.00 |
| array/construct | 1272.2 ns | 1306.1 ns | 0.97 |
| array/random/randn/Float32 | 50099.5 ns | 45766 ns | 1.09 |
| array/random/randn!/Float32 | 25474 ns | 25261 ns | 1.01 |
| array/random/rand!/Int64 | 27875 ns | 27478 ns | 1.01 |
| array/random/rand!/Float32 | 9045.666666666666 ns | 8968 ns | 1.01 |
| array/random/rand/Int64 | 30599 ns | 30173.5 ns | 1.01 |
| array/random/rand/Float32 | 13504.5 ns | 13273 ns | 1.02 |
| array/permutedims/4d | 56228.5 ns | 56320.5 ns | 1.00 |
| array/permutedims/2d | 54845 ns | 54500 ns | 1.01 |
| array/permutedims/3d | 55747 ns | 55121.5 ns | 1.01 |
| array/sorting/1d | 2777979 ns | 2758806 ns | 1.01 |
| array/sorting/by | 3357231.5 ns | 3345943 ns | 1.00 |
| array/sorting/2d | 1086142.5 ns | 1082452 ns | 1.00 |
| cuda/synchronization/stream/auto | 1027.2727272727273 ns | 1022.3076923076923 ns | 1.00 |
| cuda/synchronization/stream/nonblocking | 7193.2 ns | 7461.6 ns | 0.96 |
| cuda/synchronization/stream/blocking | 797.7777777777778 ns | 805.6082474226804 ns | 0.99 |
| cuda/synchronization/context/auto | 1171.6 ns | 1182.4 ns | 0.99 |
| cuda/synchronization/context/nonblocking | 7298.299999999999 ns | 7352 ns | 0.99 |
| cuda/synchronization/context/blocking | 882.3461538461538 ns | 901.8522727272727 ns | 0.98 |

This comment was automatically generated by a workflow using github-action-benchmark.
