[CUSPARSE] Interface generic mv! for SparseMatrixBSR #2929
Open
amontoison wants to merge 1 commit into JuliaGPU:master from amontoison:am/cusparse_bsr_mv
+81 −74
Conversation
Contributor
Your PR requires formatting changes to meet the project's style guidelines. The suggested changes:

diff --git a/lib/cusparse/generic.jl b/lib/cusparse/generic.jl
index 3b044351f..aee4386ea 100644
--- a/lib/cusparse/generic.jl
+++ b/lib/cusparse/generic.jl
@@ -230,7 +230,8 @@ function vv!(transx::SparseChar, X::CuSparseVector{T}, Y::DenseCuVector{T}, inde
return result[]
end
-function mv!(transa::SparseChar, alpha::Number, A::CuSparseMatrix{TA}, X::DenseCuVector{T},
+function mv!(
+ transa::SparseChar, alpha::Number, A::CuSparseMatrix{TA}, X::DenseCuVector{T},
beta::Number, Y::DenseCuVector{T}, index::SparseChar, algo::cusparseSpMVAlg_t=CUSPARSE_SPMV_ALG_DEFAULT) where {TA, T}
(A isa CuSparseMatrixBSR) && (CUSPARSE.version() < v"12.6.3") && throw(ErrorException("This operation is not supported by the current CUDA version."))
diff --git a/lib/cusparse/level2.jl b/lib/cusparse/level2.jl
index 37a9bd686..afccca8b6 100644
--- a/lib/cusparse/level2.jl
+++ b/lib/cusparse/level2.jl
@@ -8,13 +8,15 @@ for (fname,elty) in ((:cusparseSbsrmv, :Float32),
(:cusparseCbsrmv, :ComplexF32),
(:cusparseZbsrmv, :ComplexF64))
@eval begin
- function mv2!(transa::SparseChar,
- alpha::Number,
- A::CuSparseMatrixBSR{$elty},
- X::CuVector{$elty},
- beta::Number,
- Y::CuVector{$elty},
- index::SparseChar)
+ function mv2!(
+ transa::SparseChar,
+ alpha::Number,
+ A::CuSparseMatrixBSR{$elty},
+ X::CuVector{$elty},
+ beta::Number,
+ Y::CuVector{$elty},
+ index::SparseChar
+ )
# Support transa = 'C' for real matrices
transa = $elty <: Real && transa == 'C' ? 'T' : transa
diff --git a/test/libraries/cusparse.jl b/test/libraries/cusparse.jl
index ecc2dcd6b..5e448da3b 100644
--- a/test/libraries/cusparse.jl
+++ b/test/libraries/cusparse.jl
@@ -757,7 +757,8 @@ end
alpha = rand(elty)
beta = rand(elty)
@testset "$(typeof(d_A))" for d_A in [CuSparseMatrixCSR(A),
- CuSparseMatrixCSC(A)]
+ CuSparseMatrixCSC(A),
+ ]
d_x = CuArray(x)
d_y = CuArray(y)
@test_throws DimensionMismatch CUSPARSE.mv!('T',alpha,d_A,d_x,beta,d_y,'O')
@@ -770,9 +771,9 @@ end
@testset "$(typeof(d_A))" for d_A in [CuSparseMatrixBSR(A, blockdim)]
d_x = CuArray(x)
d_y = CuArray(y)
- @test_throws DimensionMismatch CUSPARSE.mv2!('T',alpha,d_A,d_x,beta,d_y,'O')
- @test_throws DimensionMismatch CUSPARSE.mv2!('N',alpha,d_A,d_y,beta,d_x,'O')
- CUSPARSE.mv2!('N',alpha,d_A,d_x,beta,d_y,'O')
+ @test_throws DimensionMismatch CUSPARSE.mv2!('T', alpha, d_A, d_x, beta, d_y, 'O')
+ @test_throws DimensionMismatch CUSPARSE.mv2!('N', alpha, d_A, d_y, beta, d_x, 'O')
+ CUSPARSE.mv2!('N', alpha, d_A, d_x, beta, d_y, 'O')
h_z = collect(d_y)
z = alpha * A * x + beta * y
@test z ≈ h_z
diff --git a/test/libraries/cusparse/generic.jl b/test/libraries/cusparse/generic.jl
index 7843fd40b..b729c92b9 100644
--- a/test/libraries/cusparse/generic.jl
+++ b/test/libraries/cusparse/generic.jl
@@ -32,7 +32,9 @@ SPMV_ALGOS = Dict(CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT],
CUSPARSE.CUSPARSE_SPMV_CSR_ALG1,
CUSPARSE.CUSPARSE_SPMV_CSR_ALG2],
CuSparseMatrixCOO => [CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT,
- CUSPARSE.CUSPARSE_SPMV_COO_ALG1])
+ CUSPARSE.CUSPARSE_SPMV_COO_ALG1,
+ ]
+)
SPMM_ALGOS = Dict(CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT],
CuSparseMatrixCSR => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT,
@@ -41,9 +43,11 @@ SPMM_ALGOS = Dict(CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT],
CUSPARSE.CUSPARSE_SPMM_CSR_ALG3],
CuSparseMatrixCOO => [CUSPARSE.CUSPARSE_SPMM_ALG_DEFAULT,
CUSPARSE.CUSPARSE_SPMM_COO_ALG1,
- CUSPARSE.CUSPARSE_SPMM_COO_ALG2,
+ CUSPARSE.CUSPARSE_SPMM_COO_ALG2,
CUSPARSE.CUSPARSE_SPMM_COO_ALG3,
- CUSPARSE.CUSPARSE_SPMM_COO_ALG4])
+ CUSPARSE.CUSPARSE_SPMM_COO_ALG4,
+ ]
+)
if CUSPARSE.version() >= v"12.1.3"
push!(SPMV_ALGOS[CuSparseMatrixCOO], CUSPARSE.CUSPARSE_SPMV_COO_ALG2)
@@ -55,8 +59,10 @@ if CUSPARSE.version() >= v"12.5.1"
end
if CUSPARSE.version() >= v"12.6.3"
- SPMV_ALGOS[CuSparseMatrixBSR] = [CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT,
- CUSPARSE.CUSPARSE_SPMV_BSR_ALG1]
+ SPMV_ALGOS[CuSparseMatrixBSR] = [
+ CUSPARSE.CUSPARSE_SPMV_ALG_DEFAULT,
+ CUSPARSE.CUSPARSE_SPMV_BSR_ALG1,
+ ]
end
for SparseMatrixType in keys(SPMV_ALGOS)
@@ -67,7 +73,7 @@ for SparseMatrixType in keys(SPMV_ALGOS)
A = sprand(T, 20, 10, 0.1)
B = transa == 'N' ? rand(T, 10) : rand(T, 20)
C = transa == 'N' ? rand(T, 20) : rand(T, 10)
- dA = SparseMatrixType == CuSparseMatrixBSR ? SparseMatrixType(A,1) : SparseMatrixType(A)
+ dA = SparseMatrixType == CuSparseMatrixBSR ? SparseMatrixType(A, 1) : SparseMatrixType(A)
dB = CuArray(B)
dC = CuArray(C)
@@ -313,14 +319,20 @@ end
@test Z ≈ collect(dY)
end
-SPGEMM_ALGOS = Dict(CuSparseMatrixCSR => [CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
- CUSPARSE.CUSPARSE_SPGEMM_ALG1,
- CUSPARSE.CUSPARSE_SPGEMM_ALG2,
- CUSPARSE.CUSPARSE_SPGEMM_ALG3],
- CuSparseMatrixCSC => [CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
- CUSPARSE.CUSPARSE_SPGEMM_ALG1,
- CUSPARSE.CUSPARSE_SPGEMM_ALG2,
- CUSPARSE.CUSPARSE_SPGEMM_ALG3])
+SPGEMM_ALGOS = Dict(
+ CuSparseMatrixCSR => [
+ CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
+ CUSPARSE.CUSPARSE_SPGEMM_ALG1,
+ CUSPARSE.CUSPARSE_SPGEMM_ALG2,
+ CUSPARSE.CUSPARSE_SPGEMM_ALG3,
+ ],
+ CuSparseMatrixCSC => [
+ CUSPARSE.CUSPARSE_SPGEMM_DEFAULT,
+ CUSPARSE.CUSPARSE_SPGEMM_ALG1,
+ CUSPARSE.CUSPARSE_SPGEMM_ALG2,
+ CUSPARSE.CUSPARSE_SPGEMM_ALG3,
+ ]
+)
# Algorithms CUSPARSE.CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC and
# CUSPARSE.CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC are dedicated to the cusparseSpGEMMreuse routine.
@@ -406,9 +418,9 @@ for SparseMatrixType in keys(SDDMM_ALGOS)
mB = transb == 'N' ? 10 : 35
nB = transb == 'N' ? 35 : 10
- A = rand(T,mA,nA)
- B = rand(T,mB,nB)
- C = sprand(T,25,35,0.3)
+ A = rand(T, mA, nA)
+ B = rand(T, mB, nB)
+ C = sprand(T, 25, 35, 0.3)
spyC = copy(C)
spyC.nzval .= one(T)
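For context, a minimal usage sketch of what this PR enables, mirroring the pattern exercised in the updated tests: the generic mv! now also dispatches on CuSparseMatrixBSR, gated on CUSPARSE.version() >= v"12.6.3". The matrix size, block dimension, and scalars below are illustrative, not taken from the diff.

```julia
using CUDA, CUDA.CUSPARSE, SparseArrays

T = Float32
A = sprand(T, 20, 20, 0.1)       # square, so the block dimension divides both sizes
x, y = rand(T, 20), rand(T, 20)
alpha, beta = rand(T), rand(T)

dA = CuSparseMatrixBSR(A, 2)     # BSR storage with block dimension 2 (illustrative)
dx, dy = CuArray(x), CuArray(y)

# y ← α⋅A⋅x + β⋅y through the generic SpMV path; 'O' = one-based indexing
CUSPARSE.mv!('N', alpha, dA, dx, beta, dy, 'O')

@assert collect(dy) ≈ alpha * A * x + beta * y
```

Before this change, BSR matrices had to go through the legacy mv2! wrapper (cusparseXbsrmv) shown in level2.jl above; the generic path routes them through cusparseSpMV instead.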
Force-pushed from acabc12 to 4e9d911
Member
CI failures are related.
Force-pushed from 4e9d911 to e488f51
Force-pushed from e488f51 to 949b9f1
Contributor
CUDA.jl Benchmarks
| Benchmark suite | Current: 949b9f1 | Previous: 1af91be | Ratio |
|---|---|---|---|
| latency/precompile | 55490039074.5 ns | 55207707384.5 ns | 1.01 |
| latency/ttfp | 7796891791 ns | 7803466984 ns | 1.00 |
| latency/import | 4116374553.5 ns | 4119333235.5 ns | 1.00 |
| integration/volumerhs | 9622149.5 ns | 9616867 ns | 1.00 |
| integration/byval/slices=1 | 147475 ns | 147131 ns | 1.00 |
| integration/byval/slices=3 | 426384 ns | 426158 ns | 1.00 |
| integration/byval/reference | 145442 ns | 145358 ns | 1.00 |
| integration/byval/slices=2 | 286847 ns | 286555 ns | 1.00 |
| integration/cudadevrt | 103900 ns | 103753 ns | 1.00 |
| kernel/indexing | 14553 ns | 14494 ns | 1.00 |
| kernel/indexing_checked | 15310 ns | 15153 ns | 1.01 |
| kernel/occupancy | 681.0955414012739 ns | 670.8662420382166 ns | 1.02 |
| kernel/launch | 2285.6666666666665 ns | 2220.4444444444443 ns | 1.03 |
| kernel/rand | 15279.5 ns | 15661 ns | 0.98 |
| array/reverse/1d | 20402 ns | 20102.5 ns | 1.01 |
| array/reverse/2dL_inplace | 67174 ns | 67011 ns | 1.00 |
| array/reverse/1dL | 70558 ns | 70372 ns | 1.00 |
| array/reverse/2d | 22385 ns | 22007 ns | 1.02 |
| array/reverse/1d_inplace | 9921 ns | 9891 ns | 1.00 |
| array/reverse/2d_inplace | 13745 ns | 13546 ns | 1.01 |
| array/reverse/2dL | 74458 ns | 73927.5 ns | 1.01 |
| array/reverse/1dL_inplace | 67045 ns | 67056 ns | 1.00 |
| array/copy | 20946.5 ns | 20954 ns | 1.00 |
| array/iteration/findall/int | 158922 ns | 158738.5 ns | 1.00 |
| array/iteration/findall/bool | 140711 ns | 140481.5 ns | 1.00 |
| array/iteration/findfirst/int | 161583 ns | 161535 ns | 1.00 |
| array/iteration/findfirst/bool | 162766 ns | 162298 ns | 1.00 |
| array/iteration/scalar | 75067 ns | 73003.5 ns | 1.03 |
| array/iteration/logical | 219916.5 ns | 218149.5 ns | 1.01 |
| array/iteration/findmin/1d | 51106 ns | 53245 ns | 0.96 |
| array/iteration/findmin/2d | 96796 ns | 96825.5 ns | 1.00 |
| array/reductions/reduce/Int64/1d | 44165 ns | 43671 ns | 1.01 |
| array/reductions/reduce/Int64/dims=1 | 45382 ns | 44843.5 ns | 1.01 |
| array/reductions/reduce/Int64/dims=2 | 62194 ns | 61899 ns | 1.00 |
| array/reductions/reduce/Int64/dims=1L | 89589 ns | 89258 ns | 1.00 |
| array/reductions/reduce/Int64/dims=2L | 88530 ns | 88446 ns | 1.00 |
| array/reductions/reduce/Float32/1d | 38770 ns | 37742 ns | 1.03 |
| array/reductions/reduce/Float32/dims=1 | 42616.5 ns | 42436 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2 | 60796 ns | 60098 ns | 1.01 |
| array/reductions/reduce/Float32/dims=1L | 52811 ns | 52602 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2L | 72890 ns | 72179 ns | 1.01 |
| array/reductions/mapreduce/Int64/1d | 44275 ns | 43634 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=1 | 45089 ns | 46988 ns | 0.96 |
| array/reductions/mapreduce/Int64/dims=2 | 62455 ns | 61675 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=1L | 89664 ns | 89086 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2L | 88900 ns | 88150 ns | 1.01 |
| array/reductions/mapreduce/Float32/1d | 38160 ns | 36978 ns | 1.03 |
| array/reductions/mapreduce/Float32/dims=1 | 44935 ns | 48419.5 ns | 0.93 |
| array/reductions/mapreduce/Float32/dims=2 | 60584.5 ns | 60111 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=1L | 53236 ns | 52878 ns | 1.01 |
| array/reductions/mapreduce/Float32/dims=2L | 72526 ns | 72374.5 ns | 1.00 |
| array/broadcast | 20600 ns | 20123 ns | 1.02 |
| array/copyto!/gpu_to_gpu | 11544 ns | 13003 ns | 0.89 |
| array/copyto!/cpu_to_gpu | 216971 ns | 217546 ns | 1.00 |
| array/copyto!/gpu_to_cpu | 286227 ns | 285690 ns | 1.00 |
| array/accumulate/Int64/1d | 125453 ns | 124863 ns | 1.00 |
| array/accumulate/Int64/dims=1 | 84123 ns | 83917 ns | 1.00 |
| array/accumulate/Int64/dims=2 | 158707 ns | 158224 ns | 1.00 |
| array/accumulate/Int64/dims=1L | 1721075.5 ns | 1710808 ns | 1.01 |
| array/accumulate/Int64/dims=2L | 968366.5 ns | 966620 ns | 1.00 |
| array/accumulate/Float32/1d | 110148 ns | 109551.5 ns | 1.01 |
| array/accumulate/Float32/dims=1 | 81147 ns | 80701.5 ns | 1.01 |
| array/accumulate/Float32/dims=2 | 147843.5 ns | 148055.5 ns | 1.00 |
| array/accumulate/Float32/dims=1L | 1629124 ns | 1619581.5 ns | 1.01 |
| array/accumulate/Float32/dims=2L | 701850.5 ns | 698770 ns | 1.00 |
| array/construct | 1272.2 ns | 1306.1 ns | 0.97 |
| array/random/randn/Float32 | 50099.5 ns | 45766 ns | 1.09 |
| array/random/randn!/Float32 | 25474 ns | 25261 ns | 1.01 |
| array/random/rand!/Int64 | 27875 ns | 27478 ns | 1.01 |
| array/random/rand!/Float32 | 9045.666666666666 ns | 8968 ns | 1.01 |
| array/random/rand/Int64 | 30599 ns | 30173.5 ns | 1.01 |
| array/random/rand/Float32 | 13504.5 ns | 13273 ns | 1.02 |
| array/permutedims/4d | 56228.5 ns | 56320.5 ns | 1.00 |
| array/permutedims/2d | 54845 ns | 54500 ns | 1.01 |
| array/permutedims/3d | 55747 ns | 55121.5 ns | 1.01 |
| array/sorting/1d | 2777979 ns | 2758806 ns | 1.01 |
| array/sorting/by | 3357231.5 ns | 3345943 ns | 1.00 |
| array/sorting/2d | 1086142.5 ns | 1082452 ns | 1.00 |
| cuda/synchronization/stream/auto | 1027.2727272727273 ns | 1022.3076923076923 ns | 1.00 |
| cuda/synchronization/stream/nonblocking | 7193.2 ns | 7461.6 ns | 0.96 |
| cuda/synchronization/stream/blocking | 797.7777777777778 ns | 805.6082474226804 ns | 0.99 |
| cuda/synchronization/context/auto | 1171.6 ns | 1182.4 ns | 0.99 |
| cuda/synchronization/context/nonblocking | 7298.299999999999 ns | 7352 ns | 0.99 |
| cuda/synchronization/context/blocking | 882.3461538461538 ns | 901.8522727272727 ns | 0.98 |
This comment was automatically generated by a workflow using github-action-benchmark.
Support for CuSparseMatrixBSR in the generic sparse mv! was added with CUDA 13.0.1.
For mv! and mm! we don't have any high-level way to reuse the buffer / descriptor, so we just do more work for nothing.
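To illustrate that remark: every call on the generic path rebuilds the cuSPARSE descriptors and workspace before the actual kernel runs. The sketch below continues the earlier example and only restates the concern; the step names in the comments follow the cuSPARSE C API and are not CUDA.jl internals.

```julia
# Conceptually, each generic mv! call repeats the full SpMV setup:
#   1. create descriptors        (cusparseCreateCsr / cusparseCreateDnVec, ...)
#   2. query the workspace size  (cusparseSpMV_bufferSize)
#   3. allocate the workspace
#   4. run the product           (cusparseSpMV)
#   5. free descriptors and workspace
for _ in 1:100
    # A, x, and y are unchanged, yet steps 1-3 and 5 are redone every time;
    # a handle caching the descriptor and buffer across calls would skip them.
    CUSPARSE.mv!('N', alpha, dA, dx, beta, dy, 'O')
end
```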