
Commit bec638d

Merge pull request #146 from fverdugo/fix_spmm
Fix sparse matrix matrix products
2 parents 635bd41 + 8acf342

10 files changed: +250, -60 lines changed

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
@@ -5,11 +5,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## Unreleased
+## [0.4.5] - 2024-05-17
 
 ### Fixed
 
 - Bug in `copy`.
+- Bug in sparse matrix-matrix products.
+- Performance improvements in `tuple_of_arrays`.
+
+### Added
+
+- Function `centralize` for sparse matrix.
+- `multicast` for arbitrary types.
 
 ## [0.4.4] - 2024-02-20

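The fixed products are presumably the `spmm`/`spmtm` routines exported by the package (see `src/PartitionedArrays.jl` further down). A minimal sketch of calling them with the debug back-end; it assumes `spmm(A,B)` computes `A*B` and `spmtm(A,B)` computes `transpose(A)*B` for `PSparseMatrix` arguments, and that `laplace_matrix` and `DebugArray` are available as used in the tests touched by this commit:

```julia
using PartitionedArrays

# Sketch only (not part of the commit). Assumes spmm(A,B) computes A*B and
# spmtm(A,B) computes transpose(A)*B for PSparseMatrix arguments, and that
# laplace_matrix is available as in the tests of this repository.
nodes_per_dir = (10,10)
parts_per_dir = (2,2)
np = prod(parts_per_dir)
parts = DebugArray(LinearIndices((np,)))
A = laplace_matrix(nodes_per_dir,parts_per_dir,parts)
B = spmm(A,A)   # sparse matrix-matrix product A*A
C = spmtm(A,A)  # transpose(A)*A
@show size(B) size(C)
```
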
PartitionedSolvers/src/amg.jl

Lines changed: 8 additions & 4 deletions
@@ -369,9 +369,11 @@ function amg_setup(x,operator,b,amg_params)
     end
     r = similar(b)
     rc = similar(r,axes(Ac,2)) # we need ghost ids for the mul!(rc,R,r)
+    rc2 = similar(r,axes(P,2)) # TODO
     e = similar(x)
     ec = similar(e,axes(Ac,2))
-    level_setup = (;R,P,r,rc,e,ec,operator,coarse_operator,pre_setup,pos_setup,coarse_operator_setup)
+    ec2 = similar(e,axes(P,2)) # TODO
+    level_setup = (;R,P,r,rc,rc2,e,ec,ec2,operator,coarse_operator,pre_setup,pos_setup,coarse_operator_setup)
     x = ec
     b = rc
     operator = coarse_operator
@@ -400,15 +402,17 @@ function amg_cycle!(x,setup,b,level)
     level_params = amg_params.fine_params[level]
     level_setup = setup.fine_levels[level]
     (;pre_smoother,pos_smoother,cycle) = level_params
-    (;R,P,r,rc,e,ec,operator,coarse_operator,pre_setup,pos_setup) = level_setup
+    (;R,P,r,rc,rc2,e,ec,ec2,operator,coarse_operator,pre_setup,pos_setup) = level_setup
     solve!(pre_smoother)(x,pre_setup,b)
     A = matrix(operator)
     mul!(r,A,x)
     r .= b .- r
-    mul!(rc,R,r)
+    mul!(rc2,R,r)
+    rc .= rc2
     fill!(ec,zero(eltype(ec)))
     cycle(ec,setup,rc,level+1)
-    mul!(e,P,ec)
+    ec2 .= ec
+    mul!(e,P,ec2)
     x .+= e
     solve!(pos_smoother)(x,pos_setup,b)
     x

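The comment retained in `amg_setup` points at the underlying constraint: the vector arguments of `mul!` with a `PSparseMatrix` must be partitioned with the ghost ids of the corresponding matrix axes, so the cycle now goes through the extra buffers `rc2` and `ec2` (allocated with `axes(P,2)`) and copies values between layouts. A generic sketch of this pattern follows; the helper names are hypothetical and not part of the package:

```julia
using PartitionedArrays
using LinearAlgebra

# y = A*x where y is stored with a different (but value-compatible) layout:
# compute into a buffer that matches the row axis of A, then copy.
function mul_into_buffer!(y, A, x, ybuf = similar(y, axes(A, 1)))
    mul!(ybuf, A, x)
    y .= ybuf
    y
end

# y = A*x where x is stored with a different (but value-compatible) layout:
# copy into a buffer that matches the column axis of A, then multiply.
function mul_from_buffer!(y, A, x, xbuf = similar(x, axes(A, 2)))
    xbuf .= x
    mul!(y, A, xbuf)
    y
end
```
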
PartitionedSolvers/test/amg_tests.jl

Lines changed: 18 additions & 3 deletions
@@ -65,11 +65,9 @@ Pl = preconditioner(amg(;fine_params),y,A,b)
 y .= 0
 cg!(y,A,b;Pl,verbose=true)
 
-
-
 # Now in parallel
 
-parts_per_dir = (1,2)
+parts_per_dir = (2,2)
 np = prod(parts_per_dir)
 parts = DebugArray(LinearIndices((np,)))
 
@@ -91,6 +89,7 @@ finalize!(solver)(S)
 
 # Now with a nullspace
 
+solver = amg()
 O = attach_nullspace(A,default_nullspace(A))
 S = setup(solver)(y,O,b)
 solve!(solver)(y,S,b)
@@ -120,4 +119,20 @@ Pl = preconditioner(solver,y,A,b)
 y .= 0
 cg!(y,A,b;Pl,verbose=true)
 
+
+println("----")
+nodes_per_dir = (40,40,40)
+parts_per_dir = (2,2,1)
+nparts = prod(parts_per_dir)
+parts = LinearIndices((nparts,))
+A = laplace_matrix(nodes_per_dir,parts_per_dir,parts)
+x_exact = pones(partition(axes(A,2)))
+b = A*x_exact
+x = similar(b,axes(A,2))
+x .= 0
+Pl = preconditioner(amg(),x,A,b)
+_, history = cg!(x,A,b;Pl,log=true)
+display(history)
+
+
 end

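The new parallel test only displays the CG convergence history. A possible extra check (not part of the commit) is to compare the computed solution with the manufactured `x_exact` defined above; the tolerance is illustrative:

```julia
using Test
using LinearAlgebra

# x and x_exact as defined in the new test block above.
@test norm(x - x_exact) / norm(x_exact) < 1.0e-6  # illustrative tolerance
```
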
Project.toml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 name = "PartitionedArrays"
 uuid = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
 authors = ["Francesc Verdugo <f.verdugo.rojano@vu.nl> and contributors"]
-version = "0.4.4"
+version = "0.4.5"
 
 [deps]
 CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"

docs/Manifest.toml

Lines changed: 2 additions & 2 deletions
@@ -313,11 +313,11 @@ version = "2.8.1"
 deps = ["CircularArrays", "Distances", "FillArrays", "IterativeSolvers", "LinearAlgebra", "MPI", "Printf", "Random", "SparseArrays", "SparseMatricesCSR"]
 path = ".."
 uuid = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
-version = "0.4.3"
+version = "0.4.5"
 
 [[deps.PartitionedSolvers]]
 deps = ["IterativeSolvers", "LinearAlgebra", "PartitionedArrays", "Random", "SparseArrays"]
-path = "../extensions/PartitionedSolvers"
+path = "../PartitionedSolvers"
 uuid = "11b65f7f-80ac-401b-9ef2-3db765482d62"
 version = "0.1.0"

docs/src/usage.md

Lines changed: 119 additions & 3 deletions
@@ -140,8 +140,124 @@ the error.
 
 When using MPI, the computational time to run some code can be different for each one of
 the processes. Usually, one measures the time for each process and computes some statistics
-of the resulting values. To this end, the library provides a special timer type called
-[`PTimer`](@ref).
+of the resulting values. This is done by taking time measurements with the tool of your choice and then `gather`ing the results
+on the root for further analysis. Note that this is possible thanks to the changes in version 0.4.1
+that allow one to use `gather` on arbitrary objects.
+
+In the following example, we force different computation times at each of the processes
+by sleeping a value proportional to the rank id. We gather all the timings in the main process and compute some statistics:
+
+```julia
+using PartitionedArrays
+using Statistics
+with_mpi() do distribute
+    np = 3
+    ranks = distribute(LinearIndices((np,)))
+    t = @elapsed map(ranks) do rank
+        sleep(rank)
+    end
+    ts = gather(map(rank->t,ranks))
+    map_main(ts) do ts
+        @show ts
+        @show maximum(ts)
+        @show minimum(ts)
+        @show Statistics.mean(ts)
+    end
+end
+```
+
+```
+ts = [1.001268313, 2.0023204, 3.001216396]
+maximum(ts) = 3.001216396
+minimum(ts) = 1.001268313
+Statistics.mean(ts) = 2.001601703
+```
+
+This mechanism also works for the other back-ends. For sequential ones, it provides the time
+spent by all parts combined. Note how we define `t` (outside the call to `map`) and the object passed to `gather`.
+
+```julia
+using PartitionedArrays
+using Statistics
+with_debug() do distribute
+    np = 3
+    ranks = distribute(LinearIndices((np,)))
+    t = @elapsed map(ranks) do rank
+        sleep(rank)
+    end
+    ts = gather(map(rank->t,ranks))
+    map_main(ts) do ts
+        @show ts
+        @show maximum(ts)
+        @show minimum(ts)
+        @show Statistics.mean(ts)
+    end
+end;
+```
+
+```
+ts = [6.009726399, 6.009726399, 6.009726399]
+maximum(ts) = 6.009726399
+minimum(ts) = 6.009726399
+Statistics.mean(ts) = 6.009726398999999
+```
+
+We can also consider more sophisticated ways of measuring the times, e.g., with [TimerOutputs](https://github.com/KristofferC/TimerOutputs.jl).
+
+```julia
+using PartitionedArrays
+using Statistics
+using TimerOutputs
+with_mpi() do distribute
+    np = 3
+    ranks = distribute(LinearIndices((np,)))
+    to = TimerOutput()
+    @timeit to "phase 1" map(ranks) do rank
+        sleep(rank)
+    end
+    @timeit to "phase 2" map(ranks) do rank
+        sleep(2*rank)
+    end
+    tos = gather(map(rank->to,ranks))
+    map_main(tos) do tos
+        # check the timings on the first rank
+        display(tos[1])
+        # compute statistics for phase 1
+        ts = map(tos) do to
+            TimerOutputs.time(to["phase 1"])
+        end
+        @show ts
+        @show maximum(ts)
+        @show minimum(ts)
+        @show Statistics.mean(ts)
+    end
+end
+```
+
+```
+ ────────────────────────────────────────────────────────────────────
+                              Time                    Allocations
+                     ───────────────────────   ────────────────────────
+  Tot / % measured:       10.3s / 29.3%            44.9MiB / 0.0%
+
+ Section    ncalls     time    %tot     avg     alloc    %tot      avg
+ ────────────────────────────────────────────────────────────────────
+ phase 2         1    2.00s   66.6%   2.00s     120B    50.0%     120B
+ phase 1         1    1.00s   33.4%   1.00s     120B    50.0%     120B
+ ────────────────────────────────────────────────────────────────────
+ts = [1002323746, 2001614329, 3004363808]
+maximum(ts) = 3004363808
+minimum(ts) = 1002323746
+Statistics.mean(ts) = 2.0027672943333333e9
+```
+
+In addition, the library provides a special timer type called [`PTimer`](@ref).
+
+!!! note
+    `PTimer` has been deprecated. Do time measurements with the tool of your choice and then `gather` the results
+    on the root for further analysis (see above).
+
+
 In the following example we force different computation times at each of the processes
 by sleeping a value proportional to the rank id.
 When displayed, the instance of [`PTimer`](@ref) shows some statistics of the
@@ -170,7 +286,7 @@ Sleep 3.021e+00 1.021e+00 2.021e+00
 ───────────────────────────────────────────
 ```
 
-This mechanism also works for the other back-ends. For sequential ones, it provides the type
+This mechanism also works for the other back-ends. For sequential ones, it provides the time
 spend by all parts combined.
 
 ```julia

src/PartitionedArrays.jl

Lines changed: 1 addition & 0 deletions
@@ -158,6 +158,7 @@ export spmm
 export spmm!
 export spmtm
 export spmtm!
+export centralize
 include("p_sparse_matrix.jl")
 
 export PTimer

src/mpi_array.jl

Lines changed: 13 additions & 3 deletions
@@ -392,10 +392,19 @@ function multicast_impl!(
     @assert rcv.comm === snd.comm
     comm = snd.comm
     root = source - 1
-    if MPI.Comm_rank(comm) == root
-        rcv.item = snd.item
+    if isbitstype(T)
+        if MPI.Comm_rank(comm) == root
+            rcv.item = snd.item
+        end
+        MPI.Bcast!(rcv.item_ref,root,comm)
+    else
+        if MPI.Comm_rank(comm) == root
+            rcv.item_ref[] = MPI.bcast(snd.item,comm;root)
+        else
+            rcv.item_ref[] = MPI.bcast(nothing,comm;root)
+        end
     end
-    MPI.Bcast!(rcv.item_ref,root,comm)
+    rcv
 end
 
 function multicast_impl!(
@@ -408,6 +417,7 @@ function multicast_impl!(
         rcv.item = snd.item
     end
     MPI.Bcast!(rcv.item,root,comm)
+    rcv
 end
 
 function scan!(op,b::MPIArray,a::MPIArray;init,type)

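This is the change behind the changelog entry "`multicast` for arbitrary types": non-`isbits` items are now sent with `MPI.bcast` instead of `MPI.Bcast!`. A hedged sketch of using it, assuming `multicast(snd;source=1)` replicates the item stored on the source part (check the `multicast` docstring for the exact interface):

```julia
using PartitionedArrays

# Sketch only: broadcast a non-isbits item (a String) from part 1 to all parts.
# With the MPI back-end this now goes through MPI.bcast instead of MPI.Bcast!.
with_mpi() do distribute
    np = 3
    ranks = distribute(LinearIndices((np,)))
    snd = map(rank -> rank == 1 ? "hello from part 1" : "", ranks)
    rcv = multicast(snd; source = 1)  # assumed keyword; see docstring
    map(ranks, rcv) do rank, item
        @show rank item
    end
end
```
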