
Commit bec638d

Merge pull request #146 from fverdugo/fix_spmm
Fix sparse matrix matrix products
2 parents 635bd41 + 8acf342

10 files changed: +250, -60 lines changed

CHANGELOG.md

Lines changed: 8 additions & 1 deletion
@@ -5,11 +5,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## Unreleased
+## [0.4.5] - 2024-05-17
 
 ### Fixed
 
 - Bug in `copy`.
+- Bug in sparse matrix-matrix products.
+- Performance improvements in `tuple_of_arrays`.
+
+### Added
+
+- Function `centralize` for sparse matrix.
+- `multicast` for arbitrary types.
 
 ## [0.4.4] - 2024-02-20

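The fixed products are presumably the `spmm`/`spmtm` routines exported by the package (see `src/PartitionedArrays.jl` further down). A minimal sketch of calling them with the debug back-end; it assumes `spmm(A,B)` computes `A*B` and `spmtm(A,B)` computes `transpose(A)*B` for `PSparseMatrix` arguments, and that `laplace_matrix` and `DebugArray` are available as used in the tests touched by this commit:

```julia
using PartitionedArrays

# Sketch only (not part of the commit). Assumes spmm(A,B) computes A*B and
# spmtm(A,B) computes transpose(A)*B for PSparseMatrix arguments, and that
# laplace_matrix is available as in the tests of this repository.
nodes_per_dir = (10,10)
parts_per_dir = (2,2)
np = prod(parts_per_dir)
parts = DebugArray(LinearIndices((np,)))
A = laplace_matrix(nodes_per_dir,parts_per_dir,parts)
B = spmm(A,A)   # sparse matrix-matrix product A*A
C = spmtm(A,A)  # transpose(A)*A
@show size(B) size(C)
```
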
PartitionedSolvers/src/amg.jl

Lines changed: 8 additions & 4 deletions
@@ -369,9 +369,11 @@ function amg_setup(x,operator,b,amg_params)
     end
     r = similar(b)
     rc = similar(r,axes(Ac,2)) # we need ghost ids for the mul!(rc,R,r)
+    rc2 = similar(r,axes(P,2)) # TODO
     e = similar(x)
     ec = similar(e,axes(Ac,2))
-    level_setup = (;R,P,r,rc,e,ec,operator,coarse_operator,pre_setup,pos_setup,coarse_operator_setup)
+    ec2 = similar(e,axes(P,2)) # TODO
+    level_setup = (;R,P,r,rc,rc2,e,ec,ec2,operator,coarse_operator,pre_setup,pos_setup,coarse_operator_setup)
     x = ec
     b = rc
     operator = coarse_operator
@@ -400,15 +402,17 @@ function amg_cycle!(x,setup,b,level)
     level_params = amg_params.fine_params[level]
     level_setup = setup.fine_levels[level]
     (;pre_smoother,pos_smoother,cycle) = level_params
-    (;R,P,r,rc,e,ec,operator,coarse_operator,pre_setup,pos_setup) = level_setup
+    (;R,P,r,rc,rc2,e,ec,ec2,operator,coarse_operator,pre_setup,pos_setup) = level_setup
     solve!(pre_smoother)(x,pre_setup,b)
     A = matrix(operator)
     mul!(r,A,x)
     r .= b .- r
-    mul!(rc,R,r)
+    mul!(rc2,R,r)
+    rc .= rc2
     fill!(ec,zero(eltype(ec)))
     cycle(ec,setup,rc,level+1)
-    mul!(e,P,ec)
+    ec2 .= ec
+    mul!(e,P,ec2)
     x .+= e
     solve!(pos_smoother)(x,pos_setup,b)
     x

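The comment retained in `amg_setup` points at the underlying constraint: the vector arguments of `mul!` with a `PSparseMatrix` must be partitioned with the ghost ids of the corresponding matrix axes, so the cycle now goes through the extra buffers `rc2` and `ec2` (allocated with `axes(P,2)`) and copies values between layouts. A generic sketch of this pattern follows; the helper names are hypothetical and not part of the package:

```julia
using PartitionedArrays
using LinearAlgebra

# y = A*x where y is stored with a different (but value-compatible) layout:
# compute into a buffer that matches the row axis of A, then copy.
function mul_into_buffer!(y, A, x, ybuf = similar(y, axes(A, 1)))
    mul!(ybuf, A, x)
    y .= ybuf
    y
end

# y = A*x where x is stored with a different (but value-compatible) layout:
# copy into a buffer that matches the column axis of A, then multiply.
function mul_from_buffer!(y, A, x, xbuf = similar(x, axes(A, 2)))
    xbuf .= x
    mul!(y, A, xbuf)
    y
end
```
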
PartitionedSolvers/test/amg_tests.jl

Lines changed: 18 additions & 3 deletions
@@ -65,11 +65,9 @@ Pl = preconditioner(amg(;fine_params),y,A,b)
 y .= 0
 cg!(y,A,b;Pl,verbose=true)
 
-
-
 # Now in parallel
 
-parts_per_dir = (1,2)
+parts_per_dir = (2,2)
 np = prod(parts_per_dir)
 parts = DebugArray(LinearIndices((np,)))
 
@@ -91,6 +89,7 @@ finalize!(solver)(S)
 
 # Now with a nullspace
 
+solver = amg()
 O = attach_nullspace(A,default_nullspace(A))
 S = setup(solver)(y,O,b)
 solve!(solver)(y,S,b)
@@ -120,4 +119,20 @@ Pl = preconditioner(solver,y,A,b)
 y .= 0
 cg!(y,A,b;Pl,verbose=true)
 
+
+println("----")
+nodes_per_dir = (40,40,40)
+parts_per_dir = (2,2,1)
+nparts = prod(parts_per_dir)
+parts = LinearIndices((nparts,))
+A = laplace_matrix(nodes_per_dir,parts_per_dir,parts)
+x_exact = pones(partition(axes(A,2)))
+b = A*x_exact
+x = similar(b,axes(A,2))
+x .= 0
+Pl = preconditioner(amg(),x,A,b)
+_, history = cg!(x,A,b;Pl,log=true)
+display(history)
+
+
 end

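The new parallel test only displays the CG convergence history. A possible extra check (not part of the commit) is to compare the computed solution with the manufactured `x_exact` defined above; the tolerance is illustrative:

```julia
using Test
using LinearAlgebra

# x and x_exact as defined in the new test block above.
@test norm(x - x_exact) / norm(x_exact) < 1.0e-6  # illustrative tolerance
```
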
Project.toml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 name = "PartitionedArrays"
 uuid = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
 authors = ["Francesc Verdugo <f.verdugo.rojano@vu.nl> and contributors"]
-version = "0.4.4"
+version = "0.4.5"
 
 [deps]
 CircularArrays = "7a955b69-7140-5f4e-a0ed-f168c5e2e749"

docs/Manifest.toml

Lines changed: 2 additions & 2 deletions
@@ -313,11 +313,11 @@ version = "2.8.1"
 deps = ["CircularArrays", "Distances", "FillArrays", "IterativeSolvers", "LinearAlgebra", "MPI", "Printf", "Random", "SparseArrays", "SparseMatricesCSR"]
 path = ".."
 uuid = "5a9dfac6-5c52-46f7-8278-5e2210713be9"
-version = "0.4.3"
+version = "0.4.5"
 
 [[deps.PartitionedSolvers]]
 deps = ["IterativeSolvers", "LinearAlgebra", "PartitionedArrays", "Random", "SparseArrays"]
-path = "../extensions/PartitionedSolvers"
+path = "../PartitionedSolvers"
 uuid = "11b65f7f-80ac-401b-9ef2-3db765482d62"
 version = "0.1.0"

docs/src/usage.md

Lines changed: 119 additions & 3 deletions
@@ -140,8 +140,124 @@ the error.
 
 When using MPI, the computational time to run some code can be different for each one of
 the processes. Usually, one measures the time for each process and computes some statistics
-of the resulting values. To this end, the library provides a special timer type called
-[`PTimer`](@ref).
+of the resulting values. This is done by taking time measurements with the tool of your choice and then `gather`ing the results
+on the root for further analysis. Note that this is possible thanks to the changes in version 0.4.1
+that allow one to use `gather` on arbitrary objects.
+
+In the following example, we force different computation times at each of the processes
+by sleeping a value proportional to the rank id. We gather all the timings in the main process and compute some statistics:
+
+```julia
+using PartitionedArrays
+using Statistics
+with_mpi() do distribute
+    np = 3
+    ranks = distribute(LinearIndices((np,)))
+    t = @elapsed map(ranks) do rank
+        sleep(rank)
+    end
+    ts = gather(map(rank->t,ranks))
+    map_main(ts) do ts
+        @show ts
+        @show maximum(ts)
+        @show minimum(ts)
+        @show Statistics.mean(ts)
+    end
+end
+```
+
+```
+ts = [1.001268313, 2.0023204, 3.001216396]
+maximum(ts) = 3.001216396
+minimum(ts) = 1.001268313
+Statistics.mean(ts) = 2.001601703
+```
+
+This mechanism also works for the other back-ends. For sequential ones, it provides the time
+spent by all parts combined. Note how we define `t` (outside the call to `map`) and the object passed to `gather`.
+
+```julia
+using PartitionedArrays
+using Statistics
+with_debug() do distribute
+    np = 3
+    ranks = distribute(LinearIndices((np,)))
+    t = @elapsed map(ranks) do rank
+        sleep(rank)
+    end
+    ts = gather(map(rank->t,ranks))
+    map_main(ts) do ts
+        @show ts
+        @show maximum(ts)
+        @show minimum(ts)
+        @show Statistics.mean(ts)
+    end
+end;
+```
+
+```
+ts = [6.009726399, 6.009726399, 6.009726399]
+maximum(ts) = 6.009726399
+minimum(ts) = 6.009726399
+Statistics.mean(ts) = 6.009726398999999
+```
+
+We can also consider more sophisticated ways of measuring the times, e.g., with [TimerOutputs](https://github.com/KristofferC/TimerOutputs.jl).
+
+```julia
+using PartitionedArrays
+using Statistics
+using TimerOutputs
+with_mpi() do distribute
+    np = 3
+    ranks = distribute(LinearIndices((np,)))
+    to = TimerOutput()
+    @timeit to "phase 1" map(ranks) do rank
+        sleep(rank)
+    end
+    @timeit to "phase 2" map(ranks) do rank
+        sleep(2*rank)
+    end
+    tos = gather(map(rank->to,ranks))
+    map_main(tos) do tos
+        # check the timings on the first rank
+        display(tos[1])
+        # compute statistics for phase 1
+        ts = map(tos) do to
+            TimerOutputs.time(to["phase 1"])
+        end
+        @show ts
+        @show maximum(ts)
+        @show minimum(ts)
+        @show Statistics.mean(ts)
+    end
+end
+```
+
+```
+ ────────────────────────────────────────────────────────────────────
+                              Time                    Allocations
+                     ───────────────────────   ────────────────────────
+  Tot / % measured:       10.3s / 29.3%            44.9MiB / 0.0%
+
+ Section    ncalls     time    %tot     avg     alloc    %tot      avg
+ ────────────────────────────────────────────────────────────────────
+ phase 2         1    2.00s   66.6%   2.00s     120B    50.0%     120B
+ phase 1         1    1.00s   33.4%   1.00s     120B    50.0%     120B
+ ────────────────────────────────────────────────────────────────────
+ts = [1002323746, 2001614329, 3004363808]
+maximum(ts) = 3004363808
+minimum(ts) = 1002323746
+Statistics.mean(ts) = 2.0027672943333333e9
+```
+
+In addition, the library provides a special timer type called [`PTimer`](@ref).
+
+!!! note
+    `PTimer` has been deprecated. Do time measurements with the tool of your choice and then `gather` the results
+    on the root for further analysis (see above).
+
+
 In the following example we force different computation times at each of the processes
 by sleeping a value proportional to the rank id.
 When displayed, the instance of [`PTimer`](@ref) shows some statistics of the
@@ -170,7 +286,7 @@ Sleep 3.021e+00 1.021e+00 2.021e+00
 ───────────────────────────────────────────
 ```
 
-This mechanism also works for the other back-ends. For sequential ones, it provides the type
+This mechanism also works for the other back-ends. For sequential ones, it provides the time
 spend by all parts combined.
 
 ```julia

src/PartitionedArrays.jl

Lines changed: 1 addition & 0 deletions
@@ -158,6 +158,7 @@ export spmm
 export spmm!
 export spmtm
 export spmtm!
+export centralize
 include("p_sparse_matrix.jl")
 
 export PTimer

src/mpi_array.jl

Lines changed: 13 additions & 3 deletions
@@ -392,10 +392,19 @@ function multicast_impl!(
     @assert rcv.comm === snd.comm
     comm = snd.comm
     root = source - 1
-    if MPI.Comm_rank(comm) == root
-        rcv.item = snd.item
+    if isbitstype(T)
+        if MPI.Comm_rank(comm) == root
+            rcv.item = snd.item
+        end
+        MPI.Bcast!(rcv.item_ref,root,comm)
+    else
+        if MPI.Comm_rank(comm) == root
+            rcv.item_ref[] = MPI.bcast(snd.item,comm;root)
+        else
+            rcv.item_ref[] = MPI.bcast(nothing,comm;root)
+        end
     end
-    MPI.Bcast!(rcv.item_ref,root,comm)
+    rcv
 end
 
 function multicast_impl!(
@@ -408,6 +417,7 @@ function multicast_impl!(
         rcv.item = snd.item
     end
     MPI.Bcast!(rcv.item,root,comm)
+    rcv
 end
 
 function scan!(op,b::MPIArray,a::MPIArray;init,type)

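This is the change behind the changelog entry "`multicast` for arbitrary types": non-`isbits` items are now sent with `MPI.bcast` instead of `MPI.Bcast!`. A hedged sketch of using it, assuming `multicast(snd;source=1)` replicates the item stored on the source part (check the `multicast` docstring for the exact interface):

```julia
using PartitionedArrays

# Sketch only: broadcast a non-isbits item (a String) from part 1 to all parts.
# With the MPI back-end this now goes through MPI.bcast instead of MPI.Bcast!.
with_mpi() do distribute
    np = 3
    ranks = distribute(LinearIndices((np,)))
    snd = map(rank -> rank == 1 ? "hello from part 1" : "", ranks)
    rcv = multicast(snd; source = 1)  # assumed keyword; see docstring
    map(ranks, rcv) do rank, item
        @show rank item
    end
end
```
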