new performance tips

GiggleLiu · GiggleLiu · commit d1656ec7318e · 2022-02-09T20:43:15.000-05:00
diff --git a/README.md b/README.md
@@ -50,117 +50,79 @@ julia> using GraphTensorNetworks, Random, Graphs
 julia> graph = (Random.seed!(2); Graphs.smallgraph(:petersen))
 {10, 15} undirected simple Int64 graph
 
-julia> problem = IndependentSet(graph; optimizer=TreeSA(sc_target=0, sc_weight=1.0, ntrials=10, βs=0.01:0.1:15.0, niters=20, rw_weight=0.2));
-┌ Warning: target space complexity not found, got: 4.0, with time complexity 7.965784284662087, read-right complexity 8.661778097771988.
-└ @ OMEinsumContractionOrders ~/.julia/dev/OMEinsumContractionOrders/src/treesa.jl:71
-time/space complexity is (7.965784284662086, 4.0)
+julia> problem = IndependentSet(graph);
 ```
 
-Here, the `problem` is a `IndependentSet` instance, it contains the tensor network contraction tree for the target graph.
-Here, we choose the `TreeSA` optimizer to optimize the tensor network contraciton tree, it is a local search based algorithm, check [arXiv: 2108.05665](https://arxiv.org/abs/2108.05665). You will see some warnings, do not panic, this is because we set `sc_target` (target space complex) to 1 for agressive optimization of space complexity. Type `?TreeSA` in a Julia REPL for more information about the key word arguments.
-Similarly, one can select tensor network structures for solving other problems like `MaximalIS`, `MaxCut`, `Matching`, `Coloring{K}`, `PaintShop` and `set_packing`.
+Here, the `problem` is an `IndependentSet` instance; it contains the tensor network contraction tree for the target graph (the `code` field).
 
 #### 1. find MIS size, count MISs and count ISs
+* maximum independent set size
 ```julia
-# maximum independent set size
-julia> solve(problem, "size max")
-0-dimensional Array{TropicalNumbers.TropicalF64, 0}:
+julia> solve(problem, SizeMax())[]
 4.0ₜ
+```
+Here, the `solve` function returns you a 0-dimensional array.
+For open graphs, this output tensor can have nonzero dimensionality. Each entry corresponds to a different boundary condition.
 
-# all independent sets
-julia> solve(problem, "counting sum")
-0-dimensional Array{Float64, 0}:
+* all independent sets
+```julia
+julia> solve(problem, CountingAll())[]
 76.0
-
-# counting maximum independent sets
-julia> solve(problem, "counting max")
-0-dimensional Array{TropicalNumbers.CountingTropicalF64, 0}:
-(4.0, 5.0)ₜ
-
-# counting independent sets of max two sizes
-julia> solve(problem, "counting max2")
-0-dimensional Array{Max2Poly{Float64, Float64}, 0}:
-30.0*x^3 + 5.0*x^4
 ```
 
-Here, `solve` function returns you a 0-dimensional array.
-For open graphs, this output tensor can have higher dimensions. Each entry corresponds to a different boundary condition.
-You can speed up the Tropical number computation when computing "size max" on CPU by using the `TropicalGEMM`
+* counting maximum independent sets
+```julia
+julia> solve(problem, CountingMax())[]
+(4.0, 5.0)ₜ  # first field is MIS size, second is its counting.
+```
 
+* counting independent sets of max two sizes with truncated polynomial
 ```julia
-julia> using TropicalGEMM
+julia> solve(problem, CountingMax(2))[]
+0-dimensional Array{Max2Poly{Float64, Float64}, 0}:
+30.0*x^3 + 5.0*x^4
 ```
 
 #### 2. compute the independence polynomial
 
-```julia
-# using `Polynomial` type
-julia> solve(problem, "counting all")
-0-dimensional Array{Polynomial{Float64, :x}, 0}:
-Polynomial(1.0 + 10.0*x + 30.0*x^2 + 30.0*x^3 + 5.0*x^4)
-
-# using the finitefield approach
-julia> solve(problem, "counting all (finitefield)")
-0-dimensional Array{Polynomial{BigInt, :x}, 0}:
-Polynomial(1 + 10*x + 30*x^2 + 30*x^3 + 5*x^4)
-
-# using the fourier approach
-julia> solve(problem, "counting all (fft)", r=1.0)
-0-dimensional Array{Polynomial{ComplexF64, :x}, 0}:
-Polynomial(1.0000000000000029 + 2.664535259100376e-16im + (10.000000000000004 - 1.9512435398857492e-16im)x + (30.0 - 1.9622216671393801e-16im)x^2 + (30.0 + 1.1553104311877194e-15im)x^3 + (5.0 - 1.030417436395244e-15im)x^4)
-```
+The following code computes the independence polynomial using the finite field algebra (default) approach.
 
-The `finitefield` approach is the most recommended, because it has no round off error is can be upload to GPU. To upload the computing to GPU,
 ```julia
-julia> using CUDA
-[ Info: OMEinsum loaded the CUDA module successfully
-
-julia> solve(problem, "counting all (finitefield)", usecuda=true)
-0-dimensional Array{Polynomial{BigInt, :x}, 0}:
+julia> solve(problem, GraphPolynomial())[]
 Polynomial(1 + 10*x + 30*x^2 + 30*x^3 + 5*x^4)
 ```
 
-The `fft` approach is fast but with round off errors. Its imaginary part can be regarded as the precision,
-keyword argument `r` controls the round off errors in high/low IS size region.
+The program uses the `finitefield` method as the default approach, because it has no round-off error and can be uploaded to GPU.
 
 #### 3. find/enumerate solutions
+* find one of the best solutions,
 ```julia
-# one of MISs
-julia> solve(problem, "config max")
-0-dimensional Array{CountingTropical{Float64, ConfigSampler{10, 1, 1}}, 0}:
-(4.0, ConfigSampler{10, 1, 1}(1010000011))ₜ
-
-julia> solve(problem, "config max (bounded)")
-0-dimensional Array{CountingTropical{Float64, ConfigSampler{10, 1, 1}}, 0}:
+julia> solve(problem, SingleConfigMax())[]
 (4.0, ConfigSampler{10, 1, 1}(1010000011))ₜ
+```
 
-# enumerate all MISs
-julia> solve(problem, "configs max")  # not recommended
-0-dimensional Array{CountingTropical{Float64, ConfigEnumerator{10, 1, 1}}, 0}:
-(4.0, {1010000011, 0100100110, 1001001100, 0010111000, 0101010001})ₜ
-
-julia> solve(problem, "configs max (bounded)")
+* enumerate all MISs
+```julia
+julia> cs = solve(problem, ConfigsMax())[]
 0-dimensional Array{CountingTropical{Int64, ConfigEnumerator{10, 1, 1}}, 0}:
 (4, {1010000011, 0100100110, 1001001100, 0010111000, 0101010001})ₜ
-
-# enumerate all MIS and MIS-1 configurations
-julia> solve(problem, "configs max2")
-0-dimensional Array{Max2Poly{ConfigEnumerator{10, 1, 1}, Float64}, 0}:
-{0010101000, 0101000001, 0100100010, 0010100010, 0100000011, 0010000011, 1001001000, 1010001000, 1001000001, 1010000001, 1010000010, 1000000011, 0100100100, 0000101100, 0101000100, 0001001100, 0000100110, 0100000110, 1001000100, 1000001100, 1000000110, 0100110000, 0000111000, 0101010000, 0001011000, 0010110000, 0010011000, 0001010001, 0100010001, 0010010001}*x^3 + {1010000011, 0100100110, 1001001100, 0010111000, 0101010001}*x^4
-
-# enumerate all IS configurations
-julia> solve(problem, "configs all")
-0-dimensional Array{Polynomial{ConfigEnumerator{10, 1, 1}, :x}, 0}:
-Polynomial({0000000000} + {0010000000, 0000100000, 0001000000, 0100000000, 0000001000, 0000000001, 0000000010, 1000000000, 0000000100, 0000010000}*x + {1000000010, 0010100000, 0010001000, 0100100000, 0000101000, 0101000000, 0001001000, 0001000001, 0100000001, 0010000001, 0000100010, 0100000010, 0010000010, 0000000011, 1001000000, 1000001000, 1010000000, 1000000001, 0000000110, 0000100100, 0001000100, 0100000100, 0000001100, 1000000100, 0010010000, 0000110000, 0001010000, 0100010000, 0000011000, 0000010001}*x^2 + {1010000010, 1000000011, 0010101000, 0101000001, 0100100010, 0010100010, 0100000011, 0010000011, 1001001000, 1010001000, 1001000001, 1010000001, 0000100110, 0100000110, 0100100100, 0000101100, 0101000100, 0001001100, 1001000100, 1000001100, 1000000110, 0010110000, 0010011000, 0100110000, 0000111000, 0101010000, 0001011000, 0001010001, 0100010001, 0010010001}*x^3 + {1010000011, 0100100110, 1001001100, 0010111000, 0101010001}*x^4)
 ```
-
-If you want to enumerate all MISs, we highly recommend using the bounded version to save the computational effort. One can also store the configurations on your disk by typing
+It will use the bounded version to save the computational effort.  If you want to save/load your configurations, you can type
 ```julia
-julia> cs = solve(problem, "configs max (bounded)")[1].c  # the `c` field is a `ConfigEnumerator`
-{1010000011, 0100100110, 1001001100, 0010111000, 0101010001}
-
-julia> save_configs("configs.dat", cs; format=:text)  # `:text` or `:binary`
+julia> save_configs("configs.dat", cs.c; format=:text)  # `:text` or `:binary`
 
 julia> load_configs("configs.dat"; format=:text)
 {1010000011, 0100100110, 1001001100, 0010111000, 0101010001}
 ```
+
+* enumerate all configurations of size α(G) and α(G)-1
+```julia
+julia> solve(problem, ConfigsMax(2))[]
+{0010101000, 0101000001, 0100100010, 0010100010, 0100000011, 0010000011, 1001001000, 1010001000, 1001000001, 1010000001, 1010000010, 1000000011, 0100100100, 0000101100, 0101000100, 0001001100, 0000100110, 0100000110, 1001000100, 1000001100, 1000000110, 0100110000, 0000111000, 0101010000, 0001011000, 0010110000, 0010011000, 0001010001, 0100010001, 0010010001}*x^3 + {1010000011, 0100100110, 1001001100, 0010111000, 0101010001}*x^4
+```
+
+* enumerate all independent sets
+```julia
+julia> solve(problem, ConfigsAll())[]
+{0000000000, 0000010000, 1000000000, 0001000000, 0001010000, 1001000000, 0010000000, 0010010000, 1010000000, 0000001000, 0000011000, 1000001000, 0001001000, 0001011000, 1001001000, 0010001000, 0010011000, 1010001000, 0000000010, 1000000010, 0010000010, 1010000010, 0100000000, 0100010000, 0101000000, 0101010000, 0100000010, 0000000100, 1000000100, 0001000100, 1001000100, 0000001100, 1000001100, 0001001100, 1001001100, 0000000110, 1000000110, 0100000100, 0101000100, 0100000110, 0000100000, 0000110000, 0010100000, 0010110000, 0000101000, 0000111000, 0010101000, 0010111000, 0000100010, 0010100010, 0100100000, 0100110000, 0100100010, 0000100100, 0000101100, 0000100110, 0100100100, 0100100110, 0000000001, 0000010001, 1000000001, 0001000001, 0001010001, 1001000001, 0010000001, 0010010001, 1010000001, 0000000011, 1000000011, 0010000011, 1010000011, 0100000001, 0100010001, 0101000001, 0101010001, 0100000011}
+```
diff --git a/docs/make.jl b/docs/make.jl
@@ -38,6 +38,7 @@ makedocs(;
             "Coloring problem" => "tutorials/Coloring.md",
             "Other problems" => "tutorials/Others.md",
         ],
+        "Performance Tips" => "performancetips.md",
         "References" => "ref.md",
     ],
     doctest=false,
diff --git a/docs/src/performancetips.md b/docs/src/performancetips.md
@@ -0,0 +1,73 @@
+# Performance Tips
+
+## Optimize tensor network contraction order
+```julia
+julia> using GraphTensorNetworks, Graphs, Random
+
+julia> graph = random_regular_graph(120, 3)
+{120, 180} undirected simple Int64 graph
+
+julia> problem = IndependentSet(graph; optimizer=TreeSA(sc_target=20, sc_weight=1.0, rw_weight=3.0, ntrials=10, βs=0.01:0.1:15.0, niters=20), simplifier=MergeGreedy());
+```
+
+Key word argument `optimizer` decides the contraction order optimizer of the tensor network.
+Here, we choose the `TreeSA` optimizer to optimize the tensor network contraction tree; it is a local search based algorithm.
+It is one of the state of the art tensor network contraction order optimizers, one may check [arXiv: 2108.05665](https://arxiv.org/abs/2108.05665) to learn more about the algorithm.
+Other optimizers include
+* [`GreedyMethod`](@ref) (default, fastest in searching speed but worse in contraction order)
+* [`TreeSA`](@ref)
+* [`KaHyParBipartite`](@ref)
+* [`SABipartite`](@ref)
+
+One can type `?TreeSA` in a Julia REPL for more information about how to configure the hyper-parameters of `TreeSA` method.
+`simplifier` keyword argument is not so important, it is a preprocessing routine to improve the searching speed of the `optimizer`.
+
+The returned instance `problem` contains a field `code` that specifies the tensor network contraction order. For an independent set problem, the optimal contraction time and space complexity is ``2^{{\rm tw}(G)}``, where ``{\rm tw}(G)`` is the [tree-width](https://en.wikipedia.org/wiki/Treewidth) of ``G``.
+One can check the time, space and read-write complexity with the following function.
+
+```julia
+julia> timespacereadwrite_complexity(problem)
+```
+
+The return values are `log2` of the number of iterations, the number of elements in the largest tensor, and the number of read-write operations to tensor elements.
+
+## GEMM for Tropical numbers
+You can speed up the Tropical number matrix multiplication when computing `SizeMax()` by using the Tropical GEMM routines implemented in package [`TropicalGEMM.jl`](https://github.com/TensorBFS/TropicalGEMM.jl/).
+
+```julia
+julia> using BenchmarkTools
+
+julia> @btime solve(problem, SizeMax())
+  91.630 ms (19203 allocations: 23.72 MiB)
+0-dimensional Array{TropicalF64, 0}:
+53.0ₜ
+
+julia> using TropicalGEMM
+
+julia> @btime solve(problem, SizeMax())
+  8.960 ms (18532 allocations: 17.01 MiB)
+0-dimensional Array{TropicalF64, 0}:
+53.0ₜ
+```
+
+The `TropicalGEMM` pirates the `LinearAlgebra.mul!` interface, hence it takes effect upon using.
+The GEMM routine can speed up the computation on CPU by about one order of magnitude; with multi-threading, it can be even faster.
+Benchmark shows the performance of `TropicalGEMM` is close to the theoretical optimal value.
+
+## Make use of GPUs
+To upload the computation to GPU, you just need to add `using CUDA` and pass the keyword argument `usecuda=true`.
+```julia
+julia> using CUDA
+[ Info: OMEinsum loaded the CUDA module successfully
+
+julia> solve(problem, SizeMax(), usecuda=true)
+0-dimensional CuArray{TropicalF64, 0, CUDA.Mem.DeviceBuffer}:
+53.0ₜ
+```
+
+Properties that support the CUDA backend are
+* [`SizeMax`](@ref)
+* [`CountingAll`](@ref)
+* [`CountingMax`](@ref)
+* [`GraphPolynomial`](@ref)
+* [`SingleConfigMax`](@ref)
diff --git a/examples/IndependentSet.jl b/examples/IndependentSet.jl
@@ -36,22 +36,19 @@ show_graph(graph; locs=locations)
 # \end{matrix}\right)_{s_is_j}
 # ```
 # Let us contruct the problem instance with optimized tensor network contraction order as bellow.
-problem = IndependentSet(graph; optimizer=TreeSA(sc_weight=1.0, ntrials=10,
-                         βs=0.01:0.1:15.0, niters=20, rw_weight=0.2),
-                         simplifier=MergeGreedy());
+problem = IndependentSet(graph; optimizer=TreeSA());
 
 # In the input arguments of [`IndependentSet`](@ref), the `optimizer` is for optimizing the contraction orders.
-# Here we use the local search based optimizer in [arXiv:2108.05665](https://arxiv.org/abs/2108.05665).
-# If no optimizer is specified, the default fast (in terms of the speed of searching contraction order)
-# but worst (in term of contraction complexity) [`GreedyMethod`](@ref) will be used.
-# `simplifier` is a preprocessing routine to speed up the `optimizer`.
+# Here we use the local search based optimizer `TreeSA`.
 # The returned instance `problem` contains a field `code` that specifies the tensor network contraction order.
-# Its contraction time space complexity is ``2^{{\rm tw}(G)}``, where ``{\rm tw(G)}`` is the [tree-width](https://en.wikipedia.org/wiki/Treewidth) of ``G``.
+# The optimal contraction time and space complexity of an independent set problem is ``2^{{\rm tw}(G)}``,
+# where ``{\rm tw}(G)`` is the [tree-width](https://en.wikipedia.org/wiki/Treewidth) of ``G``.
 # One can check the time, space and read-write complexity with the following function.
 
 timespacereadwrite_complexity(problem)
 
 # The return values are `log2` of the the number of iterations, the number elements in the max tensor and the number of read-write operations to tensor elements.
+# For more information about the performance, please check the [Performance Tips](@ref).
 
 
 # ## Solving properties
diff --git a/examples/PaintShop.jl b/examples/PaintShop.jl
@@ -101,4 +101,4 @@ show_graph(graph; locs=locations, texts=string.(sequence), edge_colors=[sequence
 
 # Since we have different choices of initial color, the number of best solution is 4.
 # The following function will check the solution and return you the number of color switchs
-num_paint_shop_color_switch(sequence, painting2)
+num_paint_shop_color_switch(sequence, painting1)
diff --git a/src/configurations.jl b/src/configurations.jl
@@ -10,23 +10,19 @@ function best_solutions(gp::GraphProblem; all=false, usecuda=false)
     if all && usecuda
         throw(ArgumentError("ConfigEnumerator can not be computed on GPU!"))
     end
-    syms = symbols(gp)
-    T = (all ? set_type : sampler_type)(CountingTropical{Int64}, length(syms), nflavor(gp))
-    vertex_index = Dict([s=>i for (i, s) in enumerate(syms)])
     xst = generate_tensors(l->TropicalF64.(get_weights(gp, l)), gp)
     ymask = trues(fill(2, length(getiyv(gp.code)))...)
     if usecuda
         xst = CuArray.(xst)
         ymask = CuArray(ymask)
     end
     if all
-        xs = generate_tensors(l->_onehotv.(Ref(T), vertex_index[l], flavors(gp), get_weights(gp, l)), gp)
+        xs = generate_tensors(fx_solutions(gp, CountingTropical{Int64}, all), gp)
         return bounding_contract(AllConfigs{1}(), gp.code, xst, ymask, xs)
     else
         @assert ndims(ymask) == 0
         t, res = solution_ad(gp.code, xst, ymask)
-        N = length(vertex_index)
-        return fill(CountingTropical(asscalar(t).n, ConfigSampler(StaticBitVector(map(l->res[l], 1:N)))))
+        return fill(CountingTropical(asscalar(t).n, ConfigSampler(StaticBitVector(map(l->res[l], 1:length(res))))))
     end
 end
 
@@ -57,12 +53,9 @@ Finding optimal and suboptimal solutions.
 best2_solutions(gp::GraphProblem; all=true, usecuda=false) = solutions(gp, Max2Poly{Float64,Float64}; all=all, usecuda=usecuda)
 
 function bestk_solutions(gp::GraphProblem, k::Int)
-    syms = symbols(gp)
-    vertex_index = Dict([s=>i for (i, s) in enumerate(syms)])
     xst = generate_tensors(l->TropicalF64.(get_weights(gp, l)), gp)
     ymask = trues(fill(2, length(getiyv(gp.code)))...)
-    T = set_type(TruncatedPoly{k,Float64,Float64}, length(syms), nflavor(gp))
-    xs = generate_tensors(l->_onehotv.(Ref(T), vertex_index[l], flavors(gp), get_weights(gp, l)), gp)
+    xs = generate_tensors(fx_solutions(gp, TruncatedPoly{k,Float64,Float64}, true), gp)
     return bounding_contract(AllConfigs{k}(), gp.code, xst, ymask, xs)
 end