1+ # Licensed to the Apache Software Foundation (ASF) under one
2+ # or more contributor license agreements. See the NOTICE file
3+ # distributed with this work for additional information
4+ # regarding copyright ownership. The ASF licenses this file
5+ # to you under the Apache License, Version 2.0 (the
6+ # "License"); you may not use this file except in compliance
7+ # with the License. You may obtain a copy of the License at
8+ #
9+ # http://www.apache.org/licenses/LICENSE-2.0
10+ #
11+ # Unless required by applicable law or agreed to in writing, software
12+ # distributed under the License is distributed on an "AS IS" BASIS,
13+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+ # See the License for the specific language governing permissions and
15+ # limitations under the License.
16+
17+ """
18+ Arrow.jl Sparse Tensor Demo
19+
20+ This example demonstrates the usage of sparse tensor formats supported
21+ by Arrow.jl:
22+ - COO (Coordinate): General sparse tensor format
23+ - CSR/CSC (Compressed Sparse Row/Column): Efficient 2D sparse matrices
24+ - CSF (Compressed Sparse Fiber): Advanced N-dimensional sparse tensors
25+
26+ The demo shows construction, manipulation, and serialization of sparse tensors.
27+ """
28+
29+ using Arrow
30+ using SparseArrays
31+ using LinearAlgebra
32+
33+ println (" === Arrow.jl Sparse Tensor Demo ===\n " )
34+
35+ # ============================================================================
36+ # COO (Coordinate) Format Demo
37+ # ============================================================================
38+ println (" 1. COO (Coordinate) Format" )
39+ println (" - General purpose sparse tensor format" )
40+ println (" - Stores explicit coordinates and values for each non-zero element" )
41+ println ()
42+
43+ # Create a 4×4 sparse matrix with some non-zero elements
44+ println (" Creating a 4×4 sparse matrix:" )
45+ indices = [1 2 3 4 2 ; 1 2 3 1 4 ] # 2×5 matrix: coordinates (row, col)
46+ data = [1.0 , 4.0 , 9.0 , 2.0 , 8.0 ] # Values at those coordinates
47+ shape = (4 , 4 )
48+
49+ coo_tensor = Arrow. SparseTensorCOO {Float64,2} (indices, data, shape)
50+ println (" COO Tensor: $coo_tensor " )
51+ println (" Matrix representation:" )
52+ for i in 1 : 4
53+ row = [coo_tensor[i, j] for j in 1 : 4 ]
54+ println (" $row " )
55+ end
56+ println (" Non-zero elements: $(Arrow. nnz (coo_tensor)) " )
57+ println ()
58+
59+ # Demonstrate 3D COO tensor
60+ println (" Creating a 3×3×3 sparse 3D tensor:" )
61+ indices_3d = [1 2 3 1 ; 1 2 1 3 ; 1 1 3 3 ] # 3×4 matrix
62+ data_3d = [1.0 , 2.0 , 3.0 , 4.0 ]
63+ shape_3d = (3 , 3 , 3 )
64+
65+ coo_3d = Arrow. SparseTensorCOO {Float64,3} (indices_3d, data_3d, shape_3d)
66+ println (" 3D COO Tensor: $coo_3d " )
67+ println (" Sample elements:" )
68+ println (" [1,1,1] = $(coo_3d[1 ,1 ,1 ]) " )
69+ println (" [2,2,1] = $(coo_3d[2 ,2 ,1 ]) " )
70+ println (" [1,1,3] = $(coo_3d[1 ,1 ,3 ]) " )
71+ println (" [1,2,2] = $(coo_3d[1 ,2 ,2 ]) (zero element)" )
72+ println ()
73+
74+ # ============================================================================
75+ # CSR/CSC (Compressed Sparse Row/Column) Format Demo
76+ # ============================================================================
77+ println (" 2. CSX (Compressed Sparse Row/Column) Format" )
78+ println (" - Efficient for 2D sparse matrices" )
79+ println (" - CSR compresses rows, CSC compresses columns" )
80+ println ()
81+
82+ # Create the same 4×4 matrix in CSR format
83+ println (" Same 4×4 matrix in CSR (Compressed Sparse Row) format:" )
84+ # Matrix: [1.0 0 0 0 ]
85+ # [0 4.0 0 8.0]
86+ # [0 0 9.0 0 ]
87+ # [2.0 0 0 0 ]
88+ indptr_csr = [1 , 2 , 4 , 5 , 6 ] # Row pointers: where each row starts in data/indices
89+ indices_csr = [1 , 2 , 4 , 3 , 1 ] # Column indices for each value
90+ data_csr = [1.0 , 4.0 , 8.0 , 9.0 , 2.0 ]
91+
92+ csr_tensor = Arrow. SparseTensorCSX {Float64} (indptr_csr, indices_csr, data_csr, (4 , 4 ), :row )
93+ println (" CSR Tensor: $csr_tensor " )
94+ println (" Matrix representation:" )
95+ for i in 1 : 4
96+ row = [csr_tensor[i, j] for j in 1 : 4 ]
97+ println (" $row " )
98+ end
99+ println ()
100+
101+ # Create the same matrix in CSC format
102+ println (" Same matrix in CSC (Compressed Sparse Column) format:" )
103+ indptr_csc = [1 , 3 , 4 , 5 , 6 ] # Column pointers
104+ indices_csc = [1 , 4 , 2 , 3 , 2 ] # Row indices for each value
105+ data_csc = [1.0 , 2.0 , 4.0 , 9.0 , 8.0 ]
106+
107+ csc_tensor = Arrow. SparseTensorCSX {Float64} (indptr_csc, indices_csc, data_csc, (4 , 4 ), :col )
108+ println (" CSC Tensor: $csc_tensor " )
109+
110+ # Verify both formats give same results
111+ println (" Verification - CSR and CSC should give same values:" )
112+ println (" CSR[2,2] = $(csr_tensor[2 ,2 ]) , CSC[2,2] = $(csc_tensor[2 ,2 ]) " )
113+ println (" CSR[2,4] = $(csr_tensor[2 ,4 ]) , CSC[2,4] = $(csc_tensor[2 ,4 ]) " )
114+ println ()
115+
116+ # ============================================================================
117+ # Integration with Julia SparseArrays
118+ # ============================================================================
119+ println (" 3. Integration with Julia SparseArrays" )
120+ println (" - Convert Julia SparseMatrixCSC to Arrow sparse tensors" )
121+ println ()
122+
123+ # Create a Julia sparse matrix
124+ println (" Creating Julia SparseMatrixCSC:" )
125+ I_julia = [1 , 3 , 2 , 4 , 2 ]
126+ J_julia = [1 , 3 , 2 , 1 , 4 ]
127+ V_julia = [10.0 , 30.0 , 20.0 , 40.0 , 25.0 ]
128+ julia_sparse = sparse (I_julia, J_julia, V_julia, 4 , 4 )
129+ println (" Julia sparse matrix:" )
130+ display (julia_sparse)
131+ println ()
132+
133+ # Convert to Arrow COO format
134+ println (" Converting to Arrow COO format:" )
135+ coo_from_julia = Arrow. SparseTensorCOO (julia_sparse)
136+ println (" Arrow COO: $coo_from_julia " )
137+ println (" Verification - [3,3] = $(coo_from_julia[3 ,3 ]) (should be 30.0)" )
138+ println ()
139+
140+ # Convert to Arrow CSC format (natural fit)
141+ println (" Converting to Arrow CSC format:" )
142+ csc_from_julia = Arrow. SparseTensorCSX (julia_sparse, :col )
143+ println (" Arrow CSC: $csc_from_julia " )
144+ println ()
145+
146+ # Convert to Arrow CSR format
147+ println (" Converting to Arrow CSR format:" )
148+ csr_from_julia = Arrow. SparseTensorCSX (julia_sparse, :row )
149+ println (" Arrow CSR: $csr_from_julia " )
150+ println ()
151+
152+ # ============================================================================
153+ # CSF (Compressed Sparse Fiber) Format Demo
154+ # ============================================================================
155+ println (" 4. CSF (Compressed Sparse Fiber) Format" )
156+ println (" - Most advanced format for high-dimensional sparse tensors" )
157+ println (" - Provides excellent compression for structured sparse data" )
158+ println ()
159+
160+ # Create a simple 3D CSF tensor (simplified structure)
161+ println (" Creating a 2×2×2 CSF tensor:" )
162+ indices_buffers_csf = [
163+ [1 , 2 ], # Indices for dimension 1
164+ [1 , 2 ], # Indices for dimension 2
165+ [1 , 2 ] # Indices for dimension 3
166+ ]
167+ indptr_buffers_csf = [
168+ [1 , 2 , 3 ], # Pointers for level 0
169+ [1 , 2 , 3 ] # Pointers for level 1
170+ ]
171+ data_csf = [100.0 , 200.0 ]
172+ shape_csf = (2 , 2 , 2 )
173+
174+ csf_tensor = Arrow. SparseTensorCSF {Float64,3} (indices_buffers_csf, indptr_buffers_csf, data_csf, shape_csf)
175+ println (" CSF Tensor: $csf_tensor " )
176+ println (" Note: CSF format is complex - this is a simplified demonstration" )
177+ println ()
178+
179+ # ============================================================================
180+ # Serialization and Metadata Demo
181+ # ============================================================================
182+ println (" 5. Serialization and Metadata" )
183+ println (" - Sparse tensors can be serialized with format metadata" )
184+ println ()
185+
186+ # Generate metadata for different formats
187+ println (" COO metadata:" )
188+ coo_metadata = Arrow. sparse_tensor_metadata (coo_tensor)
189+ println (" $coo_metadata " )
190+ println ()
191+
192+ println (" CSR metadata:" )
193+ csr_metadata = Arrow. sparse_tensor_metadata (csr_tensor)
194+ println (" $csr_metadata " )
195+ println ()
196+
197+ # Demonstrate serialization round-trip
198+ println (" Serialization round-trip test:" )
199+ buffers, metadata = Arrow. serialize_sparse_tensor (coo_tensor)
200+ reconstructed = Arrow. deserialize_sparse_tensor (buffers, metadata, Float64)
201+ println (" Original: $coo_tensor " )
202+ println (" Reconstructed: $reconstructed " )
203+ println (" Round-trip successful: $(reconstructed[1 ,1 ] == coo_tensor[1 ,1 ] && Arrow. nnz (reconstructed) == Arrow. nnz (coo_tensor)) " )
204+ println ()
205+
206+ # ============================================================================
207+ # Performance and Sparsity Analysis
208+ # ============================================================================
209+ println (" 6. Performance and Sparsity Analysis" )
210+ println (" - Demonstrate efficiency gains with sparse storage" )
211+ println ()
212+
213+ # Create a large sparse matrix
214+ println (" Creating a large sparse matrix (1000×1000 with 0.1% non-zeros):" )
215+ n = 1000
216+ nnz_count = div (n * n, 1000 ) # 0.1% density
217+
218+ # Generate random sparse data
219+ Random. seed! (42 ) # For reproducible results
220+ using Random
221+ rows = rand (1 : n, nnz_count)
222+ cols = rand (1 : n, nnz_count)
223+ vals = rand (Float64, nnz_count)
224+
225+ # Remove duplicates by creating a dictionary
226+ sparse_dict = Dict {Tuple{Int,Int}, Float64} ()
227+ for (r, c, v) in zip (rows, cols, vals)
228+ sparse_dict[(r, c)] = v
229+ end
230+
231+ # Convert back to arrays
232+ coords = collect (keys (sparse_dict))
233+ values = collect (values (sparse_dict))
234+ actual_nnz = length (values)
235+
236+ indices_large = [getindex .(coords, 1 ) getindex .(coords, 2 )]' # 2×nnz matrix
237+ large_coo = Arrow. SparseTensorCOO {Float64,2} (indices_large, values, (n, n))
238+
239+ println (" Large COO tensor: $(large_coo) " )
240+ total_elements = n * n
241+ stored_elements = actual_nnz
242+ memory_saved = total_elements - stored_elements
243+ compression_ratio = total_elements / stored_elements
244+
245+ println (" Storage analysis:" )
246+ println (" Total elements: $(total_elements) " )
247+ println (" Stored elements: $(stored_elements) " )
248+ println (" Memory saved: $(memory_saved) elements" )
249+ println (" Compression ratio: $(round (compression_ratio, digits= 2 )) x" )
250+ println (" Storage efficiency: $(round ((1 - stored_elements/ total_elements) * 100 , digits= 2 )) %" )
251+ println ()
252+
253+ println (" === Demo Complete ===" )
254+ println (" Sparse tensors provide efficient storage and computation for" )
255+ println (" data where most elements are zero, with significant memory" )
256+ println (" savings and computational advantages for appropriate workloads." )
0 commit comments