@@ -2183,3 +2183,81 @@ def test_dimensions_parameter_with_numpy_input(tmp_path):
21832183 distances_2 , indices_2 = index_2 .query (queries , k = k )
21842184 assert distances_2 .shape == (nq , k )
21852185 assert indices_2 .shape == (nq , k )
2186+
2187+
def test_sparse_array_ingestion_with_trailing_nulls(tmp_path):
    """
    Check that a sparse source with empty trailing columns keeps its full width.

    Regression test for a bug where a sparse matrix whose last columns hold no
    cells was read with a truncated shape: an array declared as 10x100 could
    come back as 10x90 when columns 90-99 were entirely empty. The index built
    from such an array must record the schema's dimensionality and accept
    full-width query vectors.
    """
    # Declared shape of the source array vs. the columns that actually get data.
    num_vectors = 10
    dimensions = 100
    populated_dimensions = 90

    dataset_dir = os.path.join(tmp_path, "dataset")
    os.mkdir(dataset_dir)
    sparse_array_uri = os.path.join(dataset_dir, "sparse_data.tdb")

    # Two-dimensional sparse schema: "rows" spans the vectors, "cols" spans
    # the full declared dimensionality.
    row_dim = tiledb.Dim(
        name="rows", domain=(0, num_vectors - 1), tile=10, dtype=np.int32
    )
    col_dim = tiledb.Dim(
        name="cols", domain=(0, dimensions - 1), tile=dimensions, dtype=np.int32
    )
    values_attr = tiledb.Attr(
        name="values", dtype=np.float32, var=False, nullable=False
    )
    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dim, col_dim),
        attrs=[values_attr],
        sparse=True,
    )
    tiledb.Array.create(sparse_array_uri, schema)

    # Row-major list of the cells to write; columns populated_dimensions and
    # above are deliberately never touched.
    cells = [
        (i, j) for i in range(num_vectors) for j in range(populated_dimensions)
    ]
    rows = [i for i, _ in cells]
    cols = [j for _, j in cells]
    values = [float(i * 100 + j) for i, j in cells]

    with tiledb.open(sparse_array_uri, "w") as A:
        A[rows, cols] = np.array(values, dtype=np.float32)

    # Build a FLAT index straight from the sparse array.
    index_uri = os.path.join(tmp_path, "array")
    index = ingest(
        index_type="FLAT",
        index_uri=index_uri,
        source_uri=sparse_array_uri,
    )
    index.vacuum()

    # The stored dimensionality must be the schema width, not the populated one.
    with tiledb.Group(index_uri, "r") as group:
        assert group.meta["dimensions"] == dimensions

    # Full-width query whose populated prefix equals vector 0 (i * 100 + j
    # with i == 0); the trailing entries stay zero.
    query = np.zeros((1, dimensions), dtype=np.float32)
    query[0, :populated_dimensions] = np.arange(
        populated_dimensions, dtype=np.float32
    )

    distances, indices = index.query(query, k=1)

    # Nearest neighbour should be vector 0.
    assert indices[0][0] == 0

    # One result per query, i.e. the query ran without a dimension mismatch.
    assert len(distances[0]) == 1
    assert len(indices[0]) == 1
0 commit comments