Skip to content

Commit 7024558

Browse files
giacomo-ciro, gciro-ifom, kounelisagis
authored
fix(ingestion): Pad sparse matrices to correct dimensions (#580)
Co-authored-by: gciro-ifom <giacomo.ciro@ifom.eu>
Co-authored-by: Agisilaos Kounelis <kounelisagis@gmail.com>
1 parent 101a608 commit 7024558

File tree

2 files changed

+95
-3
lines changed

2 files changed

+95
-3
lines changed

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,16 @@ def autodetect_source_type(source_uri: str) -> str:
409409
elif source_uri.endswith(".bvecs"):
410410
return "BVEC"
411411
else:
412-
return "TILEDB_ARRAY"
412+
# Check if it's a TileDB array and whether it's sparse or dense
413+
try:
414+
schema = tiledb.ArraySchema.load(source_uri)
415+
if schema.sparse:
416+
return "TILEDB_SPARSE_ARRAY"
417+
else:
418+
return "TILEDB_ARRAY"
419+
except Exception:
420+
# If we can't load the schema, assume it's a dense TileDB array
421+
return "TILEDB_ARRAY"
413422

414423
def read_source_metadata(
415424
source_uri: str, source_type: Optional[str] = None
@@ -946,15 +955,20 @@ def read_input_vectors(
946955
) as src_array:
947956
src_array_schema = src_array.schema
948957
data = src_array[start_pos:end_pos, 0:dimensions]
949-
return coo_matrix(
958+
959+
matrix = coo_matrix(
950960
(
951961
data[src_array_schema.attr(0).name],
952962
(
953963
data[src_array_schema.domain.dim(0).name] - start_pos,
954964
data[src_array_schema.domain.dim(1).name],
955965
),
956-
)
966+
),
967+
shape=(end_pos - start_pos, dimensions),
957968
).toarray()
969+
970+
return matrix
971+
958972
elif source_type == "TILEDB_PARTITIONED_ARRAY":
959973
with tiledb.open(
960974
source_uri, "r", timestamp=index_timestamp, config=config

apis/python/test/test_ingestion.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2183,3 +2183,81 @@ def test_dimensions_parameter_with_numpy_input(tmp_path):
21832183
distances_2, indices_2 = index_2.query(queries, k=k)
21842184
assert distances_2.shape == (nq, k)
21852185
assert indices_2.shape == (nq, k)
2186+
2187+
2188+
def test_sparse_array_ingestion_with_trailing_nulls(tmp_path):
    """
    Ingest a sparse array whose trailing columns are empty and check its width.

    Regression test for sparse sources being read with too few dimensions
    when the last columns contain no cells: a sparse array whose schema is
    10x100 could come back as 10x90 if columns 90-99 were all empty. The
    ingested index must report the schema's full dimensionality and must be
    queryable with a full-width vector.
    """
    # 10 vectors in a 100-column domain; only columns 0-89 receive cells.
    n_rows = 10
    n_cols = 100
    n_filled = 90

    data_root = os.path.join(tmp_path, "dataset")
    os.mkdir(data_root)
    array_uri = os.path.join(data_root, "sparse_data.tdb")

    # Sparse 2-D schema: (rows, cols) -> a single float32 attribute.
    row_dim = tiledb.Dim(
        name="rows", domain=(0, n_rows - 1), tile=10, dtype=np.int32
    )
    col_dim = tiledb.Dim(
        name="cols", domain=(0, n_cols - 1), tile=n_cols, dtype=np.int32
    )
    value_attr = tiledb.Attr(
        name="values", dtype=np.float32, var=False, nullable=False
    )
    sparse_schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dim, col_dim),
        attrs=[value_attr],
        sparse=True,
    )
    tiledb.Array.create(array_uri, sparse_schema)

    # Row-major list of the populated cells; cell (i, j) stores i * 100 + j.
    cells = [(i, j) for i in range(n_rows) for j in range(n_filled)]
    row_coords = [i for i, _ in cells]
    col_coords = [j for _, j in cells]
    cell_values = np.array(
        [float(i * 100 + j) for i, j in cells], dtype=np.float32
    )
    with tiledb.open(array_uri, "w") as A:
        A[row_coords, col_coords] = cell_values

    # Build a FLAT index straight from the sparse array.
    index_uri = os.path.join(tmp_path, "array")
    index = ingest(
        index_type="FLAT",
        index_uri=index_uri,
        source_uri=array_uri,
    )
    index.vacuum()

    # The index metadata must carry the full schema width, not the populated one.
    with tiledb.Group(index_uri, "r") as group:
        assert group.meta["dimensions"] == n_cols

    # Full-width query equal to row 0: values 0..89, then zeros in the empty columns.
    query = np.zeros((1, n_cols), dtype=np.float32)
    query[0, :n_filled] = np.arange(n_filled, dtype=np.float32)

    distances, indices = index.query(query, k=1)

    # Row 0 is an exact match, so it must be the nearest neighbour.
    assert indices[0][0] == 0

    # One result per query confirms the search ran without a dimension mismatch.
    assert len(distances[0]) == 1
    assert len(indices[0]) == 1

0 commit comments

Comments
 (0)