Skip to content

Commit fe1e32b

Browse files
committed
Improve: Compare Levenshtein to CuDF
Currently shows a ~15x perf gap on CUDA (StringZilla vs. CuDF)?!
1 parent da974f5 commit fe1e32b

File tree

3 files changed

+78
-31
lines changed

3 files changed

+78
-31
lines changed

README.md

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -244,15 +244,16 @@ __Who is this for?__
244244
<td align="center">⚪</td>
245245
<td align="center">⚪</td>
246246
<td align="center">
247-
via <code>NLTK</code> <sup>3</sup><br/>
248-
<span style="color:#ABABAB;">x86:</span> <b>2,490,161</b> &centerdot;
249-
<span style="color:#ABABAB;">arm:</span> <b>2,081,543</b> CUPS
247+
via <code>NLTK</code> <sup>3</sup> and <code>CuDF</code><br/>
248+
<span style="color:#ABABAB;">x86:</span> <b>1,615,306</b> &centerdot;
249+
<span style="color:#ABABAB;">arm:</span> <b>1,349,980</b> &centerdot;
250+
<span style="color:#ABABAB;">cuda:</span> <b>6,532,411,354</b> CUPS
250251
</td>
251252
<td align="center">
252253
<code>szs_levenshtein_distances_t</code><br/>
253-
<span style="color:#ABABAB;">x86:</span> <b>78,851,644</b> &centerdot;
254-
<span style="color:#ABABAB;">arm:</span> <b>36,857,367</b> &centerdot;
255-
<span style="color:#ABABAB;">cuda:</span> <b>3,369,569,512</b> CUPS
254+
<span style="color:#ABABAB;">x86:</span> <b>3,434,427,548</b> &centerdot;
255+
<span style="color:#ABABAB;">arm:</span> <b>1,605,340,403</b> &centerdot;
256+
<span style="color:#ABABAB;">cuda:</span> <b>93,662,026,653</b> CUPS
256257
</td>
257258
</tr>
258259
<!-- Alignment Score -->

scripts/bench_similarities.py

Lines changed: 70 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# "biopython",
1313
# "numpy",
1414
# "tqdm",
15+
# "cudf",
1516
# ]
1617
# ///
1718
"""
@@ -26,6 +27,7 @@
2627
- nltk: Natural language toolkit distances
2728
- edlib: Fast sequence alignment
2829
- biopython: Needleman-Wunsch alignment with BLOSUM matrices
30+
- cudf: GPU-accelerated RAPIDS edit distance
2931
3032
Example usage via UV:
3133
@@ -89,6 +91,14 @@
8991
except ImportError:
9092
BIOPYTHON_AVAILABLE = False
9193

94+
# For RAPIDS cuDF GPU-accelerated edit distance
95+
try:
96+
import cudf
97+
98+
CUDF_AVAILABLE = True
99+
except ImportError:
100+
CUDF_AVAILABLE = False
101+
92102
# Global state for initialized models
93103
_biopython_aligner = None
94104
_blosum_matrix = None
@@ -218,6 +228,7 @@ def benchmark_third_party_edit_distances(
218228
string_pairs: List[Tuple[str, str]],
219229
timeout_seconds: int = 10,
220230
filter_pattern: Optional[re.Pattern] = None,
231+
batch_size: int = 2048,
221232
):
222233
"""Benchmark various edit distance implementations."""
223234

@@ -312,6 +323,26 @@ def kernel(a: str, b: str) -> int:
312323
is_utf8=False, # Binary/bytes
313324
)
314325

326+
# cuDF edit_distance
327+
if name_matches(f"cudf.edit_distance(batch={batch_size})", filter_pattern) and CUDF_AVAILABLE:
328+
329+
def batch_kernel(a_list: List[str], b_list: List[str]) -> List[int]:
330+
# Create cuDF Series from string lists
331+
s1 = cudf.Series(a_list)
332+
s2 = cudf.Series(b_list)
333+
# Compute edit distances on the GPU and return them as a NumPy array
334+
results = s1.str.edit_distance(s2)
335+
return results.to_arrow().to_numpy()
336+
337+
log_similarity_operation(
338+
f"cudf.edit_distance(batch={batch_size})",
339+
string_pairs,
340+
batch_kernel,
341+
timeout_seconds=timeout_seconds,
342+
batch_size=batch_size,
343+
is_utf8=True, # UTF-8 codepoints
344+
)
345+
315346

316347
def benchmark_stringzillas_edit_distances(
317348
string_pairs: List[Tuple[str, str]],
@@ -516,8 +547,12 @@ def benchmark_stringzillas_similarity_scores(
516547
# Single-input variants on 1 CPU core
517548
if name_matches(f"{szs_name}(1xCPU)", filter_pattern):
518549

519-
engine = szs_class(capabilities=default_scope, substitution_matrix=blosum,
520-
open=-10, extend=-2) # Same gap costs as BioPython
550+
engine = szs_class(
551+
capabilities=default_scope,
552+
substitution_matrix=blosum,
553+
open=-10,
554+
extend=-2,
555+
) # Same gap costs as BioPython
521556

522557
def kernel(a: str, b: str) -> int:
523558
a_array = sz.Strs([a])
@@ -536,8 +571,9 @@ def kernel(a: str, b: str) -> int:
536571
# Single-input variants on all CPU cores
537572
if name_matches(f"{szs_name}({cpu_cores}xCPU)", filter_pattern):
538573

539-
engine = szs_class(capabilities=cpu_scope, substitution_matrix=blosum,
540-
open=-10, extend=-2) # Same gap costs as BioPython
574+
engine = szs_class(
575+
capabilities=cpu_scope, substitution_matrix=blosum, open=-10, extend=-2
576+
) # Same gap costs as BioPython
541577

542578
def kernel(a: str, b: str) -> int:
543579
a_array = sz.Strs([a])
@@ -556,8 +592,9 @@ def kernel(a: str, b: str) -> int:
556592
# Single-input variants on GPU
557593
if name_matches(f"{szs_name}(1xGPU)", filter_pattern) and gpu_scope is not None:
558594

559-
engine = szs_class(capabilities=gpu_scope, substitution_matrix=blosum,
560-
open=-10, extend=-2) # Same gap costs as BioPython
595+
engine = szs_class(
596+
capabilities=gpu_scope, substitution_matrix=blosum, open=-10, extend=-2
597+
) # Same gap costs as BioPython
561598

562599
def kernel(a: str, b: str) -> int:
563600
a_array = sz.Strs([a])
@@ -576,8 +613,9 @@ def kernel(a: str, b: str) -> int:
576613
# Batch-input variants on 1 CPU core
577614
if name_matches(f"{szs_name}(1xCPU,batch={batch_size})", filter_pattern):
578615

579-
engine = szs_class(capabilities=default_scope, substitution_matrix=blosum,
580-
open=-10, extend=-2) # Same gap costs as BioPython
616+
engine = szs_class(
617+
capabilities=default_scope, substitution_matrix=blosum, open=-10, extend=-2
618+
) # Same gap costs as BioPython
581619

582620
def kernel(a_list: List[str], b_list: List[str]) -> List[int]:
583621
a_array = sz.Strs(a_list)
@@ -596,8 +634,9 @@ def kernel(a_list: List[str], b_list: List[str]) -> List[int]:
596634
# Batch-input variants on all CPU cores
597635
if name_matches(f"{szs_name}({cpu_cores}xCPU,batch={batch_size})", filter_pattern):
598636

599-
engine = szs_class(capabilities=cpu_scope, substitution_matrix=blosum,
600-
open=-10, extend=-2) # Same gap costs as BioPython
637+
engine = szs_class(
638+
capabilities=cpu_scope, substitution_matrix=blosum, open=-10, extend=-2
639+
) # Same gap costs as BioPython
601640

602641
def kernel(a_list: List[str], b_list: List[str]) -> List[int]:
603642
a_array = sz.Strs(a_list)
@@ -616,8 +655,9 @@ def kernel(a_list: List[str], b_list: List[str]) -> List[int]:
616655
# Batch-input variants on GPU
617656
if name_matches(f"{szs_name}(1xGPU,batch={batch_size})", filter_pattern) and gpu_scope is not None:
618657

619-
engine = szs_class(capabilities=gpu_scope, substitution_matrix=blosum,
620-
open=-10, extend=-2) # Same gap costs as BioPython
658+
engine = szs_class(
659+
capabilities=gpu_scope, substitution_matrix=blosum, open=-10, extend=-2
660+
) # Same gap costs as BioPython
621661

622662
def kernel(a_list: List[str], b_list: List[str]) -> List[int]:
623663
a_array = sz.Strs(a_list)
@@ -670,21 +710,26 @@ def bench(
670710
print()
671711

672712
print("=== Edit Distance Benchmarks ===")
673-
benchmark_third_party_edit_distances(pairs, timeout_seconds, filter_pattern)
713+
benchmark_third_party_edit_distances(
714+
pairs,
715+
timeout_seconds=timeout_seconds,
716+
filter_pattern=filter_pattern,
717+
batch_size=batch_size,
718+
)
674719
benchmark_stringzillas_edit_distances(
675720
pairs,
676-
timeout_seconds,
677-
batch_size,
678-
filter_pattern,
721+
timeout_seconds=timeout_seconds,
722+
batch_size=batch_size,
723+
filter_pattern=filter_pattern,
679724
szs_class=szs.LevenshteinDistances,
680725
szs_name="szs.LevenshteinDistances",
681726
is_utf8=False,
682727
)
683728
benchmark_stringzillas_edit_distances(
684729
pairs,
685-
timeout_seconds,
686-
batch_size,
687-
filter_pattern,
730+
timeout_seconds=timeout_seconds,
731+
batch_size=batch_size,
732+
filter_pattern=filter_pattern,
688733
szs_class=szs.LevenshteinDistancesUTF8,
689734
szs_name="szs.LevenshteinDistancesUTF8",
690735
is_utf8=True,
@@ -696,17 +741,17 @@ def bench(
696741
benchmark_third_party_similarity_scores(pairs, timeout_seconds, filter_pattern)
697742
benchmark_stringzillas_similarity_scores(
698743
pairs,
699-
timeout_seconds,
700-
batch_size,
701-
filter_pattern,
744+
timeout_seconds=timeout_seconds,
745+
batch_size=batch_size,
746+
filter_pattern=filter_pattern,
702747
szs_class=szs.NeedlemanWunschScores,
703748
szs_name="szs.NeedlemanWunschScores",
704749
)
705750
benchmark_stringzillas_similarity_scores(
706751
pairs,
707-
timeout_seconds,
708-
batch_size,
709-
filter_pattern,
752+
timeout_seconds=timeout_seconds,
753+
batch_size=batch_size,
754+
filter_pattern=filter_pattern,
710755
szs_class=szs.SmithWatermanScores,
711756
szs_name="szs.SmithWatermanScores",
712757
)

scripts/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ distance # https://github.com/doukremt/distance
2525
polyleven # https://github.com/fujimotos/polyleven
2626
edlib # https://github.com/Martinsos/edlib
2727
nltk # https://github.com/nltk/nltk
28+
cudf-cu12 # https://github.com/rapidsai/cudf
2829

2930
# For Needleman-Wunsch and Smith-Waterman algorithms with custom scoring matrices:
3031
biopython # https://github.com/biopython/biopython

0 commit comments

Comments
 (0)