1212# "biopython",
1313# "numpy",
1414# "tqdm",
15+ # "cudf",
1516# ]
1617# ///
1718"""
2627- nltk: Natural language toolkit distances
2728- edlib: Fast sequence alignment
2829- biopython: Needleman-Wunsch alignment with BLOSUM matrices
30+ - cudf: GPU-accelerated RAPIDS edit distance
2931
3032Example usage via UV:
3133
8991except ImportError :
9092 BIOPYTHON_AVAILABLE = False
9193
94+ # Optional dependency: RAPIDS cuDF supplies the GPU-accelerated edit distance; CUDF_AVAILABLE records whether the import succeeded
95+ try :
96+ import cudf
97+
98+ CUDF_AVAILABLE = True
99+ except ImportError :
100+ CUDF_AVAILABLE = False
101+
92102# Global state for initialized models
93103_biopython_aligner = None
94104_blosum_matrix = None
@@ -218,6 +228,7 @@ def benchmark_third_party_edit_distances(
218228 string_pairs : List [Tuple [str , str ]],
219229 timeout_seconds : int = 10 ,
220230 filter_pattern : Optional [re .Pattern ] = None ,
231+ batch_size : int = 2048 ,
221232):
222233 """Benchmark various edit distance implementations."""
223234
@@ -312,6 +323,26 @@ def kernel(a: str, b: str) -> int:
312323 is_utf8 = False , # Binary/bytes
313324 )
314325
326+ # cuDF edit_distance
327+ if name_matches (f"cudf.edit_distance(batch={ batch_size } )" , filter_pattern ) and CUDF_AVAILABLE :
328+
329+ def batch_kernel (a_list : List [str ], b_list : List [str ]) -> List [int ]:
330+ # Wrap both Python string lists in cuDF Series
331+ s1 = cudf .Series (a_list )
332+ s2 = cudf .Series (b_list )
333+ # Compute edit distances of s1 against s2 and hand them back as a NumPy array via Arrow (not a Python list, despite the List[int] annotation)
334+ results = s1 .str .edit_distance (s2 )
335+ return results .to_arrow ().to_numpy ()
336+
337+ log_similarity_operation (
338+ f"cudf.edit_distance(batch={ batch_size } )" ,
339+ string_pairs ,
340+ batch_kernel ,
341+ timeout_seconds = timeout_seconds ,
342+ batch_size = batch_size ,
343+ is_utf8 = True , # UTF-8 codepoints
344+ )
345+
315346
316347def benchmark_stringzillas_edit_distances (
317348 string_pairs : List [Tuple [str , str ]],
@@ -516,8 +547,12 @@ def benchmark_stringzillas_similarity_scores(
516547 # Single-input variants on 1 CPU core
517548 if name_matches (f"{ szs_name } (1xCPU)" , filter_pattern ):
518549
519- engine = szs_class (capabilities = default_scope , substitution_matrix = blosum ,
520- open = - 10 , extend = - 2 ) # Same gap costs as BioPython
550+ engine = szs_class (
551+ capabilities = default_scope ,
552+ substitution_matrix = blosum ,
553+ open = - 10 ,
554+ extend = - 2 ,
555+ ) # Same gap costs as BioPython
521556
522557 def kernel (a : str , b : str ) -> int :
523558 a_array = sz .Strs ([a ])
@@ -536,8 +571,9 @@ def kernel(a: str, b: str) -> int:
536571 # Single-input variants on all CPU cores
537572 if name_matches (f"{ szs_name } ({ cpu_cores } xCPU)" , filter_pattern ):
538573
539- engine = szs_class (capabilities = cpu_scope , substitution_matrix = blosum ,
540- open = - 10 , extend = - 2 ) # Same gap costs as BioPython
574+ engine = szs_class (
575+ capabilities = cpu_scope , substitution_matrix = blosum , open = - 10 , extend = - 2
576+ ) # Same gap costs as BioPython
541577
542578 def kernel (a : str , b : str ) -> int :
543579 a_array = sz .Strs ([a ])
@@ -556,8 +592,9 @@ def kernel(a: str, b: str) -> int:
556592 # Single-input variants on GPU
557593 if name_matches (f"{ szs_name } (1xGPU)" , filter_pattern ) and gpu_scope is not None :
558594
559- engine = szs_class (capabilities = gpu_scope , substitution_matrix = blosum ,
560- open = - 10 , extend = - 2 ) # Same gap costs as BioPython
595+ engine = szs_class (
596+ capabilities = gpu_scope , substitution_matrix = blosum , open = - 10 , extend = - 2
597+ ) # Same gap costs as BioPython
561598
562599 def kernel (a : str , b : str ) -> int :
563600 a_array = sz .Strs ([a ])
@@ -576,8 +613,9 @@ def kernel(a: str, b: str) -> int:
576613 # Batch-input variants on 1 CPU core
577614 if name_matches (f"{ szs_name } (1xCPU,batch={ batch_size } )" , filter_pattern ):
578615
579- engine = szs_class (capabilities = default_scope , substitution_matrix = blosum ,
580- open = - 10 , extend = - 2 ) # Same gap costs as BioPython
616+ engine = szs_class (
617+ capabilities = default_scope , substitution_matrix = blosum , open = - 10 , extend = - 2
618+ ) # Same gap costs as BioPython
581619
582620 def kernel (a_list : List [str ], b_list : List [str ]) -> List [int ]:
583621 a_array = sz .Strs (a_list )
@@ -596,8 +634,9 @@ def kernel(a_list: List[str], b_list: List[str]) -> List[int]:
596634 # Batch-input variants on all CPU cores
597635 if name_matches (f"{ szs_name } ({ cpu_cores } xCPU,batch={ batch_size } )" , filter_pattern ):
598636
599- engine = szs_class (capabilities = cpu_scope , substitution_matrix = blosum ,
600- open = - 10 , extend = - 2 ) # Same gap costs as BioPython
637+ engine = szs_class (
638+ capabilities = cpu_scope , substitution_matrix = blosum , open = - 10 , extend = - 2
639+ ) # Same gap costs as BioPython
601640
602641 def kernel (a_list : List [str ], b_list : List [str ]) -> List [int ]:
603642 a_array = sz .Strs (a_list )
@@ -616,8 +655,9 @@ def kernel(a_list: List[str], b_list: List[str]) -> List[int]:
616655 # Batch-input variants on GPU
617656 if name_matches (f"{ szs_name } (1xGPU,batch={ batch_size } )" , filter_pattern ) and gpu_scope is not None :
618657
619- engine = szs_class (capabilities = gpu_scope , substitution_matrix = blosum ,
620- open = - 10 , extend = - 2 ) # Same gap costs as BioPython
658+ engine = szs_class (
659+ capabilities = gpu_scope , substitution_matrix = blosum , open = - 10 , extend = - 2
660+ ) # Same gap costs as BioPython
621661
622662 def kernel (a_list : List [str ], b_list : List [str ]) -> List [int ]:
623663 a_array = sz .Strs (a_list )
@@ -670,21 +710,26 @@ def bench(
670710 print ()
671711
672712 print ("=== Edit Distance Benchmarks ===" )
673- benchmark_third_party_edit_distances (pairs , timeout_seconds , filter_pattern )
713+ benchmark_third_party_edit_distances (
714+ pairs ,
715+ timeout_seconds = timeout_seconds ,
716+ filter_pattern = filter_pattern ,
717+ batch_size = batch_size ,
718+ )
674719 benchmark_stringzillas_edit_distances (
675720 pairs ,
676- timeout_seconds ,
677- batch_size ,
678- filter_pattern ,
721+ timeout_seconds = timeout_seconds ,
722+ batch_size = batch_size ,
723+ filter_pattern = filter_pattern ,
679724 szs_class = szs .LevenshteinDistances ,
680725 szs_name = "szs.LevenshteinDistances" ,
681726 is_utf8 = False ,
682727 )
683728 benchmark_stringzillas_edit_distances (
684729 pairs ,
685- timeout_seconds ,
686- batch_size ,
687- filter_pattern ,
730+ timeout_seconds = timeout_seconds ,
731+ batch_size = batch_size ,
732+ filter_pattern = filter_pattern ,
688733 szs_class = szs .LevenshteinDistancesUTF8 ,
689734 szs_name = "szs.LevenshteinDistancesUTF8" ,
690735 is_utf8 = True ,
@@ -696,17 +741,17 @@ def bench(
696741 benchmark_third_party_similarity_scores (pairs , timeout_seconds , filter_pattern )
697742 benchmark_stringzillas_similarity_scores (
698743 pairs ,
699- timeout_seconds ,
700- batch_size ,
701- filter_pattern ,
744+ timeout_seconds = timeout_seconds ,
745+ batch_size = batch_size ,
746+ filter_pattern = filter_pattern ,
702747 szs_class = szs .NeedlemanWunschScores ,
703748 szs_name = "szs.NeedlemanWunschScores" ,
704749 )
705750 benchmark_stringzillas_similarity_scores (
706751 pairs ,
707- timeout_seconds ,
708- batch_size ,
709- filter_pattern ,
752+ timeout_seconds = timeout_seconds ,
753+ batch_size = batch_size ,
754+ filter_pattern = filter_pattern ,
710755 szs_class = szs .SmithWatermanScores ,
711756 szs_name = "szs.SmithWatermanScores" ,
712757 )
0 commit comments