1- use std:: cmp;
1+ use std:: { cmp, mem } ;
22
/// Finds the [edit distance] between two strings, ignoring case.
///
/// Both inputs are lowercased before they are compared, so `"CHECK"` and
/// `"check"` are at distance 0 instead of being as far apart as `"CHECK"` and
/// `"build"`. Lengths and positions are counted in `char`s of the lowercased
/// text, not in bytes.
///
/// Each of the following edits costs 1:
///
/// * inserting a character,
/// * deleting a character,
/// * substituting one character for another,
/// * swapping two adjacent characters.
///
/// Returns `Some(distance)` when the distance is at most `limit`, and `None`
/// if the distance exceeds the limit.
///
/// [edit distance]: https://en.wikipedia.org/wiki/Edit_distance
pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
    // Lowercase first so that a difference in capitalization is cheaper than a
    // completely different letter (e.g. `C` is closer to `c` than to `b`).
    let a: Vec<char> = a.to_lowercase().chars().collect();
    let b: Vec<char> = b.to_lowercase().chars().collect();

    // Ensure that `short` is the shorter string: the work buffers below are
    // sized by it, which minimizes memory use.
    let (mut long, mut short) = (a.as_slice(), b.as_slice());
    if long.len() < short.len() {
        std::mem::swap(&mut long, &mut short);
    }

    // At least this many insertions are needed just to make up the difference
    // in length, so we can return early if that alone exceeds the limit.
    let min_dist = long.len() - short.len();
    if min_dist > limit {
        return None;
    }

    // Strip the common prefix; identical leading characters never contribute
    // to the distance.
    let prefix = long
        .iter()
        .zip(short)
        .take_while(|(x, y)| x == y)
        .count();
    long = &long[prefix..];
    short = &short[prefix..];

    // Strip the common suffix for the same reason.
    let suffix = long
        .iter()
        .rev()
        .zip(short.iter().rev())
        .take_while(|(x, y)| x == y)
        .count();
    long = &long[..long.len() - suffix];
    short = &short[..short.len() - suffix];

    // If either string is empty, the distance is the length of the other.
    // `short` is the shorter one, so it is the only one that needs checking,
    // and stripping removed the same number of characters from both, so the
    // remaining length of `long` still equals `min_dist`.
    if short.is_empty() {
        return Some(min_dist);
    }

    // Three rolling rows of the dynamic-programming table. `prev_prev` is only
    // read once it holds a real row (from the second row on), so its initial
    // `usize::MAX` filler is never used in arithmetic.
    let width = short.len() + 1;
    let mut prev_prev: Vec<usize> = vec![usize::MAX; width];
    let mut prev: Vec<usize> = (0..width).collect();
    let mut current: Vec<usize> = vec![0; width];

    // Row by row: `i` is the number of characters of `long` consumed.
    for i in 1..=long.len() {
        current[0] = i;

        // Column by column: `j` is the number of characters of `short` consumed.
        for j in 1..=short.len() {
            // There is no cost to substitute a character with itself.
            let substitution_cost = usize::from(long[i - 1] != short[j - 1]);

            let mut best = (prev[j] + 1) // deletion
                .min(current[j - 1] + 1) // insertion
                .min(prev[j - 1] + substitution_cost); // substitution

            // Transposition: the last two characters of each side are the
            // same pair in opposite order.
            if i > 1 && j > 1 && long[i - 1] == short[j - 2] && long[i - 2] == short[j - 1] {
                best = best.min(prev_prev[j - 2] + 1);
            }

            current[j] = best;
        }

        // Rotate the buffers, reusing the memory.
        [prev_prev, prev, current] = [prev, current, prev_prev];
    }

    // `prev` because the buffers were already rotated after the last row.
    let distance = prev[short.len()];
    (distance <= limit).then_some(distance)
}
4495
4596/// Find the closest element from `iter` matching `choice`. The `key` callback
@@ -51,8 +102,7 @@ pub fn closest<'a, T>(
51102) -> Option < T > {
52103 // Only consider candidates with a lev_distance of 3 or less so we don't
53104 // suggest out-of-the-blue options.
54- iter. map ( |e| ( lev_distance ( choice, key ( & e) ) , e) )
55- . filter ( |& ( d, _) | d < 4 )
105+ iter. filter_map ( |e| Some ( ( lev_distance ( choice, key ( & e) , 3 ) ?, e) ) )
56106 . min_by_key ( |t| t. 0 )
57107 . map ( |t| t. 1 )
58108}
@@ -78,16 +128,16 @@ fn test_lev_distance() {
78128 . filter_map ( from_u32)
79129 . map ( |i| i. to_string ( ) )
80130 {
81- assert_eq ! ( lev_distance( & c, & c) , 0 ) ;
131+ assert_eq ! ( lev_distance( & c, & c, usize :: MAX ) , Some ( 0 ) ) ;
82132 }
83133
84134 let a = "\n Märy häd ä little lämb\n \n Little lämb\n " ;
85135 let b = "\n Mary häd ä little lämb\n \n Little lämb\n " ;
86136 let c = "Mary häd ä little lämb\n \n Little lämb\n " ;
87- assert_eq ! ( lev_distance( a, b) , 1 ) ;
88- assert_eq ! ( lev_distance( b, a) , 1 ) ;
89- assert_eq ! ( lev_distance( a, c) , 2 ) ;
90- assert_eq ! ( lev_distance( c, a) , 2 ) ;
91- assert_eq ! ( lev_distance( b, c) , 1 ) ;
92- assert_eq ! ( lev_distance( c, b) , 1 ) ;
137+ assert_eq ! ( lev_distance( a, b, usize :: MAX ) , Some ( 1 ) ) ;
138+ assert_eq ! ( lev_distance( b, a, usize :: MAX ) , Some ( 1 ) ) ;
139+ assert_eq ! ( lev_distance( a, c, usize :: MAX ) , Some ( 2 ) ) ;
140+ assert_eq ! ( lev_distance( c, a, usize :: MAX ) , Some ( 2 ) ) ;
141+ assert_eq ! ( lev_distance( b, c, usize :: MAX ) , Some ( 1 ) ) ;
142+ assert_eq ! ( lev_distance( c, b, usize :: MAX ) , Some ( 1 ) ) ;
93143}
0 commit comments