@@ -13,6 +13,7 @@ public static void main (String[] args) {
1313 Levenshtein l = new Levenshtein ();
1414
1515 System .out .println (l .distanceAbsolute ("My string" , "My $tring" ));
16+ System .out .println (l .distanceAbsolute ("My string" , "M string2" ));
1617 System .out .println (l .distance ("My string" , "My $tring" ));
1718 System .out .println (l .similarity ("My string" , "My $tring" ));
1819 }
@@ -24,7 +25,7 @@ public static int Distance(String s1, String s2) {
2425
2526 @ Override
2627 public double distance (String s1 , String s2 ) {
27- return (( double ) distanceAbsolute (s1 , s2 ) ) / Math .max (s1 .length (), s2 .length ());
28+ return (double ) distanceAbsolute (s1 , s2 ) / Math .max (s1 .length (), s2 .length ());
2829
2930 }
3031
@@ -35,7 +36,7 @@ public double similarity(String s1, String s2) {
3536
3637 /**
3738 * The Levenshtein distance, or edit distance, between two words is the
38- * minimum number of single-character edits (i.e. insertions, deletions or
39+ * minimum number of single-character edits (insertions, deletions or
3940 * substitutions) required to change one word into the other.
4041 *
4142 * http://en.wikipedia.org/wiki/Levenshtein_distance
@@ -45,57 +46,69 @@ public double similarity(String s1, String s2) {
4546 * It is zero if and only if the strings are equal.
4647 * If the strings are the same size, the Hamming distance is an upper bound
4748 * on the Levenshtein distance.
48- * The Levenshtein distance between two strings is no greater than the sum
49- * of their Levenshtein distances from a third string (triangle inequality).
49+ * The Levenshtein distance satisfies the triangle inequality (the distance
50+ * between two strings is no greater than the sum of their Levenshtein
51+ * distances from a third string).
52+ *
53+ * Implementation uses dynamic programming (Wagner–Fischer algorithm), with
54+ * only 2 rows of data. The space requirement is thus O(m) and the algorithm
55+ * runs in O(mn), where m is the length of s2 and n is the length of s1.
5056 *
51- * @param s0
5257 * @param s1
58+ * @param s2
5359 * @return
5460 */
55- public int distanceAbsolute (String s0 , String s1 ) {
56- int len0 = s0 . length () + 1 ;
57- int len1 = s1 . length () + 1 ;
58-
59- // the array of distances
60- int [] cost = new int [ len0 ];
61- int [] newcost = new int [ len0 ] ;
62-
63- // initial cost of skipping prefix in String s0
64- for ( int i = 0 ; i < len0 ; i ++ ) {
65- cost [ i ] = i ;
61+ public int distanceAbsolute (String s1 , String s2 ) {
62+ if ( s1 . equals ( s2 )){
63+ return 0 ;
64+ }
65+
66+ if ( s1 . length () == 0 ) {
67+ return s2 . length () ;
68+ }
69+
70+ if ( s2 . length () == 0 ) {
71+ return s1 . length () ;
6672 }
6773
68- // dynamicaly computing the array of distances
69- // transformation cost for each letter in s1
70- for (int j = 1 ; j < len1 ; j ++) {
71-
72- // initial cost of skipping prefix in String s1
73- newcost [0 ] = j - 1 ;
74-
75- // transformation cost for each letter in s0
76- for (int i = 1 ; i < len0 ; i ++) {
77-
78- // matching current letters in both strings
79- int match = (s0 .charAt (i - 1 ) == s1 .charAt (j - 1 )) ? 0 : 1 ;
74+ // create two work vectors of integer distances
75+ int [] v0 = new int [s2 .length () + 1 ];
76+ int [] v1 = new int [s2 .length () + 1 ];
77+ int [] vtemp ;
8078
81- // computing cost for each transformation
82- int cost_replace = cost [i - 1 ] + match ;
83- int cost_insert = cost [i ] + 1 ;
84- int cost_delete = newcost [i - 1 ] + 1 ;
79+ // initialize v0 (the previous row of distances)
80+ // this row is A[0][i]: edit distance for an empty s1
81+ // the distance is just the number of characters to delete from s2
82+ for (int i = 0 ; i < v0 .length ; i ++) {
83+ v0 [i ] = i ;
84+ }
85+
86+ for (int i = 0 ; i < s1 .length (); i ++) {
87+ // calculate v1 (current row distances) from the previous row v0
88+ // first element of v1 is A[i+1][0]
89+ // edit distance is delete (i+1) chars from s1 to match empty s2
90+ v1 [0 ] = i + 1 ;
8591
86- // keep minimum cost
87- newcost [i ] = Math .min (
88- Math .min (cost_insert , cost_delete ),
89- cost_replace );
92+ // use formula to fill in the rest of the row
93+ for (int j = 0 ; j < s2 .length (); j ++) {
94+ int cost = (s1 .charAt (i ) == s2 .charAt (j )) ? 0 : 1 ;
95+ v1 [j + 1 ] = Math .min (
96+ v1 [j ] + 1 , // Cost of insertion
97+ Math .min (
98+ v0 [j + 1 ] + 1 , // Cost of remove
99+ v0 [j ] + cost )); // Cost of substitution
90100 }
91-
92- // swap cost/newcost arrays
93- int [] swap = cost ;
94- cost = newcost ;
95- newcost = swap ;
101+
102+ // copy v1 (current row) to v0 (previous row) for next iteration
103+ //System.arraycopy(v1, 0, v0, 0, v0.length);
104+
105+ // Flip references to current and previous row
106+ vtemp = v0 ;
107+ v0 = v1 ;
108+ v1 = vtemp ;
109+
96110 }
97111
98- // the distance is the cost for transforming all letters in both strings
99- return cost [len0 - 1 ];
112+ return v0 [s2 .length ()];
100113 }
101114}
0 commit comments