3636public class WeightedLevenshtein implements StringDistance {
3737
3838 private final CharacterSubstitutionInterface charsub ;
39+ private final CharacterInsDelInterface charchange ;
3940
/**
 * Instantiate with provided character substitution.
 * Delegates to the two-argument constructor with no insertion / deletion
 * strategy, so every insertion and deletion has a cost of 1.0.
 * @param charsub The strategy to determine character substitution weights.
 */
public WeightedLevenshtein(final CharacterSubstitutionInterface charsub) {
    this(charsub, null);
}
48+
/**
 * Instantiate with provided character substitution, insertion, and
 * deletion weights.
 * @param charsub The strategy to determine character substitution weights.
 * @param charchange The strategy to determine character insertion /
 *                   deletion weights. May be {@code null}, in which case
 *                   every insertion and deletion has a cost of 1.0.
 */
public WeightedLevenshtein(final CharacterSubstitutionInterface charsub,
                           final CharacterInsDelInterface charchange) {
    this.charsub = charsub;
    this.charchange = charchange;
}
4761
4862 /**
@@ -79,28 +93,35 @@ public final double distance(final String s1, final String s2) {
7993 double [] vtemp ;
8094
8195 // initialize v0 (the previous row of distances)
82- // this row is A[0][i]: edit distance for an empty s
83- // the distance is just the number of characters to delete from t
84- for (int i = 0 ; i < v0 .length ; i ++) {
85- v0 [i ] = i ;
96+ // this row is A[0][i]: edit distance for an empty s1
97+ // the distance is the cost of inserting each character of s2
98+ v0 [0 ] = 0 ;
99+ for (int i = 1 ; i < v0 .length ; i ++) {
100+ v0 [i ] = v0 [i - 1 ] + insertionCost (s2 .charAt (i - 1 ));
86101 }
87102
88103 for (int i = 0 ; i < s1 .length (); i ++) {
104+ char s1i = s1 .charAt (i );
105+ double deletion_cost = deletionCost (s1i );
106+
89107 // calculate v1 (current row distances) from the previous row v0
90108 // first element of v1 is A[i+1][0]
91- // edit distance is delete (i+1) chars from s to match empty t
92- v1 [0 ] = i + 1 ;
109+ // Edit distance is the cost of deleting characters from s1
110+ // to match empty s2.
111+ v1 [0 ] = v0 [0 ] + deletion_cost ;
93112
94113 // use formula to fill in the rest of the row
95114 for (int j = 0 ; j < s2 .length (); j ++) {
115+ char s2j = s2 .charAt (j );
96116 double cost = 0 ;
97- if (s1 . charAt ( i ) != s2 . charAt ( j ) ) {
98- cost = charsub .cost (s1 . charAt ( i ), s2 . charAt ( j ) );
117+ if (s1i != s2j ) {
118+ cost = charsub .cost (s1i , s2j );
99119 }
120+ double insertion_cost = insertionCost (s2j );
100121 v1 [j + 1 ] = Math .min (
101- v1 [j ] + 1 , // Cost of insertion
122+ v1 [j ] + insertion_cost , // Cost of insertion
102123 Math .min (
103- v0 [j + 1 ] + 1 , // Cost of remove
124+ v0 [j + 1 ] + deletion_cost , // Cost of deletion
104125 v0 [j ] + cost )); // Cost of substitution
105126 }
106127
@@ -115,4 +136,21 @@ public final double distance(final String s1, final String s2) {
115136
116137 return v0 [s2 .length ()];
117138 }
139+
140+
141+ private double insertionCost (final char c ) {
142+ if (charchange == null ) {
143+ return 1.0 ;
144+ } else {
145+ return charchange .insertionCost (c );
146+ }
147+ }
148+
149+ private double deletionCost (final char c ) {
150+ if (charchange == null ) {
151+ return 1.0 ;
152+ } else {
153+ return charchange .deletionCost (c );
154+ }
155+ }
118156}
0 commit comments