Skip to content

Commit 879be48

Browse files
authored
Merge pull request #46 from NationalBI/weighted-levenshtein-ins-del
WeightedLevenshtein ins/del weights.
2 parents a5d8421 + cfcde79 commit 879be48

File tree

3 files changed

+120
-11
lines changed

3 files changed

+120
-11
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package info.debatty.java.stringsimilarity;
2+
3+
4+
/**
5+
* As an adjunct to CharacterSubstitutionInterface, this interface
6+
* allows you to specify the cost of deletion or insertion of a
7+
* character.
8+
*/
9+
public interface CharacterInsDelInterface {
10+
/**
11+
* @param c The character being deleted.
12+
* @return The cost to be allocated to deleting the given character,
13+
* in the range [0, 1].
14+
*/
15+
double deletionCost(char c);
16+
17+
/**
18+
* @param c The character being inserted.
19+
* @return The cost to be allocated to inserting the given character,
20+
* in the range [0, 1].
21+
*/
22+
double insertionCost(char c);
23+
}

src/main/java/info/debatty/java/stringsimilarity/WeightedLevenshtein.java

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,27 @@
3636
public class WeightedLevenshtein implements StringDistance {
3737

3838
private final CharacterSubstitutionInterface charsub;
39+
private final CharacterInsDelInterface charchange;
3940

4041
/**
41-
* Instatiate with provided character substitution.
42+
* Instantiate with provided character substitution.
4243
* @param charsub The strategy to determine character substitution weights.
4344
*/
4445
public WeightedLevenshtein(final CharacterSubstitutionInterface charsub) {
46+
this(charsub, null);
47+
}
48+
49+
/**
50+
* Instantiate with provided character substitution, insertion, and
51+
* deletion weights.
52+
* @param charsub The strategy to determine character substitution weights.
53+
* @param charchange The strategy to determine character insertion /
54+
* deletion weights.
55+
*/
56+
public WeightedLevenshtein(final CharacterSubstitutionInterface charsub,
57+
final CharacterInsDelInterface charchange) {
4558
this.charsub = charsub;
59+
this.charchange = charchange;
4660
}
4761

4862
/**
@@ -79,28 +93,35 @@ public final double distance(final String s1, final String s2) {
7993
double[] vtemp;
8094

8195
// initialize v0 (the previous row of distances)
82-
// this row is A[0][i]: edit distance for an empty s
83-
// the distance is just the number of characters to delete from t
84-
for (int i = 0; i < v0.length; i++) {
85-
v0[i] = i;
96+
// this row is A[0][i]: edit distance for an empty s1
97+
// the distance is the cost of inserting each character of s2
98+
v0[0] = 0;
99+
for (int i = 1; i < v0.length; i++) {
100+
v0[i] = v0[i - 1] + insertionCost(s2.charAt(i - 1));
86101
}
87102

88103
for (int i = 0; i < s1.length(); i++) {
104+
char s1i = s1.charAt(i);
105+
double deletion_cost = deletionCost(s1i);
106+
89107
// calculate v1 (current row distances) from the previous row v0
90108
// first element of v1 is A[i+1][0]
91-
// edit distance is delete (i+1) chars from s to match empty t
92-
v1[0] = i + 1;
109+
// Edit distance is the cost of deleting characters from s1
110+
// to match empty s2.
111+
v1[0] = v0[0] + deletion_cost;
93112

94113
// use formula to fill in the rest of the row
95114
for (int j = 0; j < s2.length(); j++) {
115+
char s2j = s2.charAt(j);
96116
double cost = 0;
97-
if (s1.charAt(i) != s2.charAt(j)) {
98-
cost = charsub.cost(s1.charAt(i), s2.charAt(j));
117+
if (s1i != s2j) {
118+
cost = charsub.cost(s1i, s2j);
99119
}
120+
double insertion_cost = insertionCost(s2j);
100121
v1[j + 1] = Math.min(
101-
v1[j] + 1, // Cost of insertion
122+
v1[j] + insertion_cost, // Cost of insertion
102123
Math.min(
103-
v0[j + 1] + 1, // Cost of remove
124+
v0[j + 1] + deletion_cost, // Cost of deletion
104125
v0[j] + cost)); // Cost of substitution
105126
}
106127

@@ -115,4 +136,21 @@ public final double distance(final String s1, final String s2) {
115136

116137
return v0[s2.length()];
117138
}
139+
140+
141+
private double insertionCost(final char c) {
142+
if (charchange == null) {
143+
return 1.0;
144+
} else {
145+
return charchange.insertionCost(c);
146+
}
147+
}
148+
149+
private double deletionCost(final char c) {
150+
if (charchange == null) {
151+
return 1.0;
152+
} else {
153+
return charchange.deletionCost(c);
154+
}
155+
}
118156
}

src/test/java/info/debatty/java/stringsimilarity/WeightedLevenshteinTest.java

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,54 @@ public double cost(char c1, char c2) {
2727
assertEquals(0.5, instance.distance("String1", "Srring1"), 0.1);
2828
assertEquals(1.5, instance.distance("String1", "Srring2"), 0.1);
2929

30+
// One insert or delete.
31+
assertEquals(1.0, instance.distance("Strng", "String"), 0.1);
32+
assertEquals(1.0, instance.distance("String", "Strng"), 0.1);
33+
34+
NullEmptyTests.testDistance(instance);
35+
}
36+
37+
@Test
38+
public void testDistanceCharacterInsDelInterface() {
39+
WeightedLevenshtein instance = new WeightedLevenshtein(
40+
new CharacterSubstitutionInterface() {
41+
public double cost(char c1, char c2) {
42+
if (c1 == 't' && c2 == 'r') {
43+
return 0.5;
44+
}
45+
return 1.0;
46+
}
47+
},
48+
new CharacterInsDelInterface() {
49+
public double deletionCost(char c) {
50+
if (c == 'i') {
51+
return 0.8;
52+
}
53+
return 1.0;
54+
}
55+
56+
public double insertionCost(char c) {
57+
if (c == 'i') {
58+
return 0.5;
59+
}
60+
return 1.0;
61+
}
62+
});
63+
64+
// Same as testDistance above.
65+
assertEquals(0.0, instance.distance("String1", "String1"), 0.1);
66+
assertEquals(0.5, instance.distance("String1", "Srring1"), 0.1);
67+
assertEquals(1.5, instance.distance("String1", "Srring2"), 0.1);
68+
69+
// Cost of insert of 'i' is less than normal, so these scores are
70+
// different than testDistance above. Note that the cost of delete
71+
// has been set differently than the cost of insert, so the distance
72+
// call is not symmetric in its arguments when an 'i' is inserted or deleted.
73+
assertEquals(0.5, instance.distance("Strng", "String"), 0.1);
74+
assertEquals(0.8, instance.distance("String", "Strng"), 0.1);
75+
assertEquals(1.0, instance.distance("Strig", "String"), 0.1);
76+
assertEquals(1.0, instance.distance("String", "Strig"), 0.1);
77+
3078
NullEmptyTests.testDistance(instance);
3179
}
3280
}

0 commit comments

Comments
 (0)