Skip to content

Commit 797a7e2

Browse files
author
tibo
committed
JaroWinkler and Levenshtein
1 parent b4a122b commit 797a7e2

File tree

2 files changed

+191
-0
lines changed

2 files changed

+191
-0
lines changed
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
package info.debatty.stringsimilarity;
2+
3+
import java.util.Arrays;
4+
5+
/**
 * Jaro-Winkler string similarity.
 *
 * <p>The score is 0 when the strings share no matching character and 1 when
 * they are identical. The Winkler prefix bonus is only added when the plain
 * Jaro score reaches the configurable threshold (0.7 by default).
 *
 * @author tibo
 */
public class JaroWinkler {

    /**
     * The Jaro-Winkler distance is in fact a measure of similarity between two
     * strings: 0 means no similarity and 1 is an exact match.
     * http://en.wikipedia.org/wiki/Jaro-Winkler_distance
     *
     * <p>Convenience entry point that uses a fresh instance with the default
     * threshold.
     *
     * @param s0 the first string, must not be null
     * @param s1 the second string, must not be null
     * @return the Jaro-Winkler similarity of the two strings
     * @throws NullPointerException if either argument is null
     */
    public static double Similarity(String s0, String s1) {
        JaroWinkler jw = new JaroWinkler();
        return jw.sim(s0, s1);
    }

    /** Minimum Jaro score from which the Winkler prefix bonus is applied. */
    private float threshold = 0.7f;

    /**
     * Collects the raw counts needed by the Jaro-Winkler formula.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @return a four element array holding, in order: the number of matching
     *         characters, the number of transpositions (already halved), the
     *         length of the common prefix, and the length of the longer string
     */
    private int[] matches(String s1, String s2) {
        String max, min;
        if (s1.length() > s2.length()) {
            max = s1;
            min = s2;
        } else {
            max = s2;
            min = s1;
        }

        // Two characters only count as matching when their positions differ
        // by at most this many places.
        int range = Math.max(max.length() / 2 - 1, 0);

        // matchIndexes[i] is the position in max matched by min[i], or -1.
        int[] matchIndexes = new int[min.length()];
        Arrays.fill(matchIndexes, -1);
        // matchFlags[i] is true once max[i] has been claimed by a match.
        boolean[] matchFlags = new boolean[max.length()];

        int matches = 0;
        for (int mi = 0; mi < min.length(); mi++) {
            char c1 = min.charAt(mi);
            for (int xi = Math.max(mi - range, 0),
                    xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
                if (!matchFlags[xi] && c1 == max.charAt(xi)) {
                    matchIndexes[mi] = xi;
                    matchFlags[xi] = true;
                    matches++;
                    break;
                }
            }
        }

        // Extract the matched characters of each string, in their own order.
        char[] ms1 = new char[matches];
        char[] ms2 = new char[matches];
        for (int i = 0, si = 0; i < min.length(); i++) {
            if (matchIndexes[i] != -1) {
                ms1[si] = min.charAt(i);
                si++;
            }
        }
        for (int i = 0, si = 0; i < max.length(); i++) {
            if (matchFlags[i]) {
                ms2[si] = max.charAt(i);
                si++;
            }
        }

        // Every out-of-order pair is seen twice, hence the division by two
        // in the returned value.
        int transpositions = 0;
        for (int mi = 0; mi < ms1.length; mi++) {
            if (ms1[mi] != ms2[mi]) {
                transpositions++;
            }
        }

        // Length of the common prefix, bounded by the shorter string.
        int prefix = 0;
        for (int mi = 0; mi < min.length(); mi++) {
            if (s1.charAt(mi) == s2.charAt(mi)) {
                prefix++;
            } else {
                break;
            }
        }
        return new int[]{matches, transpositions / 2, prefix, max.length()};
    }

    /**
     * Computes the Jaro-Winkler similarity of two strings.
     *
     * @param s1 the first string, must not be null
     * @param s2 the second string, must not be null
     * @return 1 if the strings are equal (including two empty strings), 0 if
     *         they have no matching character, otherwise the Jaro score plus
     *         the Winkler prefix bonus when the Jaro score is at least the
     *         current threshold
     * @throws NullPointerException if either argument is null
     */
    public float sim(String s1, String s2) {
        if (s1 == null) {
            throw new NullPointerException("s1 must not be null");
        }
        if (s2 == null) {
            throw new NullPointerException("s2 must not be null");
        }

        // Identical strings are an exact match. Without this shortcut two
        // empty strings would fall into the "no match" branch and score 0.
        if (s1.equals(s2)) {
            return 1f;
        }

        int[] mtp = matches(s1, s2);
        float m = mtp[0];
        if (m == 0) {
            return 0f;
        }
        float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
        // Prefix bonus: scaling factor capped at 0.1 and at 1 / (longer length).
        float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
                * (1 - j);
        return jw;
    }

    /**
     * Sets the threshold used to determine when Winkler bonus should be used.
     * Set to a negative value to get the Jaro distance.
     *
     * @param threshold the new value of the threshold
     */
    public void setThreshold(float threshold) {
        this.threshold = threshold;
    }

    /**
     * Returns the current value of the threshold used for adding the Winkler
     * bonus. The default value is 0.7.
     *
     * @return the current value of the threshold
     */
    public float getThreshold() {
        return threshold;
    }

}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
package info.debatty.stringsimilarity;
2+
3+
/**
 * Levenshtein (edit) distance between two strings.
 *
 * @author tibo
 */
public class Levenshtein {

    /**
     * The Levenshtein distance, or edit distance, between two words is the
     * minimum number of single-character edits (i.e. insertions, deletions or
     * substitutions) required to change one word into the other.
     *
     * http://en.wikipedia.org/wiki/Levenshtein_distance
     *
     * It is always at least the difference of the sizes of the two strings.
     * It is at most the length of the longer string.
     * It is zero if and only if the strings are equal.
     * If the strings are the same size, the Hamming distance is an upper bound
     * on the Levenshtein distance.
     * The Levenshtein distance between two strings is no greater than the sum
     * of their Levenshtein distances from a third string (triangle inequality).
     *
     * @param s0 the first string, must not be null
     * @param s1 the second string, must not be null
     * @return the number of single-character edits needed to turn one string
     *         into the other
     */
    public static int Distance(String s0, String s1) {
        int len0 = s0.length() + 1;
        int len1 = s1.length() + 1;

        // Only two rows of the distance matrix are kept at any time:
        // cost is the previous row, newcost the row being filled.
        int[] cost = new int[len0];
        int[] newcost = new int[len0];

        // initial cost of skipping prefix in String s0
        for (int i = 0; i < len0; i++) {
            cost[i] = i;
        }

        // dynamically computing the array of distances
        // transformation cost for each letter in s1
        for (int j = 1; j < len1; j++) {

            // Initial cost of skipping the first j letters of s1. This must
            // be j (not j - 1), otherwise every row starts one edit too cheap.
            newcost[0] = j;

            // transformation cost for each letter in s0
            for (int i = 1; i < len0; i++) {

                // matching current letters in both strings
                int match = (s0.charAt(i - 1) == s1.charAt(j - 1)) ? 0 : 1;

                // computing cost for each transformation
                int cost_replace = cost[i - 1] + match;
                int cost_insert = cost[i] + 1;
                int cost_delete = newcost[i - 1] + 1;

                // keep minimum cost
                newcost[i] = Math.min(
                        Math.min(cost_insert, cost_delete),
                        cost_replace);
            }

            // swap cost/newcost arrays
            int[] swap = cost;
            cost = newcost;
            newcost = swap;
        }

        // the distance is the cost for transforming all letters in both strings
        return cost[len0 - 1];
    }
}

0 commit comments

Comments
 (0)