|
| 1 | +package info.debatty.stringsimilarity; |
| 2 | + |
| 3 | +import java.util.Arrays; |
| 4 | + |
/**
 * Jaro-Winkler similarity between two strings.
 *
 * The score is computed from the number of characters the two strings have in
 * common within a sliding window, the number of those characters that appear
 * in a different order, and the length of the prefix shared by both strings.
 * http://en.wikipedia.org/wiki/Jaro-Winkler_distance
 *
 * @author tibo
 */
public class JaroWinkler {

    /** Threshold used by a freshly created instance. */
    private static final float DEFAULT_THRESHOLD = 0.7f;

    /**
     * Minimal Jaro score from which the Winkler prefix bonus is added.
     */
    private float threshold = DEFAULT_THRESHOLD;

    /**
     * The Jaro–Winkler distance is in fact a measure of similarity between two
     * strings: 0 means no similarity and 1 is an exact match.
     * http://en.wikipedia.org/wiki/Jaro-Winkler_distance
     *
     * This is a convenience shortcut that evaluates {@link #sim(String, String)}
     * on a new instance, i.e. with the default threshold.
     *
     * @param s0 the first string, must not be null
     * @param s1 the second string, must not be null
     * @return the similarity of the two strings, between 0 and 1
     * @throws NullPointerException if s0 or s1 is null
     */
    public static double Similarity(String s0, String s1) {
        JaroWinkler jw = new JaroWinkler();
        return jw.sim(s0, s1);
    }

    /**
     * Computes the figures needed by the Jaro-Winkler formula.
     *
     * @param s1 the first string
     * @param s2 the second string
     * @return an array holding, in this order: the number of matching
     *         characters, the number of transpositions (already halved), the
     *         length of the common prefix and the length of the longest string
     */
    private int[] matches(String s1, String s2) {
        // Always scan the shortest string against the longest one.
        String max;
        String min;
        if (s1.length() > s2.length()) {
            max = s1;
            min = s2;
        } else {
            max = s2;
            min = s1;
        }

        // Two equal characters only match if they are at most 'range'
        // positions away from each other.
        int range = Math.max(max.length() / 2 - 1, 0);

        // matchIndexes[i]: position in max matched by min[i], or -1.
        int[] matchIndexes = new int[min.length()];
        Arrays.fill(matchIndexes, -1);
        // matchFlags[i]: true once max[i] has been consumed by a match.
        boolean[] matchFlags = new boolean[max.length()];

        int matches = 0;
        for (int mi = 0; mi < min.length(); mi++) {
            char c1 = min.charAt(mi);
            int from = Math.max(mi - range, 0);
            int to = Math.min(mi + range + 1, max.length());
            for (int xi = from; xi < to; xi++) {
                if (!matchFlags[xi] && c1 == max.charAt(xi)) {
                    matchIndexes[mi] = xi;
                    matchFlags[xi] = true;
                    matches++;
                    break;
                }
            }
        }

        // Collect the matched characters of each string, in their own order.
        char[] ms1 = new char[matches];
        char[] ms2 = new char[matches];
        for (int i = 0, si = 0; i < min.length(); i++) {
            if (matchIndexes[i] != -1) {
                ms1[si] = min.charAt(i);
                si++;
            }
        }
        for (int i = 0, si = 0; i < max.length(); i++) {
            if (matchFlags[i]) {
                ms2[si] = max.charAt(i);
                si++;
            }
        }

        // Every position where the two matched sequences disagree is one half
        // of a transposition, hence the division by two in the result.
        int transpositions = 0;
        for (int mi = 0; mi < ms1.length; mi++) {
            if (ms1[mi] != ms2[mi]) {
                transpositions++;
            }
        }

        // Length of the prefix shared by both strings; the whole common
        // prefix is counted, bounded only by the shortest string.
        int prefix = 0;
        for (int mi = 0; mi < min.length(); mi++) {
            if (s1.charAt(mi) == s2.charAt(mi)) {
                prefix++;
            } else {
                break;
            }
        }

        return new int[]{matches, transpositions / 2, prefix, max.length()};
    }

    /**
     * Computes the Jaro-Winkler similarity of two strings: 0 means no
     * similarity and 1 is an exact match.
     *
     * The Jaro score is computed first; when it reaches the threshold (see
     * {@link #getThreshold()}), a bonus proportional to the length of the
     * common prefix is added.
     *
     * @param s1 the first string, must not be null
     * @param s2 the second string, must not be null
     * @return the similarity of the two strings, between 0 and 1
     * @throws NullPointerException if s1 or s2 is null
     */
    public float sim(String s1, String s2) {
        if (s1 == null) {
            throw new NullPointerException("s1 must not be null");
        }
        if (s2 == null) {
            throw new NullPointerException("s2 must not be null");
        }

        // Identical strings are an exact match. Without this shortcut two
        // empty strings would have no matching character and score 0.
        if (s1.equals(s2)) {
            return 1f;
        }

        int[] mtp = matches(s1, s2);
        float m = mtp[0];
        if (m == 0) {
            // No character in common; also avoids dividing by zero below.
            return 0f;
        }

        float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
        // Winkler bonus: scaling factor (at most 0.1, at most 1 / longest
        // length) times the common prefix length, applied to what is missing
        // for a perfect score.
        float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
                * (1 - j);
        return jw;
    }

    /**
     * Sets the threshold used to determine when Winkler bonus should be used.
     * The bonus is added whenever the Jaro score is greater than or equal to
     * the threshold. Consequently a negative value makes the bonus apply to
     * every pair of strings, while a value greater than 1 disables the bonus
     * so that the plain Jaro similarity is returned.
     *
     * @param threshold the new value of the threshold
     */
    public void setThreshold(float threshold) {
        this.threshold = threshold;
    }

    /**
     * Returns the current value of the threshold used for adding the Winkler
     * bonus. The default value is 0.7.
     *
     * @return the current value of the threshold
     */
    public float getThreshold() {
        return threshold;
    }

}
0 commit comments