Skip to content

Commit 21f36df

Browse files
committed
Checked and marked as Immutable + code clean
1 parent 28a50f9 commit 21f36df

File tree

4 files changed

+91
-62
lines changed

4 files changed

+91
-62
lines changed

src/main/java/info/debatty/java/stringsimilarity/JaroWinkler.java

Lines changed: 49 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,39 +3,43 @@
33
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
44
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
55
import java.util.Arrays;
6+
import net.jcip.annotations.Immutable;
67

78
/**
8-
* The Jaro–Winkler distance metric is designed and best suited for short
9-
* strings such as person names, and to detect typos; it is (roughly) a
10-
* variation of Damerau-Levenshtein, where the substitution of 2 close
9+
* The Jaro–Winkler distance metric is designed and best suited for short
10+
* strings such as person names, and to detect typos; it is (roughly) a
11+
* variation of Damerau-Levenshtein, where the substitution of 2 close
1112
* characters is considered less important than the substitution of 2 characters
1213
* that are far from each other.
13-
* Jaro-Winkler was developed in the area of record linkage (duplicate
14+
* Jaro-Winkler was developed in the area of record linkage (duplicate
1415
* detection) (Winkler, 1990). It returns a value in the interval [0.0, 1.0].
1516
* The distance is computed as 1 - Jaro-Winkler similarity.
1617
* @author Thibault Debatty
1718
*/
18-
public class JaroWinkler implements NormalizedStringSimilarity, NormalizedStringDistance {
19-
19+
@Immutable
20+
public class JaroWinkler
21+
implements NormalizedStringSimilarity, NormalizedStringDistance {
2022

23+
private static final double DEFAULT_THRESHOLD = 0.7;
24+
private static final int THREE = 3;
25+
private static final double JW_COEF = 0.1;
26+
private final double threshold;
27+
28+
/**
29+
* Instantiate with default threshold (0.7).
30+
*
31+
*/
2132
public JaroWinkler() {
22-
33+
this.threshold = DEFAULT_THRESHOLD;
2334
}
24-
25-
public JaroWinkler(double threshold) {
26-
this.setThreshold(threshold);
27-
}
28-
29-
private double threshold = 0.7;
30-
35+
3136
/**
32-
* Sets the threshold used to determine when Winkler bonus should be used.
33-
* Set to a negative value to get the Jaro distance.
34-
* Default value is 0.7
35-
*
36-
* @param threshold the new value of the threshold
37+
* Instantiate with given threshold to determine when Winkler bonus should
38+
* be used.
39+
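* The bonus is applied whenever the Jaro score is strictly greater than the
* threshold, so a negative threshold always applies it; use a threshold of
* 1.0 or more to get the plain Jaro score.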
40+
* @param threshold Jaro score above which the Winkler bonus is applied
3741
*/
38-
public final void setThreshold(double threshold) {
42+
public JaroWinkler(final double threshold) {
3943
this.threshold = threshold;
4044
}
4145

@@ -45,29 +49,44 @@ public final void setThreshold(double threshold) {
4549
*
4650
* @return the current value of the threshold
4751
*/
48-
public double getThreshold() {
52+
public final double getThreshold() {
4953
return threshold;
5054
}
5155

52-
public double similarity(String s1, String s2) {
56+
/**
57+
* Compute JW similarity.
58+
* @param s1
59+
* @param s2
60+
* @return the Jaro-Winkler similarity of s1 and s2
61+
*/
62+
public final double similarity(final String s1, final String s2) {
5363
int[] mtp = matches(s1, s2);
5464
float m = mtp[0];
5565
if (m == 0) {
5666
return 0f;
5767
}
58-
float j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m)) / 3;
59-
float jw = j < getThreshold() ? j : j + Math.min(0.1f, 1f / mtp[3]) * mtp[2]
60-
* (1 - j);
68+
double j = ((m / s1.length() + m / s2.length() + (m - mtp[1]) / m))
69+
/ THREE;
70+
double jw = j;
71+
72+
if (j > getThreshold()) {
73+
jw = j + Math.min(JW_COEF, 1.0 / mtp[THREE]) * mtp[2] * (1 - j);
74+
}
6175
return jw;
6276
}
63-
64-
65-
public double distance(String s1, String s2) {
77+
78+
79+
/**
80+
* Return 1 - similarity.
81+
* @param s1
82+
* @param s2
83+
* @return 1 - similarity(s1, s2)
84+
*/
85+
public final double distance(final String s1, final String s2) {
6686
return 1.0 - similarity(s1, s2);
6787
}
6888

69-
70-
private int[] matches(String s1, String s2) {
89+
private int[] matches(final String s1, final String s2) {
7190
String max, min;
7291
if (s1.length() > s2.length()) {
7392
max = s1;

src/main/java/info/debatty/java/stringsimilarity/LongestCommonSubsequence.java

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package info.debatty.java.stringsimilarity;
22

33
import info.debatty.java.stringsimilarity.interfaces.StringDistance;
4+
import net.jcip.annotations.Immutable;
45

56
/**
67
* The longest common subsequence (LCS) problem consists in finding the longest
@@ -22,18 +23,19 @@
2223
*
2324
* @author Thibault Debatty
2425
*/
26+
@Immutable
2527
public class LongestCommonSubsequence implements StringDistance {
2628

2729
/**
2830
* Return the LCS distance between strings s1 and s2, computed as |s1| +
29-
* |s2| - 2 * |LCS(s1, s2)|
31+
* |s2| - 2 * |LCS(s1, s2)|.
3032
*
3133
* @param s1
3234
* @param s2
3335
* @return the LCS distance between strings s1 and s2, computed as |s1| +
3436
* |s2| - 2 * |LCS(s1, s2)|
3537
*/
36-
public double distance(String s1, String s2) {
38+
public final double distance(final String s1, final String s2) {
3739
return s1.length() + s2.length() - 2 * length(s1, s2);
3840
}
3941

@@ -45,16 +47,16 @@ public double distance(String s1, String s2) {
4547
* @param s2
4648
* @return the length of LCS(s1, s2)
4749
*/
48-
protected int length(String s1, String s2) {
50+
public final int length(final String s1, final String s2) {
4951
/* function LCSLength(X[1..m], Y[1..n])
5052
C = array(0..m, 0..n)
51-
53+
5254
for i := 0..m
5355
C[i,0] = 0
54-
56+
5557
for j := 0..n
5658
C[0,j] = 0
57-
59+
5860
for i := 1..m
5961
for j := 1..n
6062
if X[i] = Y[j]
@@ -65,30 +67,30 @@ protected int length(String s1, String s2) {
6567
*/
6668
int m = s1.length();
6769
int n = s2.length();
68-
char[] X = s1.toCharArray();
69-
char[] Y = s2.toCharArray();
70+
char[] x = s1.toCharArray();
71+
char[] y = s2.toCharArray();
7072

71-
int[][] C = new int[m + 1][n + 1];
73+
int[][] c = new int[m + 1][n + 1];
7274

7375
for (int i = 0; i <= m; i++) {
74-
C[i][0] = 0;
76+
c[i][0] = 0;
7577
}
7678

7779
for (int j = 0; j <= n; j++) {
78-
C[0][j] = 0;
80+
c[0][j] = 0;
7981
}
8082

8183
for (int i = 1; i <= m; i++) {
8284
for (int j = 1; j <= n; j++) {
83-
if (X[i - 1] == Y[j - 1]) {
84-
C[i][j] = C[i - 1][j - 1] + 1;
85+
if (x[i - 1] == y[j - 1]) {
86+
c[i][j] = c[i - 1][j - 1] + 1;
8587

8688
} else {
87-
C[i][j] = Math.max(C[i][j - 1], C[i - 1][j]);
89+
c[i][j] = Math.max(c[i][j - 1], c[i - 1][j]);
8890
}
8991
}
9092
}
9193

92-
return C[m][n];
94+
return c[m][n];
9395
}
9496
}

src/main/java/info/debatty/java/stringsimilarity/MetricLCS.java

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,26 +26,31 @@
2626

2727
import info.debatty.java.stringsimilarity.interfaces.MetricStringDistance;
2828
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
29+
import net.jcip.annotations.Immutable;
2930

3031
/**
31-
* Distance metric based on Longest Common Subsequence, from the notes "An
32+
* Distance metric based on Longest Common Subsequence, from the notes "An
3233
* LCS-based string metric" by Daniel Bakkelund.
3334
* @author Thibault Debatty
3435
*/
35-
public class MetricLCS implements MetricStringDistance, NormalizedStringDistance {
36+
@Immutable
37+
public class MetricLCS
38+
implements MetricStringDistance, NormalizedStringDistance {
3639

3740
private final LongestCommonSubsequence lcs = new LongestCommonSubsequence();
38-
41+
3942
/**
40-
* Distance metric based on Longest Common Subsequence, computed as
41-
* 1 - |LCS(s1, s2)| / max(|s1|, |s2|)
43+
* Distance metric based on Longest Common Subsequence, computed as
44+
* 1 - |LCS(s1, s2)| / max(|s1|, |s2|).
4245
* @param s1
4346
* @param s2
44-
* @return
47+
* @return 1 - |LCS(s1, s2)| / max(|s1|, |s2|)
4548
*/
46-
public double distance(String s1, String s2) {
47-
return 1.0 - ((double) lcs.length(s1, s2)) / Math.max(s1.length(), s2.length());
48-
49+
public final double distance(final String s1, final String s2) {
50+
return 1.0
51+
- (1.0 * lcs.length(s1, s2))
52+
/ Math.max(s1.length(), s2.length());
53+
4954
}
50-
55+
5156
}

src/test/java/info/debatty/java/stringsimilarity/JaroWinklerTest.java

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,6 @@
2424

2525
package info.debatty.java.stringsimilarity;
2626

27-
import org.junit.After;
28-
import org.junit.AfterClass;
29-
import org.junit.Before;
30-
import org.junit.BeforeClass;
3127
import org.junit.Test;
3228
import static org.junit.Assert.*;
3329

@@ -42,10 +38,17 @@ public class JaroWinklerTest {
4238
* Test of similarity method, of class JaroWinkler.
4339
*/
4440
@Test
45-
public void testSimilarity() {
41+
public final void testSimilarity() {
4642
System.out.println("similarity");
4743
JaroWinkler instance = new JaroWinkler();
48-
assertEquals(0.9740740656852722, instance.similarity("My string", "My tsring"), 0.0);
49-
assertEquals(0.8962963223457336, instance.similarity("My string", "My ntrisg"), 0.0);
44+
assertEquals(
45+
0.974074,
46+
instance.similarity("My string", "My tsring"),
47+
0.000001);
48+
49+
assertEquals(
50+
0.896296,
51+
instance.similarity("My string", "My ntrisg"),
52+
0.000001);
5053
}
5154
}

0 commit comments

Comments
 (0)