Skip to content

Commit 790b12f

Browse files
committed
Marked classes as Immutable + code clean
1 parent eb757f0 commit 790b12f

File tree

9 files changed

+189
-157
lines changed

9 files changed

+189
-157
lines changed

src/main/java/info/debatty/java/stringsimilarity/Cosine.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
2727
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
28+
import net.jcip.annotations.Immutable;
2829

2930
/**
3031
* The similarity between the two strings is the cosine of the angle between
@@ -33,6 +34,7 @@
3334
*
3435
* @author Thibault Debatty
3536
*/
37+
@Immutable
3638
public class Cosine extends ShingleBased implements
3739
NormalizedStringDistance, NormalizedStringSimilarity {
3840

@@ -49,7 +51,11 @@ public Cosine(final int k) {
4951
}
5052

5153
/**
52-
*
54+
* Implements Cosine Similarity between strings. The strings are first
55+
* transformed into vectors of occurrences of k-shingles (sequences of k
56+
* characters). In this n-dimensional space, the similarity between the two
57+
* strings is the cosine of the angle between their respective vectors.
58+
* Default k is 3.
5359
*/
5460
public Cosine() {
5561
super();
@@ -63,10 +69,10 @@ public Cosine() {
6369
*/
6470
public final double similarity(final String s1, final String s2) {
6571

66-
if (s1.length() < k || s2.length() < k) {
72+
if (s1.length() < getK() || s2.length() < getK()) {
6773
return 0;
6874
}
69-
KShingling ks = new KShingling(k);
75+
KShingling ks = new KShingling(getK());
7076
int[] profile1 = ks.getArrayProfile(s1);
7177
int[] profile2 = ks.getArrayProfile(s2);
7278

@@ -80,7 +86,7 @@ public final double similarity(final String s1, final String s2) {
8086
* @param profile
8187
* @return L2 norm
8288
*/
83-
protected static double norm(final int[] profile) {
89+
private static double norm(final int[] profile) {
8490
double agg = 0;
8591

8692
for (int v : profile) {
@@ -90,7 +96,7 @@ protected static double norm(final int[] profile) {
9096
return Math.sqrt(agg);
9197
}
9298

93-
protected static double dotProduct(
99+
private static double dotProduct(
94100
final int[] profile1, final int[] profile2) {
95101

96102
// profiles may not have the same length

src/main/java/info/debatty/java/stringsimilarity/Damerau.java

Lines changed: 42 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,14 @@
2525

2626
import info.debatty.java.stringsimilarity.interfaces.MetricStringDistance;
2727
import java.util.HashMap;
28-
import info.debatty.java.stringsimilarity.interfaces.StringDistance;
28+
import net.jcip.annotations.Immutable;
2929

3030
/**
31-
* Implementation of Damerau-Levenshtein distance with transposition (also
31+
* Implementation of Damerau-Levenshtein distance with transposition (also
3232
* sometimes called unrestricted Damerau-Levenshtein distance).
33-
* It is the minimum number of operations needed to transform one string into
34-
* the other, where an operation is defined as an insertion, deletion, or
35-
* substitution of a single character, or a transposition of two adjacent
33+
* It is the minimum number of operations needed to transform one string into
34+
* the other, where an operation is defined as an insertion, deletion, or
35+
* substitution of a single character, or a transposition of two adjacent
3636
* characters.
3737
* It does respect triangle inequality, and is thus a metric distance.
3838
*
@@ -41,73 +41,84 @@
4141
*
4242
* @author Thibault Debatty
4343
*/
44-
public class Damerau implements StringDistance, MetricStringDistance {
45-
46-
public double distance(String s1, String s2) {
44+
@Immutable
45+
public class Damerau implements MetricStringDistance {
46+
47+
/**
48+
* Compute the distance between strings: the minimum number of operations
49+
* needed to transform one string into the other (insertion, deletion,
50+
* substitution of a single character, or a transposition of two adjacent
51+
* characters).
52+
* @param s1
53+
* @param s2
54+
* @return
55+
*/
56+
public final double distance(final String s1, final String s2) {
4757

4858
// INFinite distance is the max possible distance
49-
int INF = s1.length() + s2.length();
59+
int inf = s1.length() + s2.length();
5060

5161
// Create and initialize the character array indices
52-
HashMap<Character, Integer> DA = new HashMap<Character, Integer>();
62+
HashMap<Character, Integer> da = new HashMap<Character, Integer>();
5363

5464
for (int d = 0; d < s1.length(); d++) {
55-
if (!DA.containsKey(s1.charAt(d))) {
56-
DA.put(s1.charAt(d), 0);
65+
if (!da.containsKey(s1.charAt(d))) {
66+
da.put(s1.charAt(d), 0);
5767
}
5868
}
5969

6070
for (int d = 0; d < s2.length(); d++) {
61-
if (!DA.containsKey(s2.charAt(d))) {
62-
DA.put(s2.charAt(d), 0);
71+
if (!da.containsKey(s2.charAt(d))) {
72+
da.put(s2.charAt(d), 0);
6373
}
6474
}
6575

6676
// Create the distance matrix H[0 .. s1.length+1][0 .. s2.length+1]
67-
int[][] H = new int[s1.length() + 2][s2.length() + 2];
77+
int[][] h = new int[s1.length() + 2][s2.length() + 2];
6878

6979
// initialize the left and top edges of H
7080
for (int i = 0; i <= s1.length(); i++) {
71-
H[i + 1][0] = INF;
72-
H[i + 1][1] = i;
81+
h[i + 1][0] = inf;
82+
h[i + 1][1] = i;
7383
}
7484

7585
for (int j = 0; j <= s2.length(); j++) {
76-
H[0][j + 1] = INF;
77-
H[1][j + 1] = j;
86+
h[0][j + 1] = inf;
87+
h[1][j + 1] = j;
7888

7989
}
8090

8191
// fill in the distance matrix H
8292
// look at each character in s1
8393
for (int i = 1; i <= s1.length(); i++) {
84-
int DB = 0;
94+
int db = 0;
8595

8696
// look at each character in b
8797
for (int j = 1; j <= s2.length(); j++) {
88-
int i1 = DA.get(s2.charAt(j - 1));
89-
int j1 = DB;
98+
int i1 = da.get(s2.charAt(j - 1));
99+
int j1 = db;
90100

91101
int cost = 1;
92102
if (s1.charAt(i - 1) == s2.charAt(j - 1)) {
93103
cost = 0;
94-
DB = j;
104+
db = j;
95105
}
96106

97-
H[i + 1][j + 1] = min(
98-
H[i][j] + cost, // substitution
99-
H[i + 1][j] + 1, // insertion
100-
H[i][j + 1] + 1, // deletion
101-
H[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
107+
h[i + 1][j + 1] = min(
108+
h[i][j] + cost, // substitution
109+
h[i + 1][j] + 1, // insertion
110+
h[i][j + 1] + 1, // deletion
111+
h[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
102112
}
103113

104-
DA.put(s1.charAt(i - 1), i);
114+
da.put(s1.charAt(i - 1), i);
105115
}
106116

107-
return H[s1.length() + 1][s2.length() + 1];
117+
return h[s1.length() + 1][s2.length() + 1];
108118
}
109119

110-
protected static int min(int a, int b, int c, int d) {
120+
private static int min(
121+
final int a, final int b, final int c, final int d) {
111122
return Math.min(a, Math.min(b, Math.min(c, d)));
112123
}
113124

src/main/java/info/debatty/java/stringsimilarity/Jaccard.java

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -27,66 +27,82 @@
2727
import info.debatty.java.stringsimilarity.interfaces.MetricStringDistance;
2828
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringSimilarity;
2929
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
30+
import net.jcip.annotations.Immutable;
3031

3132
/**
32-
* Each input string is converted into a set of n-grams, the Jaccard index is
33+
* Each input string is converted into a set of n-grams, the Jaccard index is
3334
* then computed as |V1 inter V2| / |V1 union V2|.
34-
* Like Q-Gram distance, the input strings are first converted into sets of
35-
* n-grams (sequences of n characters, also called k-shingles), but this time
36-
* the cardinality of each n-gram is not taken into account.
35+
* Like Q-Gram distance, the input strings are first converted into sets of
36+
* n-grams (sequences of n characters, also called k-shingles), but this time
37+
* the cardinality of each n-gram is not taken into account.
3738
* Distance is computed as 1 - Jaccard index.
3839
* Jaccard distance is a metric distance.
3940
* @author Thibault Debatty
4041
*/
41-
public class Jaccard extends ShingleBased implements
42-
MetricStringDistance, NormalizedStringDistance, NormalizedStringSimilarity {
43-
42+
@Immutable
43+
public class Jaccard extends ShingleBased implements
44+
MetricStringDistance, NormalizedStringDistance,
45+
NormalizedStringSimilarity {
46+
4447
/**
4548
* The strings are first transformed into sets of k-shingles (sequences of k
4649
* characters), then Jaccard index is computed as |A inter B| / |A union B|.
4750
* The default value of k is 3.
48-
*
49-
* @param k
51+
*
52+
* @param k
5053
*/
51-
public Jaccard(int k) {
54+
public Jaccard(final int k) {
5255
super(k);
5356
}
54-
57+
5558
/**
56-
*
59+
* The strings are first transformed into sets of k-shingles (sequences of k
60+
* characters), then Jaccard index is computed as |A inter B| / |A union B|.
61+
* The default value of k is 3.
5762
*/
5863
public Jaccard() {
5964
super();
6065
}
6166

62-
63-
public double similarity(String s1, String s2) {
64-
KShingling ks = new KShingling(k);
67+
/**
68+
* Compute jaccard index: |A inter B| / |A union B|.
69+
* @param s1
70+
* @param s2
71+
* @return
72+
*/
73+
public final double similarity(final String s1, final String s2) {
74+
KShingling ks = new KShingling(getK());
6575
int[] profile1 = ks.getArrayProfile(s1);
6676
int[] profile2 = ks.getArrayProfile(s2);
67-
77+
6878
int length = Math.max(profile1.length, profile2.length);
6979
profile1 = java.util.Arrays.copyOf(profile1, length);
7080
profile2 = java.util.Arrays.copyOf(profile2, length);
71-
81+
7282
int inter = 0;
7383
int union = 0;
74-
84+
7585
for (int i = 0; i < length; i++) {
7686
if (profile1[i] > 0 || profile2[i] > 0) {
7787
union++;
78-
88+
7989
if (profile1[i] > 0 && profile2[i] > 0) {
8090
inter++;
8191
}
8292
}
8393
}
84-
85-
return (double) inter / union;
94+
95+
return 1.0 * inter / union;
8696
}
87-
8897

89-
public double distance(String s1, String s2) {
98+
99+
/**
100+
* Distance is computed as 1 - similarity.
101+
* @param s1
102+
* @param s2
103+
* @return
104+
*/
105+
public final double distance(final String s1, final String s2) {
90106
return 1.0 - similarity(s1, s2);
91107
}
92108
}

0 commit comments

Comments
 (0)