Skip to content

Commit 43354ca

Browse files
committed
Added Sift4 algorithm as mentioned in issue #25 but basic test fails
1 parent d9439d6 commit 43354ca

File tree

2 files changed

+241
-0
lines changed
  • src
    • main/java/info/debatty/java/stringsimilarity/experimental
    • test/java/info/debatty/java/stringsimilarity/experimental

2 files changed

+241
-0
lines changed
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2016 Thibault Debatty.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
package info.debatty.java.stringsimilarity.experimental;
25+
26+
import info.debatty.java.stringsimilarity.interfaces.StringDistance;
27+
import java.util.LinkedList;
28+
29+
/**
30+
* Sift4 - a general purpose string distance algorithm inspired by JaroWinkler
31+
* and Longest Common Subsequence.
32+
* Original JavaScript algorithm by siderite, java port by Nathan Fischer 2016.
33+
* https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-
34+
* distance.html
35+
*
36+
* @author Thibault Debatty
37+
*/
38+
public class Sift4 implements StringDistance {
39+
40+
private static final int DEFAULT_MAX_OFFSET = 10;
41+
42+
private int max_offset = DEFAULT_MAX_OFFSET;
43+
44+
/**
45+
* Set the maximum distance to search for character transposition.
46+
* Compute cost of algorithm is O(n . max_offset)
47+
* @param max_offset
48+
*/
49+
public final void setMaxOffset(final int max_offset) {
50+
this.max_offset = max_offset;
51+
}
52+
53+
/**
54+
* Sift4 - a general purpose string distance algorithm inspired by
55+
* JaroWinkler and Longest Common Subsequence.
56+
* Original JavaScript algorithm by siderite, java port by Nathan Fischer
57+
* 2016.
58+
* https://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-
59+
* distance.html
60+
*
61+
* @param s1
62+
* @param s2
63+
* @return
64+
*/
65+
public final double distance(final String s1, final String s2) {
66+
67+
/**
68+
* Used to store relation between same character in different positions
69+
* c1 and c2 in the input strings.
70+
*/
71+
class Offset {
72+
73+
private final int c1;
74+
private final int c2;
75+
private boolean trans;
76+
77+
Offset(final int c1, final int c2, final boolean trans) {
78+
this.c1 = c1;
79+
this.c2 = c2;
80+
this.trans = trans;
81+
}
82+
}
83+
84+
if (s1 == null || s1.isEmpty()) {
85+
if (s2 == null) {
86+
return 0;
87+
}
88+
89+
return s2.length();
90+
}
91+
92+
if (s2 == null || s2.isEmpty()) {
93+
return s1.length();
94+
}
95+
96+
int l1 = s1.length();
97+
int l2 = s2.length();
98+
99+
int c1 = 0; //cursor for string 1
100+
int c2 = 0; //cursor for string 2
101+
int lcss = 0; //largest common subsequence
102+
int local_cs = 0; //local common substring
103+
int trans = 0; //number of transpositions ('ab' vs 'ba')
104+
105+
// offset pair array, for computing the transpositions
106+
LinkedList<Offset> offset_arr = new LinkedList<Offset>();
107+
108+
while ((c1 < l1) && (c2 < l2)) {
109+
if (s1.charAt(c1) == s2.charAt(c2)) {
110+
local_cs++;
111+
boolean is_trans = false;
112+
// see if current match is a transposition
113+
int i = 0;
114+
while (i < offset_arr.size()) {
115+
Offset ofs = offset_arr.get(i);
116+
if (c1 <= ofs.c1 || c2 <= ofs.c2) {
117+
// when two matches cross, the one considered a
118+
// transposition is the one with the largest difference
119+
// in offsets
120+
is_trans =
121+
Math.abs(c2 - c1) >= Math.abs(ofs.c2 - ofs.c1);
122+
if (is_trans) {
123+
124+
trans++;
125+
} else {
126+
if (!ofs.trans) {
127+
ofs.trans = true;
128+
trans++;
129+
}
130+
}
131+
132+
break;
133+
} else {
134+
if (c1 > ofs.c2 && c2 > ofs.c1) {
135+
offset_arr.remove(i);
136+
} else {
137+
i++;
138+
}
139+
}
140+
}
141+
offset_arr.push(new Offset(c1, c2, is_trans));
142+
143+
} else {
144+
145+
// s1.charAt(c1) != s2.charAt(c2)
146+
lcss += local_cs;
147+
local_cs = 0;
148+
if (c1 != c2) {
149+
//using min allows the computation of transpositions
150+
c1 = Math.min(c1, c2);
151+
c2 = c1;
152+
}
153+
154+
// if matching characters are found, remove 1 from both cursors
155+
// (they get incremented at the end of the loop)
156+
// so that we can have only one code block handling matches
157+
for (
158+
int i = 0;
159+
i < max_offset && (c1 + i < l1 || c2 + i < l2);
160+
i++) {
161+
162+
if ((c1 + i < l1) && (s1.charAt(c1 + i) == s2.charAt(c2))) {
163+
c1 += i - 1;
164+
c2--;
165+
break;
166+
}
167+
168+
if ((c2 + i < l2) && (s1.charAt(c1) == s2.charAt(c2 + i))) {
169+
c1--;
170+
c2 += i - 1;
171+
break;
172+
}
173+
}
174+
}
175+
c1++;
176+
c2++;
177+
// this covers the case where the last match is on the last token
178+
// in list, so that it can compute transpositions correctly
179+
if ((c1 >= l1) || (c2 >= l2)) {
180+
lcss += local_cs;
181+
local_cs = 0;
182+
c1 = Math.min(c1, c2);
183+
c2 = c1;
184+
}
185+
}
186+
lcss += local_cs;
187+
// add the cost of transpositions to the final result
188+
return Math.round(Math.max(l1, l2) - lcss + trans);
189+
}
190+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2016 Thibault Debatty.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
package info.debatty.java.stringsimilarity.experimental;
26+
27+
import static org.junit.Assert.assertEquals;
28+
import org.junit.Test;
29+
30+
/**
31+
*
32+
* @author Thibault Debatty
33+
*/
34+
public class Sift4Test {
35+
36+
/**
37+
* Test of distance method, of class Sift4.
38+
*/
39+
@Test
40+
public void testDistance() {
41+
System.out.println("SIFT4 distance");
42+
String s1 = "This is the first string";
43+
String s2 = "And this is another string";
44+
Sift4 sift4 = new Sift4();
45+
sift4.setMaxOffset(5);
46+
double expResult = 11.0;
47+
double result = sift4.distance(s1, s2);
48+
assertEquals(expResult, result, 0.0);
49+
}
50+
51+
}

0 commit comments

Comments
 (0)