Skip to content

Commit bc148b5

Browse files
committed
Added MetricLCS distance
1 parent 045a060 commit bc148b5

File tree

4 files changed

+142
-3
lines changed

4 files changed

+142
-3
lines changed

README.md

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -249,11 +249,34 @@ public class MyApp {
249249
}
250250
```
251251

252-
## Metric Longest Common Subsequence
253-
tbd
254-
252+
## Metric LCS (Longest Common Subsequence)
253+
Distance metric based on Longest Common Subsequence, from the notes "An LCS-based string metric" by Daniel Bakkelund.
255254
http://heim.ifi.uio.no/~danielry/StringMetric.pdf
256255

256+
The distance is computed as `1 - |LCS(s1, s2)| / max(|s1|, |s2|)`.
257+
```java
258+
public class MyApp {
259+
260+
public static void main(String[] args) {
261+
262+
info.debatty.java.stringsimilarity.MetricLCS lcs =
263+
new info.debatty.java.stringsimilarity.MetricLCS();
264+
265+
String s1 = "ABCDEFG";
266+
String s2 = "ABCDEFHJKL";
267+
// LCS: ABCDEF => length = 6
268+
// longest = s2 => length = 10
269+
// => 1 - 6/10 = 0.4
270+
System.out.println(lcs.distance(s1, s2));
271+
272+
// LCS: ABDF => length = 4
273+
// both strings have length 5 => longest length = 5
274+
// => 1 - 4 / 5 = 0.2
275+
System.out.println(lcs.distance("ABDEF", "ABDIF"));
276+
}
277+
}
278+
```
279+
257280
## N-Gram distance (Kondrak)
258281

259282
Normalized N-Gram distance as defined by Kondrak, "N-Gram Similarity and Distance", String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.

src/main/java/info/debatty/java/stringsimilarity/LongestCommonSubsequence.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,24 @@ public static void main(String[] args) {
4040
}
4141

4242

43+
/**
44+
* Return the LCS distance between strings s1 and s2,
45+
* computed as |s1| + |s2| - 2 * |LCS(s1, s2)|
46+
* @param s1
47+
* @param s2
48+
* @return
49+
*/
4350
public double distance(String s1, String s2) {
4451
return s1.length() + s2.length() - 2 * length(s1, s2);
4552
}
4653

54+
/**
55+
* Return the length of the Longest Common Subsequence (LCS) between strings s1
56+
* and s2.
57+
* @param s1
58+
* @param s2
59+
* @return
60+
*/
4761
protected int length(String s1, String s2) {
4862
/* function LCSLength(X[1..m], Y[1..n])
4963
C = array(0..m, 0..n)
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2015 Thibault Debatty.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
package info.debatty.java.stringsimilarity;
26+
27+
import info.debatty.java.stringsimilarity.interfaces.MetricStringDistance;
28+
import info.debatty.java.stringsimilarity.interfaces.NormalizedStringDistance;
29+
30+
/**
31+
* Distance metric based on Longest Common Subsequence, from the notes "An
32+
* LCS-based string metric" by Daniel Bakkelund.
33+
* @author Thibault Debatty
34+
*/
35+
public class MetricLCS implements MetricStringDistance, NormalizedStringDistance {
36+
37+
private final LongestCommonSubsequence lcs = new LongestCommonSubsequence();
38+
39+
/**
40+
* Distance metric based on Longest Common Subsequence, computed as
41+
* 1 - |LCS(s1, s2)| / max(|s1|, |s2|)
42+
* @param s1
43+
* @param s2
44+
* @return
45+
*/
46+
public double distance(String s1, String s2) {
47+
return 1.0 - ((double) lcs.length(s1, s2)) / Math.max(s1.length(), s2.length());
48+
49+
}
50+
51+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2015 Thibault Debatty.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
25+
package info.debatty.java.stringsimilarity.examples;
26+
27+
/**
28+
*
29+
* @author Thibault Debatty
30+
*/
31+
public class MetricLCS {
32+
33+
public static void main(String[] args) {
34+
35+
info.debatty.java.stringsimilarity.MetricLCS lcs =
36+
new info.debatty.java.stringsimilarity.MetricLCS();
37+
38+
String s1 = "ABCDEFG";
39+
String s2 = "ABCDEFHJKL";
40+
// LCS: ABCDEF => length = 6
41+
// longest = s2 => length = 10
42+
// => 1 - 6/10 = 0.4
43+
System.out.println(lcs.distance(s1, s2));
44+
45+
// LCS: ABDF => length = 4
46+
// longest = ABDEF => length = 5
47+
// => 1 - 4 / 5 = 0.2
48+
System.out.println(lcs.distance("ABDEF", "ABDIF"));
49+
}
50+
51+
}

0 commit comments

Comments
 (0)