Skip to content

Commit e01071d

Browse files
committed
Merge remote-tracking branch 'bertsky/normalized-cer' into v3-api
2 parents 88497a1 + a33b713 commit e01071d

11 files changed: +23 −35 lines changed

src/dinglehopper/character_error_rate.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,7 @@ def character_error_rate_n(
2020
:return: character error rate and length of the reference
2121
"""
2222

23-
d = distance(reference, compared)
24-
n = len(reference)
25-
26-
if d == 0:
27-
return 0, n
28-
if n == 0:
29-
return float("inf"), n
30-
return d / n, n
23+
return distance(reference, compared), len(reference)
3124

3225
# XXX Should we really count newlines here?
3326

src/dinglehopper/edit_distance.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,18 @@
99

1010

1111
@multimethod
12-
def distance(seq1: List[str], seq2: List[str]) -> int:
12+
def distance(seq1: List[str], seq2: List[str]) -> float:
1313
"""Compute the Levenshtein edit distance between two lists of grapheme clusters.
1414
1515
This assumes that the grapheme clusters are already normalized.
1616
1717
Use distance(str, str) instead if you need to compare two Unicode strings.
1818
"""
19-
return Levenshtein.distance(seq1, seq2)
19+
return Levenshtein.normalized_distance(seq1, seq2)
2020

2121

2222
@distance.register
23-
def _(s1: str, s2: str) -> int:
23+
def _(s1: str, s2: str) -> float:
2424
"""Compute the Levenshtein edit distance between two Unicode strings
2525
2626
Note that this is different from levenshtein() as this function knows about Unicode
@@ -29,12 +29,12 @@ def _(s1: str, s2: str) -> int:
2929
"""
3030
seq1 = list(grapheme_clusters(unicodedata.normalize("NFC", s1)))
3131
seq2 = list(grapheme_clusters(unicodedata.normalize("NFC", s2)))
32-
return Levenshtein.distance(seq1, seq2)
32+
return Levenshtein.normalized_distance(seq1, seq2)
3333

3434

3535
@distance.register
36-
def _(s1: ExtractedText, s2: ExtractedText) -> int:
37-
return Levenshtein.distance(s1.grapheme_clusters, s2.grapheme_clusters)
36+
def _(s1: ExtractedText, s2: ExtractedText) -> float:
37+
return Levenshtein.normalized_distance(s1.grapheme_clusters, s2.grapheme_clusters)
3838

3939

4040
def editops(word1, word2):

src/dinglehopper/tests/test_character_error_rate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ def test_character_error_rate():
1414
assert character_error_rate("Foo", "") == 3 / 3
1515

1616
assert character_error_rate("", "") == 0
17-
assert math.isinf(character_error_rate("", "Foo"))
17+
assert character_error_rate("", "Foo") == 3 / 3
1818

19-
assert character_error_rate("Foo", "Food") == 1 / 3
19+
assert character_error_rate("Foo", "Food") == 1 / 4
2020
assert character_error_rate("Fnord", "Food") == 2 / 5
2121
assert character_error_rate("Müll", "Mull") == 1 / 4
2222
assert character_error_rate("Abstand", "Sand") == 4 / 7

src/dinglehopper/tests/test_edit_distance.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66

77

88
def test_distance():
9-
assert distance("Fnord", "Food") == 2
10-
assert distance("Müll", "Mull") == 1
9+
assert distance("Fnord", "Food") == 2 / 5
10+
assert distance("Müll", "Mull") == 1 / 4
1111

1212
word1 = unicodedata.normalize("NFC", "Schlyñ")
1313
word2 = unicodedata.normalize("NFD", "Schlyñ") # Different, decomposed!
@@ -21,4 +21,4 @@ def test_distance():
2121
assert (
2222
len(word2) == 7
2323
) # This, OTOH, ends with LATIN SMALL LETTER M + COMBINING TILDE, 7 code points
24-
assert distance(word1, word2) == 1
24+
assert distance(word1, word2) == 1 / 6

src/dinglehopper/tests/test_integ_character_error_rate_ocr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,4 +56,4 @@ def test_character_error_rate_between_page_alto_2():
5656
)
5757
)
5858

59-
assert character_error_rate(gt, ocr) == 8 / 591 # Manually verified
59+
assert character_error_rate(gt, ocr) == 8 / 594 # Manually verified

src/dinglehopper/tests/test_integ_cli_valid_json.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,11 @@ def test_cli_json_cer_is_infinity(tmp_path):
3232

3333
with working_directory(tmp_path):
3434
with open("gt.txt", "w") as gtf:
35-
gtf.write("") # Empty to yield CER == inf
35+
gtf.write("")
3636
with open("ocr.txt", "w") as ocrf:
3737
ocrf.write("Not important")
3838

3939
process("gt.txt", "ocr.txt", "report")
4040
with open("report.json", "r") as jsonf:
4141
j = json.load(jsonf)
42-
assert j["cer"] == pytest.approx(float("inf"))
42+
assert j["cer"] == pytest.approx(1.0)

src/dinglehopper/tests/test_integ_edit_distance_ocr.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def test_distance_between_page_files():
1717
# → 2 differences
1818
gt = page_text(ET.parse(os.path.join(data_dir, "test-gt.page2018.xml")))
1919
ocr = page_text(ET.parse(os.path.join(data_dir, "test-fake-ocr.page2018.xml")))
20-
assert distance(gt, ocr) == 2
20+
assert distance(gt, ocr) == 2 / 827
2121

2222

2323
@pytest.mark.integration
@@ -52,4 +52,4 @@ def test_distance_between_page_alto_2():
5252
)
5353
)
5454

55-
assert distance(gt, ocr) == 8 # Manually verified
55+
assert distance(gt, ocr) == 8 / 594 # Manually verified

src/dinglehopper/tests/test_integ_empty_files.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
@pytest.mark.parametrize(
1313
"gt_file_content,ocr_file_content,cer_expected",
1414
[
15-
("", "Lorem ipsum", math.inf),
15+
("", "Lorem ipsum", 1.0),
1616
("Lorem ipsum", "", 1.0),
17-
("\ufeff", "Lorem ipsum", math.inf),
17+
("\ufeff", "Lorem ipsum", 1.0),
1818
("Lorem ipsum", "\ufeff", 1.0),
1919
("", "", 0.0),
2020
("\ufeff", "", 0.0),

src/dinglehopper/tests/test_integ_word_error_rate_ocr.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,5 +64,5 @@ def test_word_error_rate_between_page_alto_2():
6464
)
6565

6666
assert (
67-
word_error_rate(gt, ocr) == 7 / gt_word_count
67+
word_error_rate(gt, ocr) == 7 / (gt_word_count + 1)
6868
) # Manually verified, 6 words are wrong, 1 got split (=2 errors)

src/dinglehopper/tests/test_word_error_rate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def test_word_error_rate():
7676
)
7777

7878
assert word_error_rate("Dies ist ein Beispielsatz!", "") == 4 / 4
79-
assert math.isinf(word_error_rate("", "Dies ist ein Beispielsatz!"))
79+
assert word_error_rate("", "Dies ist ein Beispielsatz!") == 4 / 4
8080
assert word_error_rate("", "") == 0
8181

8282
assert (

0 commit comments

Comments (0)