Skip to content

Commit c3aa48e

Browse files
committed
2 parents 628594e + d7814db commit c3aa48e

26 files changed

+367
-33
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ dmypy.json
2525

2626
# User-specific stuff
2727
.idea
28+
.*.swp
2829

2930
# Build artifacts
3031
/build

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,9 +112,13 @@ You also may want to compare a directory of GT text files (i.e. `gt/line0001.gt.
112112
with a directory of OCR text files (i.e. `ocr/line0001.some-ocr.txt`) with a separate
113113
CLI interface:
114114

115-
~~~
115+
```
116116
dinglehopper-line-dirs gt/ ocr/
117-
~~~
117+
```
118+
119+
The CLI `dinglehopper-line-dirs` can also work with GT text files in the same
120+
directories as the OCR text files. You should read `dinglehopper-line-dirs --help`
121+
in this case.
118122

119123
### dinglehopper-extract
120124
The tool `dinglehopper-extract` extracts the text of the given input file on

src/dinglehopper/cli.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,15 +114,20 @@ def process(
114114
metrics: bool = True,
115115
differences: bool = False,
116116
textequiv_level: str = "region",
117+
plain_encoding: str = "autodetect",
117118
) -> None:
118119
"""Check OCR result against GT.
119120
120121
The @click decorators change the signature of the decorated functions, so we keep
121122
this undecorated version and use Click on a wrapper.
122123
"""
123124

124-
gt_text = extract(gt, textequiv_level=textequiv_level)
125-
ocr_text = extract(ocr, textequiv_level=textequiv_level)
125+
gt_text = extract(
126+
gt, textequiv_level=textequiv_level, plain_encoding=plain_encoding
127+
)
128+
ocr_text = extract(
129+
ocr, textequiv_level=textequiv_level, plain_encoding=plain_encoding
130+
)
126131
gt_words: List[str] = list(words_normalized(gt_text))
127132
ocr_words: List[str] = list(words_normalized(ocr_text))
128133

@@ -195,6 +200,7 @@ def process_dir(
195200
metrics: bool = True,
196201
differences: bool = False,
197202
textequiv_level: str = "region",
203+
plain_encoding: str = "autodetect",
198204
) -> None:
199205
for gt_file in os.listdir(gt):
200206
gt_file_path = os.path.join(gt, gt_file)
@@ -209,6 +215,7 @@ def process_dir(
209215
metrics=metrics,
210216
differences=differences,
211217
textequiv_level=textequiv_level,
218+
plain_encoding=plain_encoding,
212219
)
213220
else:
214221
print("Skipping {0} and {1}".format(gt_file_path, ocr_file_path))
@@ -233,6 +240,11 @@ def process_dir(
233240
help="PAGE TextEquiv level to extract text from",
234241
metavar="LEVEL",
235242
)
243+
@click.option(
244+
"--plain-encoding",
245+
default="autodetect",
246+
help='Encoding (e.g. "utf-8") of plain text files',
247+
)
236248
@click.option("--progress", default=False, is_flag=True, help="Show progress bar")
237249
@click.version_option()
238250
def main(
@@ -243,6 +255,7 @@ def main(
243255
metrics,
244256
differences,
245257
textequiv_level,
258+
plain_encoding,
246259
progress,
247260
):
248261
"""
@@ -280,6 +293,7 @@ def main(
280293
metrics=metrics,
281294
differences=differences,
282295
textequiv_level=textequiv_level,
296+
plain_encoding=plain_encoding,
283297
)
284298
else:
285299
process(
@@ -290,6 +304,7 @@ def main(
290304
metrics=metrics,
291305
differences=differences,
292306
textequiv_level=textequiv_level,
307+
plain_encoding=plain_encoding,
293308
)
294309

295310

src/dinglehopper/cli_extract.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212
help="PAGE TextEquiv level to extract text from",
1313
metavar="LEVEL",
1414
)
15-
def main(input_file, textequiv_level):
15+
@click.option(
16+
"--plain-encoding",
17+
default="autodetect",
18+
help='Encoding (e.g. "utf-8") of plain text files',
19+
)
20+
def main(input_file, textequiv_level, plain_encoding):
1621
"""
1722
Extract the text of the given INPUT_FILE.
1823
@@ -23,7 +28,9 @@ def main(input_file, textequiv_level):
2328
use "--textequiv-level line" to extract from the level of TextLine tags.
2429
"""
2530
initLogging()
26-
input_text = extract(input_file, textequiv_level=textequiv_level).text
31+
input_text = extract(
32+
input_file, textequiv_level=textequiv_level, plain_encoding=plain_encoding
33+
).text
2734
print(input_text)
2835

2936

src/dinglehopper/cli_line_dirs.py

Lines changed: 132 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import itertools
22
import os
3+
from typing import Callable, Iterator, List, Optional, Tuple
34

45
import click
56
from jinja2 import Environment, FileSystemLoader
@@ -12,6 +13,41 @@
1213
from .word_error_rate import word_error_rate_n, words_normalized
1314

1415

16+
def removesuffix(text, suffix):
17+
"""
18+
Remove suffix from text.
19+
20+
Can be replaced with str.removesuffix when we only support Python >= 3.9.
21+
"""
22+
if suffix and text.endswith(suffix):
23+
return text[: -len(suffix)]
24+
return text
25+
26+
27+
def is_hidden(filepath):
28+
filename = os.path.basename(os.path.abspath(filepath))
29+
return filename.startswith(".")
30+
31+
32+
def find_all_files(
33+
dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
34+
) -> Iterator[str]:
35+
"""
36+
Find all files in dir_, returning filenames
37+
38+
If pred is given, pred(filename) must be True for the filename.
39+
40+
Does not return hidden files by default.
41+
"""
42+
for root, _, filenames in os.walk(dir_):
43+
for fn in filenames:
44+
if not return_hidden and is_hidden(fn):
45+
continue
46+
if pred and not pred(fn):
47+
continue
48+
yield os.path.join(root, fn)
49+
50+
1551
def all_equal(iterable):
1652
g = itertools.groupby(iterable)
1753
return next(g, True) and not next(g, False)
@@ -25,15 +61,63 @@ def common_suffix(its):
2561
return reversed(common_prefix(reversed(it) for it in its))
2662

2763

28-
def removesuffix(text, suffix):
29-
if suffix and text.endswith(suffix):
30-
return text[: -len(suffix)]
31-
return text
64+
def find_gt_and_ocr_files(
65+
gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
66+
) -> Iterator[Tuple[str, str]]:
67+
"""
68+
Find GT files and matching OCR files.
69+
70+
Returns pairs of GT and OCR files.
71+
"""
72+
for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
73+
ocr_fn = os.path.join(
74+
ocr_dir,
75+
removesuffix(os.path.relpath(gt_fn, start=gt_dir), gt_suffix) + ocr_suffix,
76+
)
77+
if not os.path.exists(ocr_fn):
78+
raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")
79+
80+
yield gt_fn, ocr_fn
81+
82+
83+
def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
84+
"""
85+
Find GT files and matching OCR files, autodetect suffixes.
86+
87+
This only works if gt_dir (or respectively ocr_dir) only contains GT (OCR)
88+
files with a common suffix. Currently the files must have a suffix, e.g.
89+
".gt.txt" (or ".ocr.txt" for OCR files).
90+
91+
Returns pairs of GT and OCR files.
92+
"""
93+
94+
# Autodetect suffixes
95+
gt_files = find_all_files(gt_dir)
96+
gt_suffix = "".join(common_suffix(gt_files))
97+
if len(gt_suffix) == 0:
98+
raise RuntimeError(
99+
f"Files in GT directory {gt_dir} do not have a common suffix"
100+
)
101+
ocr_files = find_all_files(ocr_dir)
102+
ocr_suffix = "".join(common_suffix(ocr_files))
103+
if len(ocr_suffix) == 0:
104+
raise RuntimeError(
105+
f"Files in OCR directory {ocr_dir} do not have a common suffix"
106+
)
107+
108+
yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
32109

33110

34-
def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
35-
gt_suffix = "".join(common_suffix(os.listdir(gt_dir)))
36-
ocr_suffix = "".join(common_suffix(os.listdir(ocr_dir)))
111+
def process(
112+
gt_dir,
113+
ocr_dir,
114+
report_prefix,
115+
*,
116+
metrics=True,
117+
gt_suffix=None,
118+
ocr_suffix=None,
119+
plain_encoding="autodetect",
120+
):
37121

38122
cer = None
39123
n_characters = None
@@ -42,16 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
42126
n_words = None
43127
word_diff_report = ""
44128

45-
for k, gt in enumerate(os.listdir(gt_dir)):
46-
# Find a match by replacing the suffix
47-
ocr = removesuffix(gt, gt_suffix) + ocr_suffix
129+
if gt_suffix is not None and ocr_suffix is not None:
130+
gt_ocr_files = find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
131+
else:
132+
gt_ocr_files = find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir)
48133

49-
gt_text = plain_extract(os.path.join(gt_dir, gt), include_filename_in_id=True)
134+
for k, (gt_fn, ocr_fn) in enumerate(gt_ocr_files):
135+
gt_text = plain_extract(
136+
gt_fn, include_filename_in_id=True, encoding=plain_encoding
137+
)
50138
ocr_text = plain_extract(
51-
os.path.join(ocr_dir, ocr), include_filename_in_id=True
139+
ocr_fn, include_filename_in_id=True, encoding=plain_encoding
52140
)
53-
gt_words = words_normalized(gt_text)
54-
ocr_words = words_normalized(ocr_text)
141+
gt_words: List[str] = list(words_normalized(gt_text))
142+
ocr_words: List[str] = list(words_normalized(ocr_text))
55143

56144
# Compute CER
57145
l_cer, l_n_characters = character_error_rate_n(gt_text, ocr_text)
@@ -81,15 +169,15 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
81169
joiner="",
82170
none="·",
83171
score_hint=score_hint(l_cer, l_n_characters),
84-
)
172+
)[0]
85173
word_diff_report += gen_diff_report(
86174
gt_words,
87175
ocr_words,
88176
css_prefix="l{0}-w".format(k),
89177
joiner=" ",
90178
none="⋯",
91179
score_hint=score_hint(l_wer, l_n_words),
92-
)
180+
)[0]
93181

94182
env = Environment(
95183
loader=FileSystemLoader(
@@ -123,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
123211
@click.option(
124212
"--metrics/--no-metrics", default=True, help="Enable/disable metrics and green/red"
125213
)
126-
def main(gt, ocr, report_prefix, metrics):
214+
@click.option("--gt-suffix", help="Suffix of GT line text files")
215+
@click.option("--ocr-suffix", help="Suffix of OCR line text files")
216+
@click.option(
217+
"--plain-encoding",
218+
default="autodetect",
219+
help='Encoding (e.g. "utf-8") of plain text files',
220+
)
221+
def main(gt, ocr, report_prefix, metrics, gt_suffix, ocr_suffix, plain_encoding):
127222
"""
128223
Compare the GT line text directory against the OCR line text directory.
129224
130225
This assumes that the GT line text directory contains textfiles with a common
131226
suffix like ".gt.txt", and the OCR line text directory contains textfiles with
132227
a common suffix like ".some-ocr.txt". The text files also need to be paired,
133-
i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
134-
in the OCT lines directory.
228+
i.e. the GT filename "line001.gt.txt" needs to match a filename
229+
"line001.some-ocr.txt" in the OCR lines directory.
230+
231+
GT and OCR directories may contain line text files in matching subdirectories,
232+
e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
135233
136-
The GT and OCR directories are usually round truth line texts and the results of
234+
GT and OCR directories can also be the same directory, but in this case you need
235+
to give --gt-suffix and --ocr-suffix explicitly.
236+
237+
The GT and OCR directories are usually ground truth line texts and the results of
137238
an OCR software, but you may use dinglehopper to compare two OCR results. In
138239
that case, use --no-metrics to disable the then meaningless metrics and also
139240
change the color scheme from green/red to blue.
@@ -142,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics):
142243
$REPORT_PREFIX defaults to "report". The reports include the character error
143244
rate (CER) and the word error rate (WER).
144245
246+
It is recommended to specify the encoding of the text files, for example with
247+
--plain-encoding utf-8. If this option is not given, we try to auto-detect it.
145248
"""
146249
initLogging()
147-
process(gt, ocr, report_prefix, metrics=metrics)
250+
process(
251+
gt,
252+
ocr,
253+
report_prefix,
254+
metrics=metrics,
255+
gt_suffix=gt_suffix,
256+
ocr_suffix=ocr_suffix,
257+
plain_encoding=plain_encoding,
258+
)
148259

149260

150261
if __name__ == "__main__":

0 commit comments

Comments
 (0)