11import itertools
22import os
3+ from typing import Callable , Iterator , List , Optional , Tuple
34
45import click
56from jinja2 import Environment , FileSystemLoader
1213from .word_error_rate import word_error_rate_n , words_normalized
1314
1415
def removesuffix(text, suffix):
    """
    Strip a trailing suffix from text, if it is there.

    An empty suffix, or a suffix that text does not end with, leaves text
    unchanged. This is a stand-in for str.removesuffix, which can be used
    directly once only Python >= 3.9 is supported.
    """
    if not suffix or not text.endswith(suffix):
        return text
    return text[: len(text) - len(suffix)]
25+
26+
def is_hidden(filepath):
    """
    Tell whether filepath names a hidden file.

    The path is made absolute first; it counts as hidden when its last path
    component starts with a dot.
    """
    _, name = os.path.split(os.path.abspath(filepath))
    return name.startswith(".")
30+
31+
def find_all_files(
    dir_: str, pred: Optional[Callable[[str], bool]] = None, return_hidden: bool = False
) -> Iterator[str]:
    """
    Recursively walk dir_ and yield the path of every file found.

    Each yielded path is the walked directory joined with the file name.

    If pred is given, only files for which pred(name) is true are yielded;
    pred receives the bare file name, not the joined path.

    Hidden files (see is_hidden) are skipped unless return_hidden is True.
    """
    for dirpath, _subdirs, names in os.walk(dir_):
        for name in names:
            # Same evaluation order as two separate guards: the hidden check
            # comes first, and pred is only consulted for files that pass it.
            if (return_hidden or not is_hidden(name)) and (not pred or pred(name)):
                yield os.path.join(dirpath, name)
49+
50+
def all_equal(iterable):
    """
    Return True if all items of iterable are equal to each other.

    An empty iterable counts as all-equal.
    """
    runs = itertools.groupby(iterable)
    # Consume the first run of equal items, if there is one ...
    next(runs, None)
    # ... the items are all equal exactly when no second run follows.
    return next(runs, None) is None
@@ -25,15 +61,63 @@ def common_suffix(its):
2561 return reversed (common_prefix (reversed (it ) for it in its ))
2662
2763
28- def removesuffix (text , suffix ):
29- if suffix and text .endswith (suffix ):
30- return text [: - len (suffix )]
31- return text
def find_gt_and_ocr_files(
    gt_dir: str, gt_suffix: str, ocr_dir: str, ocr_suffix: str
) -> Iterator[Tuple[str, str]]:
    """
    Find GT files and matching OCR files.

    Every file below gt_dir whose name ends with gt_suffix is taken as a GT
    file. Its OCR counterpart is expected at the same relative path below
    ocr_dir, with gt_suffix replaced by ocr_suffix.

    Yields pairs (gt_fn, ocr_fn) of GT and OCR file paths.

    Raises RuntimeError if the OCR file expected for a GT file does not exist.
    As this is a generator, the error is raised while iterating, when the
    offending GT file is reached.
    """
    for gt_fn in find_all_files(gt_dir, lambda fn: fn.endswith(gt_suffix)):
        # Path of the GT file relative to gt_dir, so that files in
        # subdirectories are looked up in the same subdirectory of ocr_dir.
        rel_gt_fn = os.path.relpath(gt_fn, start=gt_dir)
        ocr_fn = os.path.join(ocr_dir, removesuffix(rel_gt_fn, gt_suffix) + ocr_suffix)
        if not os.path.exists(ocr_fn):
            raise RuntimeError(f"{ocr_fn} (matching {gt_fn}) does not exist")

        yield gt_fn, ocr_fn
81+
82+
def find_gt_and_ocr_files_autodetect(gt_dir, ocr_dir):
    """
    Find GT files and matching OCR files, detecting the suffixes automatically.

    This only works if gt_dir (or respectively ocr_dir) only contains GT (OCR)
    files with a common suffix. Currently the files must have a suffix, e.g.
    ".gt.txt" (e.g. ".ocr.txt").

    The suffix of each directory is the common suffix of all file paths found
    in it by find_all_files; a RuntimeError is raised if that common suffix is
    empty. As this is a generator, detection and errors happen on iteration.

    Yields pairs of GT and OCR files.
    """

    # Autodetect the GT suffix first ...
    gt_suffix = "".join(common_suffix(find_all_files(gt_dir)))
    if not gt_suffix:
        raise RuntimeError(
            f"Files in GT directory {gt_dir} do not have a common suffix"
        )

    # ... then the OCR suffix, in the same way.
    ocr_suffix = "".join(common_suffix(find_all_files(ocr_dir)))
    if not ocr_suffix:
        raise RuntimeError(
            f"Files in OCR directory {ocr_dir} do not have a common suffix"
        )

    yield from find_gt_and_ocr_files(gt_dir, gt_suffix, ocr_dir, ocr_suffix)
32109
33110
34- def process (gt_dir , ocr_dir , report_prefix , * , metrics = True ):
35- gt_suffix = "" .join (common_suffix (os .listdir (gt_dir )))
36- ocr_suffix = "" .join (common_suffix (os .listdir (ocr_dir )))
111+ def process (
112+ gt_dir ,
113+ ocr_dir ,
114+ report_prefix ,
115+ * ,
116+ metrics = True ,
117+ gt_suffix = None ,
118+ ocr_suffix = None ,
119+ plain_encoding = "autodetect" ,
120+ ):
37121
38122 cer = None
39123 n_characters = None
@@ -42,16 +126,20 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
42126 n_words = None
43127 word_diff_report = ""
44128
45- for k , gt in enumerate (os .listdir (gt_dir )):
46- # Find a match by replacing the suffix
47- ocr = removesuffix (gt , gt_suffix ) + ocr_suffix
129+ if gt_suffix is not None and ocr_suffix is not None :
130+ gt_ocr_files = find_gt_and_ocr_files (gt_dir , gt_suffix , ocr_dir , ocr_suffix )
131+ else :
132+ gt_ocr_files = find_gt_and_ocr_files_autodetect (gt_dir , ocr_dir )
48133
49- gt_text = plain_extract (os .path .join (gt_dir , gt ), include_filename_in_id = True )
134+ for k , (gt_fn , ocr_fn ) in enumerate (gt_ocr_files ):
135+ gt_text = plain_extract (
136+ gt_fn , include_filename_in_id = True , encoding = plain_encoding
137+ )
50138 ocr_text = plain_extract (
51- os . path . join ( ocr_dir , ocr ), include_filename_in_id = True
139+ ocr_fn , include_filename_in_id = True , encoding = plain_encoding
52140 )
53- gt_words = words_normalized (gt_text )
54- ocr_words = words_normalized (ocr_text )
141+ gt_words : List [ str ] = list ( words_normalized (gt_text ) )
142+ ocr_words : List [ str ] = list ( words_normalized (ocr_text ) )
55143
56144 # Compute CER
57145 l_cer , l_n_characters = character_error_rate_n (gt_text , ocr_text )
@@ -81,15 +169,15 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
81169 joiner = "" ,
82170 none = "·" ,
83171 score_hint = score_hint (l_cer , l_n_characters ),
84- )
172+ )[ 0 ]
85173 word_diff_report += gen_diff_report (
86174 gt_words ,
87175 ocr_words ,
88176 css_prefix = "l{0}-w" .format (k ),
89177 joiner = " " ,
90178 none = "⋯" ,
91179 score_hint = score_hint (l_wer , l_n_words ),
92- )
180+ )[ 0 ]
93181
94182 env = Environment (
95183 loader = FileSystemLoader (
@@ -123,17 +211,30 @@ def process(gt_dir, ocr_dir, report_prefix, *, metrics=True):
123211@click .option (
124212 "--metrics/--no-metrics" , default = True , help = "Enable/disable metrics and green/red"
125213)
126- def main (gt , ocr , report_prefix , metrics ):
214+ @click .option ("--gt-suffix" , help = "Suffix of GT line text files" )
215+ @click .option ("--ocr-suffix" , help = "Suffix of OCR line text files" )
216+ @click .option (
217+ "--plain-encoding" ,
218+ default = "autodetect" ,
219+ help = 'Encoding (e.g. "utf-8") of plain text files' ,
220+ )
221+ def main (gt , ocr , report_prefix , metrics , gt_suffix , ocr_suffix , plain_encoding ):
127222 """
128223 Compare the GT line text directory against the OCR line text directory.
129224
130225 This assumes that the GT line text directory contains textfiles with a common
131226 suffix like ".gt.txt", and the OCR line text directory contains textfiles with
132227 a common suffix like ".some-ocr.txt". The text files also need to be paired,
133- i.e. the GT file "line001.gt.txt" needs to match a file "line001.some-ocr.txt"
134- in the OCT lines directory.
228+ i.e. the GT filename "line001.gt.txt" needs to match a filename
229+ "line001.some-ocr.txt" in the OCR lines directory.
230+
231+ GT and OCR directories may contain line text files in matching subdirectories,
232+ e.g. "GT/goethe_faust/line1.gt.txt" and "OCR/goethe_faust/line1.pred.txt".
135233
136- The GT and OCR directories are usually round truth line texts and the results of
234+ GT and OCR directories can also be the same directory, but in this case you need
235+ to give --gt-suffix and --ocr-suffix explicitly.
236+
237+ The GT and OCR directories are usually ground truth line texts and the results of
137238 an OCR software, but you may use dinglehopper to compare two OCR results. In
138239 that case, use --no-metrics to disable the then meaningless metrics and also
139240 change the color scheme from green/red to blue.
@@ -142,9 +243,19 @@ def main(gt, ocr, report_prefix, metrics):
142243 $REPORT_PREFIX defaults to "report". The reports include the character error
143244 rate (CER) and the word error rate (WER).
144245
246+ It is recommended to specify the encoding of the text files, for example with
247+ --plain-encoding utf-8. If this option is not given, we try to auto-detect it.
145248 """
146249 initLogging ()
147- process (gt , ocr , report_prefix , metrics = metrics )
250+ process (
251+ gt ,
252+ ocr ,
253+ report_prefix ,
254+ metrics = metrics ,
255+ gt_suffix = gt_suffix ,
256+ ocr_suffix = ocr_suffix ,
257+ plain_encoding = plain_encoding ,
258+ )
148259
149260
150261if __name__ == "__main__" :
0 commit comments