|
1 | | -import json |
| 1 | +from functools import cached_property |
2 | 2 | import os |
| 3 | +from typing import Optional |
3 | 4 |
|
4 | 5 | import click |
5 | | -import importlib_resources |
| 6 | +from ocrd_models import OcrdFileType |
6 | 7 | from ocrd import Processor |
7 | 8 | from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor |
8 | | -from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id |
| 9 | +from ocrd_utils import make_file_id |
9 | 10 |
|
10 | 11 | from .cli import process as cli_process |
11 | 12 |
|
12 | | -OCRD_TOOL = json.loads( |
13 | | - importlib_resources.files(__name__) |
14 | | - .joinpath("ocrd-tool.json") |
15 | | - .read_text(encoding="utf-8", errors="strict") |
16 | | -) |
17 | | - |
18 | | - |
19 | 13 | @click.command() |
20 | 14 | @ocrd_cli_options |
21 | 15 | def ocrd_dinglehopper(*args, **kwargs): |
22 | 16 | return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs) |
23 | 17 |
|
24 | | - |
25 | 18 | class OcrdDinglehopperEvaluate(Processor): |
26 | | - def __init__(self, *args, **kwargs): |
27 | | - kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"] |
28 | | - kwargs["version"] = OCRD_TOOL["version"] |
29 | | - super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs) |
30 | 19 |
|
31 | | - def process(self): |
32 | | - assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR") |
33 | | - assert_file_grp_cardinality(self.output_file_grp, 1) |
| 20 | + @cached_property |
| 21 | + def executable(self): |
| 22 | + return 'ocrd-dinglehopper' |
34 | 23 |
|
35 | | - log = getLogger("processor.OcrdDinglehopperEvaluate") |
| 24 | + def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None: |
36 | 25 |
|
| 26 | + assert self.parameter |
37 | 27 | metrics = self.parameter["metrics"] |
38 | 28 | textequiv_level = self.parameter["textequiv_level"] |
39 | | - gt_grp, ocr_grp = self.input_file_grp.split(",") |
40 | | - |
41 | | - input_file_tuples = self.zip_input_files(on_error="abort") |
42 | | - for n, (gt_file, ocr_file) in enumerate(input_file_tuples): |
43 | | - if not gt_file or not ocr_file: |
44 | | - # file/page was not found in this group |
45 | | - continue |
46 | | - gt_file = self.workspace.download_file(gt_file) |
47 | | - ocr_file = self.workspace.download_file(ocr_file) |
48 | | - page_id = gt_file.pageId |
49 | 29 |
|
50 | | - log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file) |
51 | | - |
52 | | - file_id = make_file_id(ocr_file, self.output_file_grp) |
53 | | - report_prefix = os.path.join(self.output_file_grp, file_id) |
54 | | - |
55 | | - # Process the files |
56 | | - try: |
57 | | - os.mkdir(self.output_file_grp) |
58 | | - except FileExistsError: |
59 | | - pass |
60 | | - cli_process( |
61 | | - gt_file.local_filename, |
62 | | - ocr_file.local_filename, |
63 | | - report_prefix, |
64 | | - metrics=metrics, |
65 | | - textequiv_level=textequiv_level, |
| 30 | + # wrong number of inputs: let fail |
| 31 | + gt_file, ocr_file = input_files |
| 32 | + # missing on either side: skip (zip_input_files already warned) |
| 33 | + if not gt_file or not ocr_file: |
| 34 | + return |
| 35 | + # missing download (i.e. OCRD_DOWNLOAD_INPUT=false): |
| 36 | + if not gt_file.local_filename: |
| 37 | + if config.OCRD_MISSING_INPUT == 'ABORT': |
| 38 | + raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype) |
| 39 | + return |
| 40 | + if not ocr_file.local_filename: |
| 41 | + if config.OCRD_MISSING_INPUT == 'ABORT': |
| 42 | + raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype) |
| 43 | + return |
| 44 | + |
| 45 | + page_id = gt_file.pageId |
| 46 | + |
| 47 | + file_id = make_file_id(ocr_file, self.output_file_grp) |
| 48 | + cli_process( |
| 49 | + gt_file.local_filename, |
| 50 | + ocr_file.local_filename, |
| 51 | + file_id, |
| 52 | + self.output_file_grp, |
| 53 | + metrics=metrics, |
| 54 | + textequiv_level=textequiv_level, |
| 55 | + ) |
| 56 | + |
| 57 | + # Add reports to the workspace |
| 58 | + for report_suffix, mimetype in [ |
| 59 | + [".html", "text/html"], |
| 60 | + [".json", "application/json"], |
| 61 | + ]: |
| 62 | + output_file_id = file_id + report_suffix |
| 63 | + output_file = next(self.workspace.mets.find_files(ID=output_file_id), None) |
| 64 | + if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE': |
| 65 | + raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set") |
| 66 | + self.workspace.add_file( |
| 67 | + file_id=output_file_id, |
| 68 | + file_grp=self.output_file_grp, |
| 69 | + page_id=page_id, |
| 70 | + mimetype=mimetype, |
| 71 | + local_filename=file_id + report_suffix, |
66 | 72 | ) |
67 | 73 |
|
68 | | - # Add reports to the workspace |
69 | | - for report_suffix, mimetype in [ |
70 | | - [".html", "text/html"], |
71 | | - [".json", "application/json"], |
72 | | - ]: |
73 | | - self.workspace.add_file( |
74 | | - file_id=file_id + report_suffix, |
75 | | - file_grp=self.output_file_grp, |
76 | | - page_id=page_id, |
77 | | - mimetype=mimetype, |
78 | | - local_filename=report_prefix + report_suffix, |
79 | | - ) |
80 | | - |
81 | 74 |
|
82 | 75 | if __name__ == "__main__": |
83 | 76 | ocrd_dinglehopper() |
0 commit comments