Skip to content

Commit eb4b247

Browse files
committed
Port to OCR-D/core API v3
1 parent: 071e6a8 · commit: eb4b247

File tree

5 files changed, with 87 additions and 76 deletions.

.dockerignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
src/dinglehopper/tests
2+
dist
3+
build
4+
*.egg-info
5+
.git

Dockerfile

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,30 @@ LABEL \
66
maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
77
org.label-schema.vcs-ref=$VCS_REF \
88
org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
9-
org.label-schema.build-date=$BUILD_DATE
9+
org.label-schema.build-date=$BUILD_DATE \
10+
org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \
11+
org.opencontainers.image.title="dinglehopper" \
12+
org.opencontainers.image.description="The OCR evaluation tool" \
13+
org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
14+
org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
15+
org.opencontainers.image.revision=$VCS_REF \
16+
org.opencontainers.image.created=$BUILD_DATE \
17+
org.opencontainers.image.base.name=ocrd/core
18+
19+
ENV LANG=C.UTF-8
20+
ENV LC_ALL=C.UTF-8
21+
22+
# avoid HOME/.local/share (hard to predict USER here)
23+
# so let XDG_DATA_HOME coincide with fixed system location
24+
# (can still be overridden by derived stages)
25+
ENV XDG_DATA_HOME /usr/local/share
26+
# avoid the need for an extra volume for persistent resource user db
27+
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
28+
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
1029

1130
WORKDIR /build/dinglehopper
12-
COPY pyproject.toml .
13-
COPY src/dinglehopper/ocrd-tool.json .
14-
COPY src ./src
15-
COPY requirements.txt .
16-
COPY README.md .
17-
COPY Makefile .
18-
RUN make install
19-
RUN rm -rf /build/dinglehopper
31+
COPY . .
32+
RUN make install && rm -rf /build/dinglehopper
2033

2134
WORKDIR /data
22-
VOLUME ["/data"]
35+
VOLUME /data

Makefile

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
PYTHON = python3
22
PIP = pip3
33
PYTHONIOENCODING=utf8
4+
PYTEST_ARGS = -vv
45

5-
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
6+
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
67
DOCKER_TAG = ocrd/dinglehopper
78

89
help:
@@ -16,6 +17,12 @@ help:
1617
install:
1718
$(PIP) install .
1819

20+
install-dev:
21+
$(PIP) install -e .
22+
23+
test:
24+
pytest $(PYTEST_ARGS)
25+
1926
docker:
2027
docker build \
2128
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \

src/dinglehopper/ocrd-tool.json

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
11
{
22
"version": "0.9.7",
33
"git_url": "https://github.com/qurator-spk/dinglehopper",
4+
"dockerhub": "ocrd/dinglehopper",
45
"tools": {
56
"ocrd-dinglehopper": {
67
"executable": "ocrd-dinglehopper",
8+
"input_file_grp_cardinality": 2,
9+
"output_file_grp_cardinality": 1,
710
"description": "Evaluate OCR text against ground truth with dinglehopper",
8-
"input_file_grp": [
9-
"OCR-D-GT-PAGE",
10-
"OCR-D-OCR"
11-
],
12-
"output_file_grp": [
13-
"OCR-D-OCR-EVAL"
14-
],
1511
"categories": [
1612
"Quality assurance"
1713
],

src/dinglehopper/ocrd_cli.py

Lines changed: 48 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,73 @@
1-
import json
1+
from functools import cached_property
22
import os
3+
from typing import Optional
34

45
import click
5-
import importlib_resources
6+
from ocrd_models import OcrdFileType
67
from ocrd import Processor
78
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
8-
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
9+
from ocrd_utils import make_file_id
910

1011
from .cli import process as cli_process
1112

12-
OCRD_TOOL = json.loads(
13-
importlib_resources.files(__name__)
14-
.joinpath("ocrd-tool.json")
15-
.read_text(encoding="utf-8", errors="strict")
16-
)
17-
18-
1913
@click.command()
2014
@ocrd_cli_options
2115
def ocrd_dinglehopper(*args, **kwargs):
2216
return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
2317

24-
2518
class OcrdDinglehopperEvaluate(Processor):
26-
def __init__(self, *args, **kwargs):
27-
kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
28-
kwargs["version"] = OCRD_TOOL["version"]
29-
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
3019

31-
def process(self):
32-
assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
33-
assert_file_grp_cardinality(self.output_file_grp, 1)
20+
@cached_property
21+
def executable(self):
22+
return 'ocrd-dinglehopper'
3423

35-
log = getLogger("processor.OcrdDinglehopperEvaluate")
24+
def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
3625

26+
assert self.parameter
3727
metrics = self.parameter["metrics"]
3828
textequiv_level = self.parameter["textequiv_level"]
39-
gt_grp, ocr_grp = self.input_file_grp.split(",")
40-
41-
input_file_tuples = self.zip_input_files(on_error="abort")
42-
for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
43-
if not gt_file or not ocr_file:
44-
# file/page was not found in this group
45-
continue
46-
gt_file = self.workspace.download_file(gt_file)
47-
ocr_file = self.workspace.download_file(ocr_file)
48-
page_id = gt_file.pageId
4929

50-
log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
51-
52-
file_id = make_file_id(ocr_file, self.output_file_grp)
53-
report_prefix = os.path.join(self.output_file_grp, file_id)
54-
55-
# Process the files
56-
try:
57-
os.mkdir(self.output_file_grp)
58-
except FileExistsError:
59-
pass
60-
cli_process(
61-
gt_file.local_filename,
62-
ocr_file.local_filename,
63-
report_prefix,
64-
metrics=metrics,
65-
textequiv_level=textequiv_level,
30+
try:
31+
gt_file, ocr_file = input_files
32+
assert gt_file, 'missing GT file'
33+
assert ocr_file, 'missing OCR file'
34+
assert gt_file.local_filename
35+
assert ocr_file.local_filename
36+
except (ValueError, AssertionError) as err:
37+
self.logger.warning(f'Missing either GT file, OCR file or both: {err}') # TODO how to log which page?
38+
return
39+
40+
page_id = gt_file.pageId
41+
42+
file_id = make_file_id(ocr_file, self.output_file_grp)
43+
report_prefix = os.path.join(self.output_file_grp, file_id)
44+
45+
# Process the files
46+
try:
47+
os.mkdir(self.output_file_grp)
48+
except FileExistsError:
49+
pass
50+
cli_process(
51+
gt_file.local_filename,
52+
ocr_file.local_filename,
53+
report_prefix,
54+
metrics=metrics,
55+
textequiv_level=textequiv_level,
56+
)
57+
58+
# Add reports to the workspace
59+
for report_suffix, mimetype in [
60+
[".html", "text/html"],
61+
[".json", "application/json"],
62+
]:
63+
self.workspace.add_file(
64+
file_id=file_id + report_suffix,
65+
file_grp=self.output_file_grp,
66+
page_id=page_id,
67+
mimetype=mimetype,
68+
local_filename=report_prefix + report_suffix,
6669
)
6770

68-
# Add reports to the workspace
69-
for report_suffix, mimetype in [
70-
[".html", "text/html"],
71-
[".json", "application/json"],
72-
]:
73-
self.workspace.add_file(
74-
file_id=file_id + report_suffix,
75-
file_grp=self.output_file_grp,
76-
page_id=page_id,
77-
mimetype=mimetype,
78-
local_filename=report_prefix + report_suffix,
79-
)
80-
8171

8272
if __name__ == "__main__":
8373
ocrd_dinglehopper()

0 commit comments

Comments (0)