Skip to content

Commit b1c109b

Browse files
authored
Merge pull request #128 from kba/v3-api
V3 api
2 parents bf6633b + 13ab1ae commit b1c109b

File tree

6 files changed, with 95 additions and 78 deletions.

.dockerignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
src/dinglehopper/tests
2+
dist
3+
build
4+
*.egg-info
5+
.git

Dockerfile

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,33 @@ LABEL \
66
maintainer="https://github.com/qurator-spk/dinglehopper/issues" \
77
org.label-schema.vcs-ref=$VCS_REF \
88
org.label-schema.vcs-url="https://github.com/qurator-spk/dinglehopper" \
9-
org.label-schema.build-date=$BUILD_DATE
9+
org.label-schema.build-date=$BUILD_DATE \
10+
org.opencontainers.image.vendor="qurator" \
11+
org.opencontainers.image.title="dinglehopper" \
12+
org.opencontainers.image.description="An OCR evaluation tool" \
13+
org.opencontainers.image.source="https://github.com/qurator-spk/dinglehopper" \
14+
org.opencontainers.image.documentation="https://github.com/qurator-spk/dinglehopper/blob/${VCS_REF}/README.md" \
15+
org.opencontainers.image.revision=$VCS_REF \
16+
org.opencontainers.image.created=$BUILD_DATE \
17+
org.opencontainers.image.base.name=ocrd/core
18+
19+
ENV LANG=C.UTF-8
20+
ENV LC_ALL=C.UTF-8
21+
22+
# avoid HOME/.local/share (hard to predict USER here)
23+
# so let XDG_DATA_HOME coincide with fixed system location
24+
# (can still be overridden by derived stages)
25+
ENV XDG_DATA_HOME /usr/local/share
26+
# avoid the need for an extra volume for persistent resource user db
27+
# (i.e. XDG_CONFIG_HOME/ocrd/resources.yml)
28+
ENV XDG_CONFIG_HOME /usr/local/share/ocrd-resources
1029

1130
WORKDIR /build/dinglehopper
12-
COPY pyproject.toml .
13-
COPY src/dinglehopper/ocrd-tool.json .
14-
COPY src ./src
15-
COPY requirements.txt .
16-
COPY README.md .
17-
COPY Makefile .
18-
RUN make install
19-
RUN rm -rf /build/dinglehopper
31+
COPY . .
32+
COPY ocrd-tool.json .
33+
# prepackage ocrd-tool.json as ocrd-all-tool.json
34+
RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json
35+
RUN make install && rm -rf /build/dinglehopper
2036

2137
WORKDIR /data
22-
VOLUME ["/data"]
38+
VOLUME /data

Makefile

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
PYTHON = python3
22
PIP = pip3
33
PYTHONIOENCODING=utf8
4+
PYTEST_ARGS = -vv
45

5-
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v2.69.0
6+
DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.3.0
67
DOCKER_TAG = ocrd/dinglehopper
78

89
help:
@@ -16,11 +17,17 @@ help:
1617
install:
1718
$(PIP) install .
1819

20+
install-dev:
21+
$(PIP) install -e .
22+
23+
test:
24+
pytest $(PYTEST_ARGS)
25+
1926
docker:
2027
docker build \
2128
--build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \
2229
--build-arg VCS_REF=$$(git rev-parse --short HEAD) \
2330
--build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \
2431
-t $(DOCKER_TAG) .
2532

26-
.PHONY: help install docker
33+
.PHONY: help install install-dev test docker

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ uniseg >= 0.9.1
55
numpy
66
colorama
77
MarkupSafe
8-
ocrd >= 2.65.0
8+
ocrd >= 3.3.0
99
attrs
1010
multimethod >= 1.3
1111
tqdm

src/dinglehopper/ocrd-tool.json

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
11
{
22
"version": "0.9.7",
33
"git_url": "https://github.com/qurator-spk/dinglehopper",
4+
"dockerhub": "ocrd/dinglehopper",
45
"tools": {
56
"ocrd-dinglehopper": {
67
"executable": "ocrd-dinglehopper",
8+
"input_file_grp_cardinality": 2,
9+
"output_file_grp_cardinality": 1,
710
"description": "Evaluate OCR text against ground truth with dinglehopper",
8-
"input_file_grp": [
9-
"OCR-D-GT-PAGE",
10-
"OCR-D-OCR"
11-
],
12-
"output_file_grp": [
13-
"OCR-D-OCR-EVAL"
14-
],
1511
"categories": [
1612
"Quality assurance"
1713
],

src/dinglehopper/ocrd_cli.py

Lines changed: 51 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,83 +1,76 @@
1-
import json
1+
from functools import cached_property
22
import os
3+
from typing import Optional
34

45
import click
5-
import importlib_resources
6+
from ocrd_models import OcrdFileType
67
from ocrd import Processor
78
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
8-
from ocrd_utils import assert_file_grp_cardinality, getLogger, make_file_id
9+
from ocrd_utils import make_file_id
910

1011
from .cli import process as cli_process
1112

12-
OCRD_TOOL = json.loads(
13-
importlib_resources.files(__name__)
14-
.joinpath("ocrd-tool.json")
15-
.read_text(encoding="utf-8", errors="strict")
16-
)
17-
18-
1913
@click.command()
2014
@ocrd_cli_options
2115
def ocrd_dinglehopper(*args, **kwargs):
2216
return ocrd_cli_wrap_processor(OcrdDinglehopperEvaluate, *args, **kwargs)
2317

24-
2518
class OcrdDinglehopperEvaluate(Processor):
26-
def __init__(self, *args, **kwargs):
27-
kwargs["ocrd_tool"] = OCRD_TOOL["tools"]["ocrd-dinglehopper"]
28-
kwargs["version"] = OCRD_TOOL["version"]
29-
super(OcrdDinglehopperEvaluate, self).__init__(*args, **kwargs)
3019

31-
def process(self):
32-
assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
33-
assert_file_grp_cardinality(self.output_file_grp, 1)
20+
@cached_property
21+
def executable(self):
22+
return 'ocrd-dinglehopper'
3423

35-
log = getLogger("processor.OcrdDinglehopperEvaluate")
24+
def process_page_file(self, *input_files: Optional[OcrdFileType]) -> None:
3625

26+
assert self.parameter
3727
metrics = self.parameter["metrics"]
3828
textequiv_level = self.parameter["textequiv_level"]
39-
gt_grp, ocr_grp = self.input_file_grp.split(",")
40-
41-
input_file_tuples = self.zip_input_files(on_error="abort")
42-
for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
43-
if not gt_file or not ocr_file:
44-
# file/page was not found in this group
45-
continue
46-
gt_file = self.workspace.download_file(gt_file)
47-
ocr_file = self.workspace.download_file(ocr_file)
48-
page_id = gt_file.pageId
4929

50-
log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)
51-
52-
file_id = make_file_id(ocr_file, self.output_file_grp)
53-
report_prefix = os.path.join(self.output_file_grp, file_id)
54-
55-
# Process the files
56-
try:
57-
os.mkdir(self.output_file_grp)
58-
except FileExistsError:
59-
pass
60-
cli_process(
61-
gt_file.local_filename,
62-
ocr_file.local_filename,
63-
report_prefix,
64-
metrics=metrics,
65-
textequiv_level=textequiv_level,
30+
# wrong number of inputs: let fail
31+
gt_file, ocr_file = input_files
32+
# missing on either side: skip (zip_input_files already warned)
33+
if not gt_file or not ocr_file:
34+
return
35+
# missing download (i.e. OCRD_DOWNLOAD_INPUT=false):
36+
if not gt_file.local_filename:
37+
if config.OCRD_MISSING_INPUT == 'ABORT':
38+
raise MissingInputFile(gt_file.fileGrp, gt_file.pageId, gt_file.mimetype)
39+
return
40+
if not ocr_file.local_filename:
41+
if config.OCRD_MISSING_INPUT == 'ABORT':
42+
raise MissingInputFile(ocr_file.fileGrp, ocr_file.pageId, ocr_file.mimetype)
43+
return
44+
45+
page_id = gt_file.pageId
46+
47+
file_id = make_file_id(ocr_file, self.output_file_grp)
48+
cli_process(
49+
gt_file.local_filename,
50+
ocr_file.local_filename,
51+
file_id,
52+
self.output_file_grp,
53+
metrics=metrics,
54+
textequiv_level=textequiv_level,
55+
)
56+
57+
# Add reports to the workspace
58+
for report_suffix, mimetype in [
59+
[".html", "text/html"],
60+
[".json", "application/json"],
61+
]:
62+
output_file_id = file_id + report_suffix
63+
output_file = next(self.workspace.mets.find_files(ID=output_file_id), None)
64+
if output_file and config.OCRD_EXISTING_OUTPUT != 'OVERWRITE':
65+
raise FileExistsError(f"A file with ID=={output_file_id} already exists {output_file} and neither force nor ignore are set")
66+
self.workspace.add_file(
67+
file_id=output_file_id,
68+
file_grp=self.output_file_grp,
69+
page_id=page_id,
70+
mimetype=mimetype,
71+
local_filename=file_id + report_suffix,
6672
)
6773

68-
# Add reports to the workspace
69-
for report_suffix, mimetype in [
70-
[".html", "text/html"],
71-
[".json", "application/json"],
72-
]:
73-
self.workspace.add_file(
74-
file_id=file_id + report_suffix,
75-
file_grp=self.output_file_grp,
76-
page_id=page_id,
77-
mimetype=mimetype,
78-
local_filename=report_prefix + report_suffix,
79-
)
80-
8174

8275
if __name__ == "__main__":
8376
ocrd_dinglehopper()

0 commit comments

Comments
 (0)