Skip to content

Commit 632fce7

Browse files
authored
feat: validate page count: real PDF pages vs html2pdf4doc pages (#50)
1 parent 3584a4f commit 632fce7

File tree

3 files changed

+58
-11
lines changed

3 files changed

+58
-11
lines changed

html2pdf4doc/html2pdf4doc.py

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import argparse
22
import atexit
33
import base64
4+
import contextlib
45
import os.path
56
import platform
67
import re
@@ -9,10 +10,11 @@
910
import zipfile
1011
from datetime import datetime
1112
from pathlib import Path
12-
from time import sleep
13-
from typing import Dict, List, Optional
13+
from time import sleep, time
14+
from typing import Dict, Iterator, List, Optional, Tuple
1415

1516
import requests
17+
from pypdf import PdfReader
1618
from requests import Response
1719
from selenium import webdriver
1820
from selenium.webdriver.chrome.options import Options
@@ -39,6 +41,28 @@
3941
sys.stdout = open(sys.stdout.fileno(), mode="w", encoding="utf8", closefd=False)
4042

4143

44+
@contextlib.contextmanager
def measure_performance(title: str) -> Iterator[None]:
    """
    Time the body of a ``with`` block and print a one-line report.

    The report consists of ``title`` followed by a space and padded with dots
    to 60 columns, then the elapsed time in seconds with two decimals and an
    ``s`` suffix.

    The report is printed when the block exits, including when the body
    raises. In that case the exception propagates unchanged after the report
    has been printed.
    """
    time_start = time()
    try:
        yield
    finally:
        # Report in ``finally`` so that a failing block still shows how long
        # it ran; without it the exception thrown in at ``yield`` would skip
        # the report entirely.
        time_diff = time() - time_start
        padded_name = f"{title} ".ljust(60, ".")
        padded_time = f" {time_diff:0.2f}".rjust(6, ".")
        print(f"{padded_name}{padded_time}s", flush=True)  # noqa: T201
54+
55+
56+
def extract_page_count(logs: List[Dict[str, str]]) -> int:
    """
    Return the page count reported in the given log entries.

    Each entry's ``"message"`` value is searched for the quoted marker
    ``"[HTML2PDF4DOC] Page count:"`` followed by an integer. The integer from
    the first entry that contains the marker is returned.

    Raises ``ValueError`` if no entry contains the marker.
    """
    page_count_regex = re.compile(r'"\[HTML2PDF4DOC]\s*Page count:"\s*(\d+)')

    # Lazy generator: entries after the first hit are never inspected.
    search_results = (
        page_count_regex.search(record["message"]) for record in logs
    )
    first_hit = next(
        (result for result in search_results if result is not None), None
    )
    if first_hit is None:
        raise ValueError("No page count found in logs.")
    return int(first_hit.group(1))
64+
65+
4266
class ChromeDriverManager:
4367
def get_chrome_driver(self, path_to_cache_dir: str) -> str:
4468
chrome_version: Optional[str] = self.get_chrome_version()
@@ -253,7 +277,7 @@ def get_inches_from_millimeters(mm: float) -> float:
253277
return mm / 25.4
254278

255279

256-
def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
280+
def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> Tuple[bytes, int]:
257281
print(f"html2pdf4doc: opening URL with ChromeDriver: {url}") # noqa: T201
258282

259283
driver.get(url)
@@ -285,21 +309,27 @@ def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
285309
}
286310

287311
class Done(Exception):
288-
pass
312+
def __init__(self, page_count: int):
313+
super().__init__()
314+
self.page_count: int = page_count
289315

290316
datetime_start = datetime.today()
291317

292318
logs: List[Dict[str, str]] = []
319+
page_count: int = 0
293320
try:
294321
while True:
295322
logs = driver.get_log("browser") # type: ignore[no-untyped-call]
296323
for entry_ in logs:
297324
if "[HTML2PDF4DOC] Total time:" in entry_["message"]:
298325
print("success: HTML2PDF4Doc completed its job.") # noqa: T201
299-
raise Done
326+
327+
page_count = extract_page_count(logs)
328+
329+
raise Done(page_count)
300330
if (datetime.today() - datetime_start).total_seconds() > 60:
301331
raise TimeoutError
302-
sleep(0.5)
332+
sleep(0.1)
303333
except Done:
304334
pass
305335
except TimeoutError:
@@ -322,7 +352,13 @@ class Done(Exception):
322352
result = driver.execute_cdp_cmd("Page.printToPDF", calculated_print_options)
323353

324354
data = base64.b64decode(result["data"])
325-
return data
355+
356+
if page_count == 0:
357+
raise RuntimeError(
358+
"html2pdf4doc: Something went wrong. "
359+
"Could not capture the printed page count from Chrome."
360+
)
361+
return data, page_count
326362

327363

328364
def create_webdriver(
@@ -521,9 +557,20 @@ def exit_handler() -> None:
521557

522558
url = Path(os.path.abspath(path_to_input_html)).as_uri()
523559

524-
pdf_bytes = get_pdf_from_html(driver, url)
560+
pdf_bytes, page_count = get_pdf_from_html(driver, url)
525561
with open(path_to_output_pdf, "wb") as f:
526562
f.write(pdf_bytes)
563+
564+
with measure_performance("html2pdf4doc: validating page count"):
565+
reader = PdfReader(path_to_output_pdf)
566+
if len(reader.pages) != page_count:
567+
raise RuntimeError(
568+
"Something went wrong with the printed page. "
569+
f"Page count mismatch: "
570+
f"PDF pages: {len(reader.pages)}, "
571+
f"html2pdf4doc pages: {page_count}."
572+
)
573+
527574
else:
528575
print("html2pdf4doc: unknown command.") # noqa: T201
529576
sys.exit(1)

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ dependencies = [
5656

5757
# requests is used for downloading the Chrome driver.
5858
"requests",
59+
60+
# pypdf is used for validating the printed PDF.
61+
"pypdf>=3.9.0",
5962
]
6063

6164
[project.optional-dependencies]

requirements.development.txt

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,3 @@ ruff>=0.9
1616
#
1717
lit
1818
filecheck==0.0.24
19-
20-
# Integration tests use PyPDF to check the contents of the printed PDF.
21-
pypdf==3.9.0

0 commit comments

Comments
 (0)