11import argparse
22import atexit
33import base64
4+ import contextlib
45import os .path
56import platform
67import re
910import zipfile
1011from datetime import datetime
1112from pathlib import Path
12- from time import sleep
13- from typing import Dict , List , Optional
13+ from time import sleep , time
14+ from typing import Dict , Iterator , List , Optional , Tuple
1415
1516import requests
17+ from pypdf import PdfReader
1618from requests import Response
1719from selenium import webdriver
1820from selenium .webdriver .chrome .options import Options
3941sys .stdout = open (sys .stdout .fileno (), mode = "w" , encoding = "utf8" , closefd = False )
4042
4143
@contextlib.contextmanager
def measure_performance(title: str) -> Iterator[None]:
    """
    Time the body of a ``with`` block and print a one-line report.

    The report consists of ``title`` right-padded with dots to 60 columns,
    followed by the elapsed seconds formatted with two decimals. It is
    printed when the block exits, whether it finished normally or raised,
    so a failing step still shows how long it ran before the exception
    propagates to the caller.

    :param title: Human-readable name of the step being measured.
    """
    # Function-scope import keeps this block self-contained. A monotonic
    # high-resolution timer is used instead of the wall clock because the
    # wall clock can jump (NTP sync, manual adjustment) and would then
    # produce a wrong or even negative duration.
    from time import perf_counter

    time_start = perf_counter()
    try:
        yield
    finally:
        # Runs on both the success and the exception path; any exception
        # raised by the body continues to propagate after the report.
        time_diff = perf_counter() - time_start
        padded_name = f"{ title } ".ljust(60, ".")
        padded_time = f" { time_diff :0.2f} ".rjust(6, ".")
        print(f"{ padded_name } { padded_time } s", flush=True)  # noqa: T201
54+
55+
def extract_page_count(logs: List[Dict[str, str]]) -> int:
    """
    Return the page count reported in the given log entries.

    Each entry's ``"message"`` value is searched, in order, for the quoted
    ``[HTML2PDF4DOC] Page count:`` marker followed by a run of digits. The
    digits from the first entry that matches are returned as an integer;
    entries after it are not inspected.

    :param logs: Log entries, each a dict carrying a ``"message"`` string.
    :return: The page count parsed from the first matching entry.
    :raises ValueError: If no entry contains the page count marker.
    """
    page_count_re = re.compile(r'"\[HTML2PDF4DOC]\s*Page count:"\s*(\d+)')

    # Lazy pipeline: entries are searched one at a time and the scan stops
    # at the first hit, mirroring an early return from a plain loop.
    search_results = (page_count_re.search(record["message"]) for record in logs)
    first_hit = next((hit for hit in search_results if hit is not None), None)

    if first_hit is None:
        raise ValueError("No page count found in logs.")
    return int(first_hit.group(1))
64+
65+
4266class ChromeDriverManager :
4367 def get_chrome_driver (self , path_to_cache_dir : str ) -> str :
4468 chrome_version : Optional [str ] = self .get_chrome_version ()
@@ -253,7 +277,7 @@ def get_inches_from_millimeters(mm: float) -> float:
253277 return mm / 25.4
254278
255279
256- def get_pdf_from_html (driver : webdriver .Chrome , url : str ) -> bytes :
280+ def get_pdf_from_html (driver : webdriver .Chrome , url : str ) -> Tuple [ bytes , int ] :
257281 print (f"html2pdf4doc: opening URL with ChromeDriver: { url } " ) # noqa: T201
258282
259283 driver .get (url )
@@ -285,21 +309,27 @@ def get_pdf_from_html(driver: webdriver.Chrome, url: str) -> bytes:
285309 }
286310
287311 class Done (Exception ):
288- pass
312+ def __init__ (self , page_count : int ):
313+ super ().__init__ ()
314+ self .page_count : int = page_count
289315
290316 datetime_start = datetime .today ()
291317
292318 logs : List [Dict [str , str ]] = []
319+ page_count : int = 0
293320 try :
294321 while True :
295322 logs = driver .get_log ("browser" ) # type: ignore[no-untyped-call]
296323 for entry_ in logs :
297324 if "[HTML2PDF4DOC] Total time:" in entry_ ["message" ]:
298325 print ("success: HTML2PDF4Doc completed its job." ) # noqa: T201
299- raise Done
326+
327+ page_count = extract_page_count (logs )
328+
329+ raise Done (page_count )
300330 if (datetime .today () - datetime_start ).total_seconds () > 60 :
301331 raise TimeoutError
302- sleep (0.5 )
332+ sleep (0.1 )
303333 except Done :
304334 pass
305335 except TimeoutError :
@@ -322,7 +352,13 @@ class Done(Exception):
322352 result = driver .execute_cdp_cmd ("Page.printToPDF" , calculated_print_options )
323353
324354 data = base64 .b64decode (result ["data" ])
325- return data
355+
356+ if page_count == 0 :
357+ raise RuntimeError (
358+ "html2pdf4doc: Something went wrong. "
359+ "Could not capture the printed page count from Chrome."
360+ )
361+ return data , page_count
326362
327363
328364def create_webdriver (
@@ -521,9 +557,20 @@ def exit_handler() -> None:
521557
522558 url = Path (os .path .abspath (path_to_input_html )).as_uri ()
523559
524- pdf_bytes = get_pdf_from_html (driver , url )
560+ pdf_bytes , page_count = get_pdf_from_html (driver , url )
525561 with open (path_to_output_pdf , "wb" ) as f :
526562 f .write (pdf_bytes )
563+
564+ with measure_performance ("html2pdf4doc: validating page count" ):
565+ reader = PdfReader (path_to_output_pdf )
566+ if len (reader .pages ) != page_count :
567+ raise RuntimeError (
568+ "Something went wrong with the printed page. "
569+ f"Page count mismatch: "
570+ f"PDF pages: { len (reader .pages )} , "
571+ f"html2pdf4doc pages: { page_count } ."
572+ )
573+
527574 else :
528575 print ("html2pdf4doc: unknown command." ) # noqa: T201
529576 sys .exit (1 )
0 commit comments