Skip to content

Commit 28e2e82

Browse files
authored
feat: add html2pdf4doc_fuzzer script and the first fuzz test (#54)
1 parent ecb86b4 commit 28e2e82

File tree

62 files changed

+45496
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+45496
-1
lines changed

.github/workflows/ci-mac.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ jobs:
1111
strategy:
1212
matrix:
1313
python-version: [
14-
"3.8", "3.12"
14+
"3.9", "3.13"
1515
]
1616

1717
steps:
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Runs the html2pdf4doc fuzzer on Linux for every pull request and uploads
# the contents of output/ as a build artifact.
name: "HTML2PDF4Doc Fuzz Testing on Linux"

# NOTE(review): only the pull_request trigger is declared here, so the
# "schedule" branch of the "Run tests" step is never taken as written.
# Add an `on.schedule` entry if the long fuzzing run is meant to be active.
on:
  pull_request:
    branches: [ "**" ]

jobs:
  build:
    runs-on: ubuntu-latest
    timeout-minutes: 120  # 2 hours

    strategy:
      matrix:
        python-version: [
          "3.12"
        ]

    steps:
    - uses: actions/checkout@v3

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v1
      with:
        python-version: ${{ matrix.python-version }}

    - name: Upgrade pip
      run: |
        python -m pip install --upgrade pip

    - name: Install Python packages
      run: |
        pip install -r requirements.development.txt

    - name: Clone html2pdf4doc.js
      run: |
        invoke bootstrap
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

    - name: Install html2pdf4doc dependencies.
      run: |
        python developer/pip_install_html2pdf4doc_deps.py

    - name: Run Lint tasks
      run: |
        invoke lint

    - name: Build HTML2PDF4Doc.js
      run: |
        invoke build

    - name: Run tests
      run: |
        if [ "${{ github.event_name }}" = "schedule" ]; then
          echo "🕒 Running long fuzzing..."
          invoke test-fuzz --long
        else
          echo "🚀 Running short fuzzing..."
          invoke test-fuzz
        fi

    - name: Upload broken PDFs as artifact
      # Always upload, even if the job fails. always() already covers the
      # failure case, so no extra failure() check is needed.
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: broken-pdfs
        path: output/
        retention-days: 30

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,7 @@ tests/integration/.lit_test_times.txt
99
tests/integration/**/Output/
1010
output/
1111

12+
__pycache__/
13+
14+
# Fuzz testing files.
15+
**.mut.**

html2pdf4doc/html2pdf4doc.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
__version__ = "0.0.22"
2525

26+
PATH_TO_HTML2PDF4DOC_PY = __file__
2627
PATH_TO_HTML2PDF4DOC_JS = os.path.join(
2728
os.path.dirname(os.path.join(__file__)),
2829
"html2pdf4doc_js",
Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
import argparse
2+
import contextlib
3+
import datetime
4+
import os.path
5+
import random
6+
import shutil
7+
import sys
8+
from pathlib import Path
9+
from subprocess import CalledProcessError, CompletedProcess, TimeoutExpired, run
10+
from time import time
11+
from typing import Iterator, List
12+
13+
from faker import Faker
14+
from lxml import etree, html
15+
16+
from html2pdf4doc import PATH_TO_HTML2PDF4DOC_PY
17+
18+
19+
@contextlib.contextmanager
def measure_performance(title: str) -> Iterator[None]:
    """
    Measure and print the wall-clock time spent inside the ``with`` body.

    The report line consists of ``title`` padded with dots to 60 characters,
    followed by the elapsed seconds with two decimals and an ``s`` suffix.

    The line is printed even when the body raises, so that the duration of
    a failing step is still visible in the log. The exception itself is not
    swallowed and propagates to the caller.
    """
    time_start = time()
    try:
        yield
    finally:
        time_diff = time() - time_start
        padded_name = f"{title} ".ljust(60, ".")
        padded_time = f" {time_diff:0.2f}".rjust(6, ".")
        print(f"{padded_name}{padded_time}s", flush=True)  # noqa: T201
29+
30+
31+
def mutate_and_print(path_to_input_file: str, path_to_root: str) -> bool:
    """
    Mutate random text nodes of an HTML file and print the result to PDF.

    The text of up to 25 randomly chosen ``<p>``/``<td>`` elements is
    replaced with Faker-generated text. The mutated document is written next
    to the input file as ``<input>.mut.html`` and is then printed to
    ``<input>.mut.html.pdf`` by running the html2pdf4doc script in a
    subprocess with ``print --strict``.

    When the subprocess exits with a non-zero status, the html2pdf4doc
    package and the whole ``path_to_root`` tree are copied to ``output/``
    (once), and timestamped copies of the mutated HTML and of the PDF, if
    one was produced, are stored there for later inspection.

    :param path_to_input_file: Path to an existing HTML file. It must be
        located inside ``path_to_root``.
    :param path_to_root: Path to an existing directory, relative or absolute.
    :return: True if the print command succeeded, False otherwise.
    :raises TimeoutError: If the print subprocess reports a timeout.
    """
    assert os.path.isfile(path_to_input_file), path_to_input_file
    assert os.path.isdir(path_to_root), path_to_root

    # Resolve the root once so that relative and absolute arguments are
    # handled uniformly by the relative_to() calls below.
    root_dir = Path(path_to_root).resolve()

    with open(path_to_input_file, encoding="utf-8") as input_file:
        text = input_file.read()

    # Parse HTML into DOM.
    tree = html.fromstring(text)

    # Replace the text of randomly picked elements. The same element may be
    # picked more than once.
    elems = tree.xpath("//p | //td")
    if elems:
        fake = Faker()
        for _ in range(25):
            node = random.choice(elems)

            print("Mutating node:", node.tag, flush=True)  # noqa: T201

            n_sentences = random.randint(1, 100)
            node.text = fake.text(max_nb_chars=10 * n_sentences)

    # Serialize back to HTML.
    mutated_html = etree.tostring(
        tree, pretty_print=False, method="html", encoding="unicode"
    )

    # Save next to the input file.
    path_to_mut_html = path_to_input_file + ".mut.html"
    path_to_mut_pdf = path_to_input_file + ".mut.html.pdf"
    with open(path_to_mut_html, "w", encoding="utf-8") as f:
        f.write(mutated_html)

    print("Wrote mutated file:", path_to_mut_html, flush=True)  # noqa: T201

    cmd: List[str] = [
        sys.executable,
        PATH_TO_HTML2PDF4DOC_PY,
        "print",
        "--strict",
        path_to_mut_html,
        path_to_mut_pdf,
    ]

    # Mirror the root's location below the current directory under output/.
    # A root outside of the current directory is stored under its own name.
    try:
        relative_root = root_dir.relative_to(Path.cwd().resolve())
    except ValueError:
        relative_root = Path(root_dir.name)
    path_to_mut_output = f"output/{relative_root}"

    def copy_files_if_needed() -> None:
        # The tree is copied only on the first failure. Later failures only
        # add their mutated files to the existing copy.
        if os.path.isdir(path_to_mut_output):
            return

        shutil.rmtree("output", ignore_errors=True)
        Path("output").mkdir(parents=True, exist_ok=True)

        shutil.copytree(
            "html2pdf4doc", "output/html2pdf4doc", dirs_exist_ok=True
        )

        Path(path_to_mut_output).mkdir(parents=True, exist_ok=True)
        shutil.copytree(path_to_root, path_to_mut_output, dirs_exist_ok=True)

    def copy_mutated_file() -> None:
        relative_path_to_mut_html = (
            Path(path_to_mut_html).resolve().relative_to(root_dir)
        )

        # The timestamp keeps the files of different failed runs apart.
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        path_to_mut_html_out = os.path.join(
            path_to_mut_output,
            f"{relative_path_to_mut_html}.{timestamp}.html",
        )
        shutil.copy(path_to_mut_html, path_to_mut_html_out)

        # A failed print run does not necessarily leave a PDF behind.
        if os.path.isfile(path_to_mut_pdf):
            path_to_mut_pdf_out = os.path.join(
                path_to_mut_output,
                f"{relative_path_to_mut_html}.{timestamp}.pdf",
            )
            shutil.copy(path_to_mut_pdf, path_to_mut_pdf_out)
        else:
            path_to_mut_pdf_out = "(not produced)"

        print(  # noqa: T201
            f"Saved failed mutated HTML as:\n"
            f"HTML: {path_to_mut_html_out}\n"
            f"PDF: {path_to_mut_pdf_out}"
        )

    with measure_performance(
        "html2pdf4doc_fuzzer: printing HTML to PDF using HTML2PDF and Chrome Driver"
    ):
        try:
            _: CompletedProcess[bytes] = run(
                cmd, capture_output=False, check=True, bufsize=1
            )
        except CalledProcessError as called_process_error_:
            print(called_process_error_)  # noqa: T201

            copy_files_if_needed()

            copy_mutated_file()

            return False
        except TimeoutExpired:
            # No timeout is passed to run() above, so this is only a
            # safeguard for the case that one gets added.
            raise TimeoutError from None
    return True
144+
145+
146+
def main() -> None:
    """
    Run the fuzzer from the command line.

    The ``output`` directory is re-created empty, then the input file is
    mutated and printed 20 times (200 times with ``--long``). A summary with
    the success rate is printed at the end, and the process exits with
    status 1 if at least one run failed.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "input_file",
        type=str,
        help="Path to the HTML file to mutate and print to PDF.",
    )
    parser.add_argument(
        "root_path",
        type=str,
        help=(
            "Path to the directory that contains the input file. "
            "It is copied to output/ when a mutated file fails to print."
        ),
    )
    parser.add_argument(
        "--long",
        action="store_true",
        help="Run the fuzzer in long mode (more iterations).",
    )

    args = parser.parse_args()

    path_to_input_file = args.input_file
    path_to_root = args.root_path

    # Start every session with an empty output directory.
    shutil.rmtree("output", ignore_errors=True)
    Path("output").mkdir(parents=True, exist_ok=True)

    total_runs = 200 if args.long else 20
    success_count, failure_count = 0, 0
    for i in range(1, total_runs + 1):
        print(  # noqa: T201
            f"html2pdf4doc_fuzzer print cycle #{i}/{total_runs} — "
            f"So far: 🟢{success_count} / 🔴{failure_count}",
            flush=True,
        )
        if mutate_and_print(path_to_input_file, path_to_root):
            success_count += 1
        else:
            failure_count += 1

    # One decimal is enough here and hides floating point noise such as
    # 35.00000000000001.
    success_rate_percent = (success_count / total_runs) * 100

    print(  # noqa: T201
        f"html2pdf4doc_fuzzer: finished {'✅' if failure_count == 0 else '❌'} — "
        f"Success rate: {success_count}/{total_runs} "
        f"({success_rate_percent:.1f}%)",
        flush=True,
    )

    if failure_count > 0:
        sys.exit(1)


if __name__ == "__main__":
    main()

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ development = [
7070

7171
[project.scripts]
7272
html2pdf4doc = "html2pdf4doc.html2pdf4doc:main"
73+
html2pdf4doc_fuzzer = "html2pdf4doc.html2pdf4doc_fuzzer:main"
7374

7475
[project.urls]
7576
Changelog = "https://github.com/mettta/html2pdf_python/releases/"

requirements.development.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,9 @@ ruff>=0.9
1616
#
1717
lit
1818
filecheck==0.0.24
19+
20+
#
21+
# Fuzz tests
22+
#
23+
faker>=37.8.0
24+
lxml>=5.3.0

tasks.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,18 @@ def test_integration(
192192
run_invoke(context, itest_command)
193193

194194

195+
@task(aliases=["tf"])
def test_fuzz(context, long=False):
    """
    Run the html2pdf4doc fuzzer against the StrictDoc user guide fixture.

    :param long: When set (``invoke test-fuzz --long``), ``--long`` is
        forwarded to the fuzzer script.
    """
    long_argument = "--long" if long else ""
    # NOTE(review): presumably run_invoke collapses the multi-line command
    # into a single line, as it already has to for the lines below — confirm.
    run_invoke(
        context,
        f"""
            python html2pdf4doc/html2pdf4doc_fuzzer.py
                tests/fuzz/01_strictdoc_guide_202510/strictdoc/docs/strictdoc_01_user_guide-PDF.html
                tests/fuzz/01_strictdoc_guide_202510/
                {long_argument}
        """,
    )
205+
206+
195207
@task(aliases=["t"])
196208
def test(context):
197209
test_integration(context)

0 commit comments

Comments
 (0)