strictdoc-project
diff --git a/‎.github/workflows/ci-mac.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/ci-mac.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/ci_fuzz_linux.yml‎
Lines changed: 69 additions & 0 deletions b/‎.github/workflows/ci_fuzz_linux.yml‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 4 additions & 0 deletions b/‎.gitignore‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎html2pdf4doc/html2pdf4doc.py‎
Lines changed: 1 addition & 0 deletions b/‎html2pdf4doc/html2pdf4doc.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎html2pdf4doc/html2pdf4doc_fuzzer.py‎
Lines changed: 193 additions & 0 deletions b/‎html2pdf4doc/html2pdf4doc_fuzzer.py‎
Lines changed: 193 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎requirements.development.txt‎
Lines changed: 6 additions & 0 deletions b/‎requirements.development.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎tasks.py‎
Lines changed: 12 additions & 0 deletions b/‎tasks.py‎
Lines changed: 12 additions & 0 deletions
@@ -11,7 +11,7 @@ jobs:
     strategy:
       matrix:
         python-version: [
-          "3.8", "3.12"
+          "3.9", "3.13"
         ]
 
     steps:
 
@@ -0,0 +1,69 @@
+name: "HTML2PDF4Doc Fuzz Testing on Linux"
+
+on:
+  pull_request:
+    branches: [ "**" ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    timeout-minutes: 120  # 2 hours
+
+    strategy:
+      matrix:
+        python-version: [
+          "3.12"
+        ]
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v1
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Upgrade pip
+      run: |
+        python -m pip install --upgrade pip
+
+    - name: Install Python packages
+      run: |
+        pip install -r requirements.development.txt
+
+    - name: Clone html2pdf4doc.js
+      run: |
+        invoke bootstrap
+      env:
+        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+    - name: Install html2pdf4doc dependencies.
+      run: |
+        python developer/pip_install_html2pdf4doc_deps.py
+
+    - name: Run Lint tasks
+      run: |
+        invoke lint
+
+    - name: Build HTML2PDF4Doc.js
+      run: |
+        invoke build
+
+    - name: Run tests
+      # NOTE: this workflow is only triggered by "pull_request" (see "on:"),
+      # so the "schedule" branch below is not taken until a schedule trigger
+      # is added to this workflow.
+      run: |
+        if [ "${{ github.event_name }}" = "schedule" ]; then
+          echo "🕒 Running long fuzzing..."
+          invoke test-fuzz --long
+        else
+          echo "🚀 Running short fuzzing..."
+          invoke test-fuzz
+        fi
+
+    - name: Upload broken PDFs as artifact
+      # Always upload, even if the job fails. always() evaluates to true for
+      # every job status, so it does not need to be combined with failure().
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: broken-pdfs
+        path: output/
+        retention-days: 30
@@ -9,3 +9,7 @@ tests/integration/.lit_test_times.txt
 tests/integration/**/Output/
 output/
 
+__pycache__/
+
+# Fuzz testing files.
+**.mut.**
@@ -23,6 +23,7 @@
 
 __version__ = "0.0.22"
 
+PATH_TO_HTML2PDF4DOC_PY = __file__
 PATH_TO_HTML2PDF4DOC_JS = os.path.join(
     os.path.dirname(os.path.join(__file__)),
     "html2pdf4doc_js",
 
@@ -0,0 +1,193 @@
+import argparse
+import contextlib
+import datetime
+import os.path
+import random
+import shutil
+import sys
+from pathlib import Path
+from subprocess import CalledProcessError, CompletedProcess, TimeoutExpired, run
+from time import time
+from typing import Iterator, List
+
+from faker import Faker
+from lxml import etree, html
+
+from html2pdf4doc import PATH_TO_HTML2PDF4DOC_PY
+
+
+@contextlib.contextmanager
+def measure_performance(title: str) -> Iterator[None]:
+    """
+    Print how long the managed ``with`` body took to run.
+
+    The report is a single line: ``title`` padded with dots to 60 columns,
+    followed by the elapsed wall-clock seconds with two decimals.
+
+    The line is printed even when the body raises, so that a failing step
+    still shows how long it ran. The exception then propagates unchanged.
+    """
+    time_start = time()
+    try:
+        yield
+    finally:
+        time_diff = time() - time_start
+        padded_name = f"{title} ".ljust(60, ".")
+        padded_time = f" {time_diff:0.2f}".rjust(6, ".")
+        print(f"{padded_name}{padded_time}s", flush=True)  # noqa: T201
+
+
+def mutate_and_print(path_to_input_file: str, path_to_root: str) -> bool:
+    """
+    Mutate one HTML file with random text and print the result to PDF.
+
+    The text of 25 randomly chosen <p>/<td> elements is replaced with
+    Faker-generated text. The mutated document is written next to the input
+    file as "<input>.mut.html" and printed to "<input>.mut.html.pdf" by
+    running html2pdf4doc in --strict mode as a subprocess.
+
+    When the subprocess exits with a non-zero code, the root folder and the
+    html2pdf4doc package are copied to "output/" (only once), and timestamped
+    copies of the mutated HTML and, if it exists, the PDF are stored there.
+
+    :param path_to_input_file: HTML file to mutate. Must be located inside
+        path_to_root.
+    :param path_to_root: Folder that is copied to "output/" on failure. It is
+        expected to be located under the current working directory.
+    :return: True if printing succeeded, False if html2pdf4doc failed.
+    :raises TimeoutError: If the subprocess reports a timeout.
+    """
+    assert os.path.isfile(path_to_input_file), path_to_input_file
+    assert os.path.isdir(path_to_root), path_to_root
+
+    # Normalize both paths to be relative to the current working directory.
+    # The "output/..." locations below are derived from these paths, which
+    # only works when both of them are expressed in the same relative form.
+    path_to_input_file = os.path.relpath(path_to_input_file)
+    path_to_root = os.path.relpath(path_to_root)
+
+    with open(path_to_input_file, encoding="utf-8") as input_file:
+        text = input_file.read()
+
+    # Parse HTML into DOM
+    tree = html.fromstring(text)
+
+    # Replace the text of randomly picked paragraphs and table cells.
+    elems = tree.xpath("//p | //td")
+    if elems:
+        # One Faker instance is enough for all mutations of this cycle.
+        fake = Faker()
+        for _ in range(25):
+            node = random.choice(elems)
+
+            print("Mutating node:", node.tag, flush=True)  # noqa: T201
+
+            n_sentences = random.randint(1, 100)
+            node.text = fake.text(max_nb_chars=10 * n_sentences)
+
+    # Serialize back to HTML
+    mutated_html = etree.tostring(
+        tree, pretty_print=False, method="html", encoding="unicode"
+    )
+
+    # Save next to input file
+    path_to_mut_html = path_to_input_file + ".mut.html"
+    path_to_mut_pdf = path_to_input_file + ".mut.html.pdf"
+    with open(path_to_mut_html, "w", encoding="utf-8") as f:
+        f.write(mutated_html)
+
+    print("Wrote mutated file:", path_to_mut_html, flush=True)  # noqa: T201
+
+    # Remove a PDF left over from a previous cycle so that a failed run is
+    # never reported together with a stale PDF.
+    with contextlib.suppress(FileNotFoundError):
+        os.remove(path_to_mut_pdf)
+
+    cmd: List[str] = [
+        sys.executable,
+        PATH_TO_HTML2PDF4DOC_PY,
+        "print",
+        "--strict",
+        path_to_mut_html,
+        path_to_mut_pdf,
+    ]
+
+    path_to_mut_output = f"output/{path_to_root}"
+
+    def copy_files_if_needed() -> None:
+        # The root folder is copied only once. Later failures reuse the copy.
+        if os.path.isdir(path_to_mut_output):
+            return
+
+        shutil.rmtree("output", ignore_errors=True)
+        Path("output").mkdir(parents=True, exist_ok=True)
+
+        shutil.copytree(
+            "html2pdf4doc", "output/html2pdf4doc", dirs_exist_ok=True
+        )
+
+        shutil.rmtree(path_to_mut_output, ignore_errors=True)
+        Path(path_to_mut_output).mkdir(parents=True, exist_ok=True)
+
+        shutil.copytree(path_to_root, path_to_mut_output, dirs_exist_ok=True)
+
+    def copy_mutated_file() -> None:
+        relative_path_to_mut_html = Path(path_to_mut_html).relative_to(
+            path_to_root
+        )
+
+        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        path_to_mut_html_out = os.path.join(
+            path_to_mut_output,
+            f"{relative_path_to_mut_html}.{timestamp}.html",
+        )
+        shutil.copy(path_to_mut_html, path_to_mut_html_out)
+
+        # The PDF may be missing if html2pdf4doc failed before writing it.
+        # In that case only the HTML is stored, instead of crashing the
+        # whole fuzzing session with a FileNotFoundError.
+        path_to_mut_pdf_out = os.path.join(
+            path_to_mut_output,
+            f"{relative_path_to_mut_html}.{timestamp}.pdf",
+        )
+        if os.path.isfile(path_to_mut_pdf):
+            shutil.copy(path_to_mut_pdf, path_to_mut_pdf_out)
+        else:
+            path_to_mut_pdf_out = "(no PDF was produced)"
+
+        print(  # noqa: T201
+            f"Saved failed mutated HTML as:\n"
+            f"HTML: {path_to_mut_html_out}\n"
+            f"PDF: {path_to_mut_pdf_out}",
+            flush=True,
+        )
+
+    with measure_performance(
+        "html2pdf4doc_fuzzer: printing HTML to PDF using HTML2PDF and Chrome Driver"
+    ):
+        try:
+            _: CompletedProcess[bytes] = run(
+                cmd, capture_output=False, check=True, bufsize=1
+            )
+        except CalledProcessError as called_process_error_:
+            print(called_process_error_)  # noqa: T201
+
+            copy_files_if_needed()
+
+            copy_mutated_file()
+
+            return False
+        except TimeoutExpired:
+            # NOTE: run() is called without a timeout, so this branch only
+            # becomes reachable once a timeout is passed to run().
+            raise TimeoutError from None
+    return True
+
+
+def main() -> None:
+    """
+    Command-line entry point of the fuzzer.
+
+    Recreates the "output/" folder, then runs mutate_and_print() on the given
+    input file 20 times (200 times with --long) and prints a summary.
+    Exits with code 1 if at least one print cycle failed.
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "input_file",
+        type=str,
+        help="Path to the HTML file to mutate and print.",
+    )
+    parser.add_argument(
+        "root_path",
+        type=str,
+        help=(
+            "Path to the folder that contains the input file. "
+            "It is copied to output/ when a print cycle fails."
+        ),
+    )
+    parser.add_argument(
+        "--long",
+        action="store_true",
+        help="Run the fuzzer in long mode (more iterations).",
+    )
+
+    args = parser.parse_args()
+
+    path_to_input_file = args.input_file
+    path_to_root = args.root_path
+
+    # Start every session with an empty output folder.
+    shutil.rmtree("output", ignore_errors=True)
+    Path("output").mkdir(parents=True, exist_ok=True)
+
+    total_runs = 200 if args.long else 20
+    success_count, failure_count = 0, 0
+    for i in range(1, total_runs + 1):
+        print(  # noqa: T201
+            f"html2pdf4doc_fuzzer print cycle #{i}/{total_runs} — "
+            f"So far: 🟢{success_count} / 🔴{failure_count}",
+            flush=True,
+        )
+        success = mutate_and_print(path_to_input_file, path_to_root)
+        if success:
+            success_count += 1
+        else:
+            failure_count += 1
+
+    assert total_runs > 0
+    success_rate_percent = (success_count / total_runs) * 100
+
+    # The rate is rounded to one decimal. Printing the raw float can produce
+    # values such as 55.00000000000001.
+    print(  # noqa: T201
+        f"html2pdf4doc_fuzzer: finished {'✅' if failure_count == 0 else '❌'} — "
+        f"Success rate: {success_count}/{total_runs} "
+        f"({success_rate_percent:.1f}%)",
+        flush=True,
+    )
+
+    if failure_count > 0:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
@@ -70,6 +70,7 @@ development = [
 
 [project.scripts]
 html2pdf4doc = "html2pdf4doc.html2pdf4doc:main"
+html2pdf4doc_fuzzer = "html2pdf4doc.html2pdf4doc_fuzzer:main"
 
 [project.urls]
 Changelog = "https://github.com/mettta/html2pdf_python/releases/"
 
@@ -16,3 +16,9 @@ ruff>=0.9
 #
 lit
 filecheck==0.0.24
+
+#
+# Fuzz tests
+#
+faker>=37.8.0
+lxml>=5.3.0
@@ -192,6 +192,18 @@ def test_integration(
     run_invoke(context, itest_command)
 
 
+@task(aliases=["tf"])
+def test_fuzz(context, long=False):
+    """
+    Run the html2pdf4doc fuzzer against the StrictDoc user guide test file.
+
+    With long=True ("invoke test-fuzz --long"), the --long option is
+    forwarded to the fuzzer, which then runs more iterations.
+    """
+    # Appended to the last line of the command so that no whitespace-only
+    # line is introduced when the option is not set.
+    long_argument = "--long" if long else ""
+
+    run_invoke(
+        context,
+        f"""
+            python html2pdf4doc/html2pdf4doc_fuzzer.py
+                tests/fuzz/01_strictdoc_guide_202510/strictdoc/docs/strictdoc_01_user_guide-PDF.html
+                tests/fuzz/01_strictdoc_guide_202510/ {long_argument}
+        """,
+    )
+
+
 @task(aliases=["t"])
 def test(context):
     test_integration(context)
Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@ jobs:`
`11`	`11`	`strategy:`
`12`	`12`	`matrix:`
`13`	`13`	`python-version: [`
`14`		`- "3.8", "3.12"`
	`14`	`+ "3.9", "3.13"`
`15`	`15`	`]`
`16`	`16`
`17`	`17`	`steps:`