|
| 1 | +import argparse |
| 2 | +import contextlib |
| 3 | +import datetime |
| 4 | +import os.path |
| 5 | +import random |
| 6 | +import shutil |
| 7 | +import sys |
| 8 | +from pathlib import Path |
| 9 | +from subprocess import CalledProcessError, CompletedProcess, TimeoutExpired, run |
| 10 | +from time import time |
| 11 | +from typing import Iterator, List |
| 12 | + |
| 13 | +from faker import Faker |
| 14 | +from lxml import etree, html |
| 15 | + |
| 16 | +from html2pdf4doc import PATH_TO_HTML2PDF4DOC_PY |
| 17 | + |
| 18 | + |
@contextlib.contextmanager
def measure_performance(title: str) -> Iterator[None]:
    """
    Time the body of a ``with`` block and print a one-line report.

    The report is the title followed by a space, right-padded with dots to
    60 characters, then the elapsed wall-clock seconds (two decimals,
    preceded by a space and left-padded with dots to 6 characters) and an
    ``s`` suffix. Nothing is printed if the body raises, because the code
    after ``yield`` is not reached in that case.

    :param title: Label printed at the start of the report line.
    """
    started_at = time()
    yield
    elapsed = time() - started_at

    # ".<60" / ".>6" are the format-spec equivalents of ljust/rjust with a
    # "." fill character; longer strings are never truncated.
    report = "{:.<60}{:.>6}s".format(f"{title} ", f" {elapsed:0.2f}")
    print(report, flush=True)  # noqa: T201
| 29 | + |
| 30 | + |
def mutate_and_print(path_to_input_file: str, path_to_root: str) -> bool:
    """
    Mutate random text nodes of an HTML file and print the result to PDF.

    25 times in a row, a randomly chosen ``<p>`` or ``<td>`` element gets its
    text replaced with Faker-generated text (the same element may be picked
    more than once). The mutated document is written next to the input file
    as ``<input>.mut.html`` and handed to the html2pdf4doc ``print`` command
    in ``--strict`` mode together with the target ``<input>.mut.html.pdf``.

    When the command exits with a non-zero status, the root directory is
    mirrored under ``output/`` (only if that mirror does not exist yet) and
    timestamped copies of the mutated HTML and, if one was produced, the PDF
    are stored inside the mirror.

    :param path_to_input_file: Path to an existing HTML file located inside
        ``path_to_root``.
    :param path_to_root: Path to an existing directory; may be absolute or
        relative to the current working directory.
    :return: ``True`` if the print command succeeded, ``False`` if it
        exited with a non-zero status.
    :raises TimeoutError: If ``subprocess.run`` reports ``TimeoutExpired``.
    :raises ValueError: If the mutated file is not located under
        ``path_to_root`` (only checked when a failure is being archived).
    """
    assert os.path.isfile(path_to_input_file), path_to_input_file
    assert os.path.isdir(path_to_root), path_to_root

    # Use a context manager so the input file handle is closed promptly.
    with open(path_to_input_file, encoding="utf-8") as input_file:
        text = input_file.read()

    # Parse HTML into DOM
    tree = html.fromstring(text)

    # Replace the text of randomly picked paragraph / table cell elements.
    elems = tree.xpath("//p | //td")
    if elems:
        # The generator does not depend on the iteration, so build it once.
        fake = Faker()
        for _i in range(25):
            node = random.choice(elems)

            print("Mutating node:", node.tag, flush=True)  # noqa: T201

            n_sentences = random.randint(1, 100)
            node.text = fake.text(max_nb_chars=10 * n_sentences)

    # Serialize back to HTML
    mutated_html = etree.tostring(
        tree, pretty_print=False, method="html", encoding="unicode"
    )

    # Save next to input file
    path_to_mut_html = path_to_input_file + ".mut.html"
    path_to_mut_pdf = path_to_input_file + ".mut.html.pdf"
    with open(path_to_mut_html, "w", encoding="utf-8") as f:
        f.write(mutated_html)

    print("Wrote mutated file:", path_to_mut_html, flush=True)  # noqa: T201

    # Drop a PDF left over from an earlier cycle so that a failed run can
    # never be archived together with a stale, unrelated PDF.
    Path(path_to_mut_pdf).unlink(missing_ok=True)

    cmd: List[str] = [
        sys.executable,
        PATH_TO_HTML2PDF4DOC_PY,
        "print",
        "--strict",
        path_to_mut_html,
        path_to_mut_pdf,
    ]

    # relpath() accepts both absolute and relative roots, whereas
    # Path(root).relative_to(".") raises ValueError for absolute ones.
    path_to_mut_output = os.path.join("output", os.path.relpath(path_to_root))

    def copy_files_if_needed() -> None:
        """Mirror html2pdf4doc and the root directory under output/ once."""
        if os.path.isdir(path_to_mut_output):
            return

        shutil.rmtree("output", ignore_errors=True)
        Path("output").mkdir(parents=True, exist_ok=True)

        shutil.copytree(
            "html2pdf4doc", "output/html2pdf4doc", dirs_exist_ok=True
        )

        shutil.rmtree(path_to_mut_output, ignore_errors=True)
        Path(path_to_mut_output).mkdir(parents=True, exist_ok=True)

        shutil.copytree(path_to_root, path_to_mut_output, dirs_exist_ok=True)

    def copy_mutated_file() -> None:
        """Store timestamped copies of the failing HTML/PDF in the mirror."""
        # Compare absolute paths so that mixing absolute and relative
        # arguments works; still raises ValueError if the file is not
        # inside the root directory.
        relative_path_to_mut_html = Path(
            os.path.abspath(path_to_mut_html)
        ).relative_to(os.path.abspath(path_to_root))

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        path_to_mut_html_out = os.path.join(
            path_to_mut_output,
            f"{relative_path_to_mut_html}.{timestamp}.html",
        )
        Path(path_to_mut_html_out).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(path_to_mut_html, path_to_mut_html_out)

        path_to_mut_pdf_out = os.path.join(
            path_to_mut_output,
            f"{relative_path_to_mut_html}.{timestamp}.pdf",
        )
        # A failed print may not have produced any PDF; do not let the
        # missing file abort the whole fuzzing session.
        if os.path.isfile(path_to_mut_pdf):
            shutil.copy(path_to_mut_pdf, path_to_mut_pdf_out)
            pdf_report = path_to_mut_pdf_out
        else:
            pdf_report = "(not produced)"

        print(  # noqa: T201
            f"Saved failed mutated HTML as:\n"
            f"HTML: {path_to_mut_html_out}\n"
            f"PDF: {pdf_report}"
        )

    with measure_performance(
        "html2pdf4doc_fuzzer: printing HTML to PDF using HTML2PDF and Chrome Driver"
    ):
        try:
            _: CompletedProcess[bytes] = run(
                cmd, capture_output=False, check=True, bufsize=1
            )
        except CalledProcessError as called_process_error_:
            print(called_process_error_)  # noqa: T201

            copy_files_if_needed()

            copy_mutated_file()

            return False
        except TimeoutExpired:
            # NOTE: run() is called without a timeout above, so this branch
            # only becomes reachable if a timeout argument is added.
            raise TimeoutError from None
    return True
| 144 | + |
| 145 | + |
def main() -> None:
    """
    Run the fuzzer from the command line.

    Parses the input file and root directory arguments, recreates an empty
    ``output`` directory, and calls ``mutate_and_print`` 20 times (200 with
    ``--long``). A summary line with the success rate is printed at the end
    and the process exits with status 1 if any cycle failed.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "input_file",
        type=str,
        help="Path to the HTML file that is mutated and printed to PDF.",
    )
    parser.add_argument(
        "root_path",
        type=str,
        help=(
            "Directory that contains the input file; it is copied to "
            "output/ when a print cycle fails."
        ),
    )
    parser.add_argument(
        "--long",
        action="store_true",
        help="Run the fuzzer in long mode (more iterations).",
    )

    args = parser.parse_args()

    path_to_input_file = args.input_file
    path_to_root = args.root_path

    # Start every session from an empty output directory.
    shutil.rmtree("output", ignore_errors=True)
    Path("output").mkdir(parents=True, exist_ok=True)

    total_runs = 200 if args.long else 20
    success_count, failure_count = 0, 0
    for i in range(1, total_runs + 1):
        print(  # noqa: T201
            f"html2pdf4doc_fuzzer print cycle #{i}/{total_runs} — "
            f"So far: 🟢{success_count} / 🔴{failure_count}",
            flush=True,
        )
        success = mutate_and_print(path_to_input_file, path_to_root)
        if success:
            success_count += 1
        else:
            failure_count += 1

    # total_runs is always 20 or 200, so the division cannot fail.
    success_rate_percent = (success_count / total_runs) * 100

    # One decimal place avoids float noise such as "7.000000000000001%".
    print(  # noqa: T201
        f"html2pdf4doc_fuzzer: finished {'✅' if failure_count == 0 else '❌'} — "
        f"Success rate: {success_count}/{total_runs} "
        f"({success_rate_percent:.1f}%)",
        flush=True,
    )

    if failure_count > 0:
        sys.exit(1)


# Run the fuzzer only when this file is executed as a script.
if __name__ == "__main__":
    main()
0 commit comments