|
7 | 7 | - The expected output in the __output__ folder. |
8 | 8 | """ |
9 | 9 |
|
| 10 | +import hashlib |
10 | 11 | import inspect |
| 12 | +from collections import Counter |
11 | 13 | from pathlib import Path |
12 | 14 | from typing import Type |
13 | 15 |
|
| 16 | +import attr |
14 | 17 | import pytest |
15 | 18 |
|
16 | 19 | from unblob import handlers |
@@ -39,6 +42,111 @@ def test_all_handlers( |
39 | 42 | check_output_is_the_same(output_dir, extraction_config.extract_root) |
40 | 43 |
|
41 | 44 |
|
# Extra bytes wrapped around test inputs. Both are NUL-delimited marker strings
# that no handler should recognize, so they are expected to be carved out as
# unknown chunks during extraction.
# Prefix that shifts the real content away from file offset 0.
BLOCK_SHIFTING_PREFIX = bytes([0]) + b"unique unknown prefix" + bytes([0])
# 511 NUL bytes plus a marker — presumably sized to cross a 512-byte block
# boundary to exercise padding handling; TODO confirm the intent.
PADDING_CHECK_SUFFIX = bytes([0] * 511) + b"unique unknown suffix" + bytes([0])
| 47 | + |
| 48 | + |
@pytest.mark.parametrize(
    "input_dir, output_dir", gather_integration_tests(TEST_DATA_PATH)
)
@pytest.mark.parametrize(
    "prefix, suffix",
    [
        # pytest.param(b"", b"", id="no-extras"),
        pytest.param(BLOCK_SHIFTING_PREFIX, b"", id="block-shifted"),
        pytest.param(b"", PADDING_CHECK_SUFFIX, id="padding-check"),
        pytest.param(
            BLOCK_SHIFTING_PREFIX,
            PADDING_CHECK_SUFFIX,
            id="block-shifted-padding-check",
        ),
    ],
)
def test_all_handlers_chunk_stability(
    input_dir: Path,
    output_dir: Path,
    extraction_config: ExtractionConfig,
    tmp_path: Path,
    prefix: bytes,
    suffix: bytes,
):
    """Test that handlers tolerate a non-empty unknown chunk prefix/suffix"""
    wrapped_file = tmp_path / "input_file"

    for original in input_dir.iterdir():
        # Surround the sample with unknown-chunk extras and extract that instead.
        wrapped_file.write_bytes(prefix + original.read_bytes() + suffix)

        # Give each sample its own extraction root so outputs do not collide.
        per_input_config = attr.evolve(
            extraction_config,
            extract_root=extraction_config.extract_root / original.name,
        )
        check_result(process_file(per_input_config, wrapped_file))

        expected_dir = output_dir / (original.name + per_input_config.extract_suffix)
        check_output_do_not_change_much_due_to_extras(
            original,
            expected=expected_dir,
            actual=per_input_config.extract_root,
        )
| 91 | + |
| 92 | + |
def hash_bytes(data: bytes) -> str:
    """Return the hex-encoded SHA-512 digest of *data*."""
    hasher = hashlib.sha512()
    hasher.update(data)
    return hasher.hexdigest()
| 95 | + |
| 96 | + |
def hash_dir(root: Path, print=lambda *_: None) -> Counter:
    """Hash all files under an unblob extraction directory :root:, ignoring test prefix/suffix.

    Directory structures having the same set of files will result in the same output,
    even if the file names are different or the directory structure is different.

    Test prefix/suffix is excluded from hash calculation, so that unknown chunks extended
    with them will produce the same hash.

    Returns: count of each hashes found

    NOTE: the ``print`` parameter deliberately shadows the builtin, so callers
    can opt into debug output by passing the real ``print``; the default is a
    no-op. It cannot be renamed without breaking keyword-argument callers.
    """
    hash_counter: Counter = Counter()
    # sorted() keeps the debug output order deterministic across runs
    for path in sorted(root.rglob("*")):
        if not path.is_file() or path.name == ".gitkeep":
            continue

        content = path.read_bytes()
        # ignore newly introduced unknown chunk files
        if content in (BLOCK_SHIFTING_PREFIX, PADDING_CHECK_SUFFIX):
            continue

        # remove extras introduced before hashing
        if content.startswith(BLOCK_SHIFTING_PREFIX):
            content = content[len(BLOCK_SHIFTING_PREFIX) :]
        if content.endswith(PADDING_CHECK_SUFFIX):
            content = content[: -len(PADDING_CHECK_SUFFIX)]

        # renamed from ``hash`` to avoid shadowing the builtin
        digest = hash_bytes(content)
        hash_counter[digest] += 1
        # Output for debugging failures
        print(f" {path} =\n {digest}")
    return hash_counter
| 129 | + |
| 130 | + |
def check_output_do_not_change_much_due_to_extras(
    original_input_file: Path, expected: Path, actual: Path
):
    """Assert *actual* extraction output matches *expected*, modulo the test extras."""
    print(f"{expected=}")
    want = hash_dir(expected, print=print)
    print(f"{actual=}")
    got = hash_dir(actual, print=print)

    # The unmodified input resurfaces in the extraction of the altered input
    # (carved out between the extra unknown chunks), but it may be absent from
    # the expected output when the original was a single "whole file chunk".
    hash_original_input = hash_bytes(original_input_file.read_bytes())
    if hash_original_input not in want:
        print(f"Warn: hash of original input: {hash_original_input} not in expected")
        assert got[hash_original_input] <= 1
        # Counter.__delitem__ is a no-op for missing keys, so this is safe
        # even when the hash never appeared in the actual output.
        del got[hash_original_input]

    assert want == got
| 148 | + |
| 149 | + |
42 | 150 | @pytest.mark.parametrize( |
43 | 151 | "handler", |
44 | 152 | (pytest.param(handler, id=handler.NAME) for handler in handlers.BUILTIN_HANDLERS), |
|
0 commit comments