Skip to content

Commit bfbd210

Browse files
e3krisztianqkaiser
authored and committed
Test that handlers tolerate a non-empty unknown chunk prefix/suffix
1 parent 266266c commit bfbd210

File tree

2 files changed

+109
-1
lines changed

2 files changed

+109
-1
lines changed

tests/test_handlers.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,13 @@
77
- The expected output in the __output__ folder.
88
"""
99

10+
import hashlib
1011
import inspect
12+
from collections import Counter
1113
from pathlib import Path
1214
from typing import Type
1315

16+
import attr
1417
import pytest
1518

1619
from unblob import handlers
@@ -39,6 +42,111 @@ def test_all_handlers(
3942
check_output_is_the_same(output_dir, extraction_config.extract_root)
4043

4144

45+
# Marker prepended to every input by test_all_handlers_chunk_stability:
# a NUL byte, a recognisable text, and another NUL byte (23 bytes in total).
# NOTE(review): the name suggests the odd length is meant to move the real
# content off any block boundary -- confirm.
BLOCK_SHIFTING_PREFIX = b"\x00" + b"unique unknown prefix" + b"\x00"

# Marker appended to every input by test_all_handlers_chunk_stability:
# 511 NUL bytes, a recognisable text, and a final NUL byte (533 bytes in total).
# NOTE(review): the name suggests the leading run of zeros is meant to look
# like padding after the real content -- confirm.
PADDING_CHECK_SUFFIX = b"\x00" * 511 + b"unique unknown suffix" + b"\x00"
47+
48+
49+
@pytest.mark.parametrize(
    "input_dir, output_dir", gather_integration_tests(TEST_DATA_PATH)
)
@pytest.mark.parametrize(
    "prefix, suffix",
    [
        # NOTE(review): there is deliberately no (b"", b"") "no-extras" case
        # here; presumably test_all_handlers already covers unmodified
        # inputs -- confirm.
        pytest.param(BLOCK_SHIFTING_PREFIX, b"", id="block-shifted"),
        pytest.param(b"", PADDING_CHECK_SUFFIX, id="padding-check"),
        pytest.param(
            BLOCK_SHIFTING_PREFIX,
            PADDING_CHECK_SUFFIX,
            id="block-shifted-padding-check",
        ),
    ],
)
def test_all_handlers_chunk_stability(
    input_dir: Path,
    output_dir: Path,
    extraction_config: ExtractionConfig,
    tmp_path: Path,
    prefix: bytes,
    suffix: bytes,
):
    """Test that handlers tolerate a non-empty unknown chunk prefix/suffix.

    Every file in ``input_dir`` is wrapped as ``prefix + content + suffix``,
    processed, and the extraction result is compared (by content hashes)
    against the expected output stored for the unmodified file.
    """
    # One scratch file is reused (overwritten) for every wrapped input.
    wrapped_input = tmp_path / "input_file"

    for original in input_dir.iterdir():
        payload = original.read_bytes()
        wrapped_input.write_bytes(prefix + payload + suffix)

        # Extract each input into its own directory, named after the
        # original file, below the configured extract root.
        per_input_config = attr.evolve(
            extraction_config,
            extract_root=extraction_config.extract_root / original.name,
        )
        check_result(process_file(per_input_config, wrapped_input))

        expected_dir = output_dir / (original.name + per_input_config.extract_suffix)
        check_output_do_not_change_much_due_to_extras(
            original,
            expected=expected_dir,
            actual=per_input_config.extract_root,
        )
91+
92+
93+
def hash_bytes(data: bytes) -> str:
    """Return the SHA-512 digest of ``data`` as a hexadecimal string."""
    hasher = hashlib.sha512()
    hasher.update(data)
    return hasher.hexdigest()
95+
96+
97+
def hash_dir(root: Path, print=lambda *_: None) -> Counter:
    """Count the content hashes of all files below the extraction directory ``root``.

    Only file contents are considered, so two trees holding the same set of
    files give the same result even when names or directory layout differ.

    The test prefix/suffix is ignored: files consisting of exactly one of the
    markers are skipped, and a leading ``BLOCK_SHIFTING_PREFIX`` or trailing
    ``PADDING_CHECK_SUFFIX`` is cut off before hashing, so that unknown chunks
    extended with them hash the same as without them.

    ``print`` is called once per hashed file with a debug line; the default
    discards it.

    Returns: a Counter mapping each content hash to its number of occurrences
    """
    prefix_len = len(BLOCK_SHIFTING_PREFIX)
    suffix_len = len(PADDING_CHECK_SUFFIX)

    digests = Counter()
    for entry in sorted(root.rglob("*")):
        if not entry.is_file():
            continue
        if entry.name == ".gitkeep":
            continue

        data = entry.read_bytes()

        # A file that is nothing but one of the markers is a newly introduced
        # unknown chunk; it has no counterpart in the expected output.
        if data == BLOCK_SHIFTING_PREFIX or data == PADDING_CHECK_SUFFIX:
            continue

        # Strip the markers so the remaining content hashes like the original.
        if data.startswith(BLOCK_SHIFTING_PREFIX):
            data = data[prefix_len:]
        if data.endswith(PADDING_CHECK_SUFFIX):
            data = data[:-suffix_len]

        digest = hash_bytes(data)
        digests[digest] += 1
        # Output for debugging failures
        print(f" {entry} =\n {digest}")
    return digests
129+
130+
131+
def check_output_do_not_change_much_due_to_extras(
    original_input_file: Path, expected: Path, actual: Path
):
    """Assert that ``actual`` holds the same file contents as ``expected``.

    Both directories are reduced to content-hash counts with ``hash_dir`` and
    compared. A single copy of the unmodified input is tolerated in ``actual``
    when it does not occur in ``expected``.
    """
    print(f"{expected=}")
    expected_counts = hash_dir(expected, print=print)
    print(f"{actual=}")
    actual_counts = hash_dir(actual, print=print)

    # The original input shows up in the extraction of the modified input
    # because of the extra unknown chunks around it, but it might be missing
    # from the expected output if it was a "whole file chunk".
    input_digest = hash_bytes(original_input_file.read_bytes())
    if input_digest not in expected_counts:
        print(f"Warn: hash of original input: {input_digest} not in expected")
        assert actual_counts.get(input_digest, 0) <= 1
        # Drop it (if present) so it does not count as a difference below.
        actual_counts.pop(input_digest, None)

    assert expected_counts == actual_counts
148+
149+
42150
@pytest.mark.parametrize(
43151
"handler",
44152
(pytest.param(handler, id=handler.NAME) for handler in handlers.BUILTIN_HANDLERS),

unblob/testing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def gather_integration_tests(test_data_path: Path):
4141
@pytest.fixture
4242
def extraction_config(tmp_path: Path):
4343
config = ExtractionConfig(
44-
extract_root=tmp_path,
44+
extract_root=tmp_path / "extract_root",
4545
entropy_depth=0,
4646
keep_extracted_chunks=True,
4747
)

0 commit comments

Comments
 (0)