Skip to content

Commit d7351fb

Browse files
kukovecz authored and qkaiser committed
feat(processing): Re-add non-POSIX file path handling
The valid_path function existed, and was used, solely to keep unblob from handling files with non-POSIX paths; it was added in commit e903602 because of yara-python limitations. Since we are no longer using yara-python, these limitations can now be lifted! The only consequence I discovered during testing is that the logger's formatting needed to be extended. It was already implemented to encode Path-strings with surrogateescape, but paths can also be logged inside plain strings (for example, when logging the extractor command, which contains the path of the file to be extracted).
1 parent 5a65d66 commit d7351fb

File tree

5 files changed

+42
-29
lines changed

5 files changed

+42
-29
lines changed

tests/test_file_utils.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import io
2-
from pathlib import Path
32
from typing import List
43

54
import pytest
@@ -19,7 +18,6 @@
1918
iterate_patterns,
2019
round_down,
2120
round_up,
22-
valid_path,
2321
)
2422

2523

@@ -340,16 +338,3 @@ def test_get_endian_resets_the_file_pointer(self):
340338
with pytest.raises(InvalidInputFormat):
341339
get_endian(file, 0xFFFF_0000)
342340
assert file.tell() == pos
343-
344-
345-
@pytest.mark.parametrize(
346-
"content, expected",
347-
[
348-
pytest.param("some_random_file.txt", True, id="valid_unicode_path"),
349-
pytest.param(
350-
"some/random/file\udce4\udc94.txt", False, id="invalid_unicode_path"
351-
),
352-
],
353-
)
354-
def test_valid_path(content: str, expected: bool):
355-
assert valid_path(Path(content)) == expected

tests/test_processing.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import platform
12
import sys
23
import zipfile
34
from pathlib import Path
@@ -16,7 +17,7 @@
1617
process_file,
1718
remove_inner_chunks,
1819
)
19-
from unblob.report import ExtractDirectoryExistsReport
20+
from unblob.report import ExtractDirectoryExistsReport, StatReport
2021

2122

2223
def assert_same_chunks(expected, actual, explanation=None):
@@ -277,3 +278,36 @@ def test_process_file_prevents_double_extracts(tmp_path: Path, fw: Path):
277278
assert outsiders == [extracted_fw_zip]
278279

279280
assert extracted_extracted_fw_paths == [Path("."), *extracted_fw_paths]
281+
282+
283+
@pytest.mark.skipif(
284+
platform.system() == "Darwin", reason="non-POSIX path not supported"
285+
)
286+
def test_processing_with_non_posix_paths(tmp_path: Path):
287+
non_unicode_file = tmp_path / "file-\udce4\udc94"
288+
non_unicode_file.write_bytes(b"content")
289+
290+
directory = tmp_path / "dir-\udce4\udc94"
291+
directory.mkdir(exist_ok=True)
292+
file_with_non_unicode_dir = directory / "test.txt"
293+
file_with_non_unicode_dir.write_bytes(b"content")
294+
295+
extract_root = tmp_path / "extract_root"
296+
config = ExtractionConfig(extract_root=extract_root, entropy_depth=0)
297+
298+
for path in (non_unicode_file, file_with_non_unicode_dir):
299+
process_result = process_file(config, path)
300+
assert process_result.errors == []
301+
assert len(process_result.results) == 1
302+
assert len(process_result.results[0].reports) == 3
303+
304+
report = process_result.results[0].reports[0]
305+
assert isinstance(report, StatReport)
306+
assert report == StatReport(
307+
path=path,
308+
size=7,
309+
is_dir=False,
310+
is_file=True,
311+
is_link=False,
312+
link_target=None,
313+
)

unblob/file_utils.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -311,11 +311,3 @@ def read_until_past(file: File, pattern: bytes):
311311
return file.tell()
312312
if next_byte not in pattern:
313313
return file.tell() - 1
314-
315-
316-
def valid_path(path: Path) -> bool:
317-
try:
318-
path.as_posix().encode("utf-8")
319-
except UnicodeEncodeError:
320-
return False
321-
return True

unblob/logging.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,12 @@ def _format_message(value: Any, extract_root: Path) -> Any:
4747
if isinstance(value, int):
4848
return format_hex(value)
4949

50+
if isinstance(value, str):
51+
try:
52+
value.encode()
53+
except UnicodeEncodeError:
54+
return value.encode("utf-8", errors="surrogateescape")
55+
5056
return value
5157

5258

unblob/processing.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from unblob.handlers import BUILTIN_HANDLERS, Handlers
1414

1515
from .extractor import carve_unknown_chunk, carve_valid_chunk, fix_extracted_directory
16-
from .file_utils import iterate_file, valid_path
16+
from .file_utils import iterate_file
1717
from .finder import search_chunks
1818
from .iter_utils import pairwise
1919
from .logging import noformat
@@ -244,10 +244,6 @@ def _process_task(self, result: TaskResult, task: Task):
244244
log.debug("Reached maximum depth, stop further processing")
245245
return
246246

247-
if not valid_path(task.path):
248-
log.warning("Path contains invalid characters, it won't be processed")
249-
return
250-
251247
if stat_report.is_dir:
252248
log.debug("Found directory")
253249
for path in task.path.iterdir():

0 commit comments

Comments
 (0)