Skip to content

Commit 460b835

Browse files
authored
Merge pull request #1258 from wilypence/1250-pydantic-report
Refactored report format to use Pydantic
2 parents 4587d13 + 965fa39 commit 460b835

File tree

17 files changed

+388
-180
lines changed

17 files changed

+388
-180
lines changed

docs/guide.md

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,7 @@ $ cat alpine-report.json
7171
"task": {
7272
"path": "/home/walkman/Projects/unblob/demo/alpine-minirootfs-3.16.1-x86_64.tar.gz",
7373
"depth": 0,
74-
"chunk_id": "",
75-
"__typename__": "Task"
74+
"chunk_id": ""
7675
},
7776
"reports": [
7877
{
@@ -104,11 +103,9 @@ $ cat alpine-report.json
104103
{
105104
"path": "/home/walkman/Projects/unblob/demo/alpine-minirootfs-3.16.1-x86_64.tar.gz_extract",
106105
"depth": 1,
107-
"chunk_id": "13590:1",
108-
"__typename__": "Task"
106+
"chunk_id": "13590:1"
109107
}
110-
],
111-
"__typename__": "TaskResult"
108+
]
112109
},
113110
...
114111
]
@@ -144,7 +141,7 @@ $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution"
144141
2024-10-30 10:52.03 [debug ] Shannon entropy calculated block_size=0x20000 highest=99.99 lowest=99.98 mean=99.98 path=unknown-file_extract/0-10485760.unknown pid=1963719 size=0xa00000
145142
2024-10-30 10:52.03 [debug ] Chi square probability calculated block_size=0x20000 highest=97.88 lowest=3.17 mean=52.76 path=unknown-file_extract/0-10485760.unknown pid=1963719 size=0xa00000
146143
2024-10-30 10:52.03 [debug ] Entropy chart chart=
147-
Randomness distribution
144+
Randomness distribution
148145
┌───────────────────────────────────────────────────────────────────────────┐
149146
100┤ •• Shannon entropy (%) •••••••••♰••••••••••••••••••••••••••••••••••│
150147
90┤ ♰♰ Chi square probability (%) ♰ ♰ ♰♰♰♰ ♰ ♰ ♰ │
@@ -158,16 +155,16 @@ $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution"
158155
10┤ ♰ ♰ ♰ ♰ ♰ ♰♰ ♰ ♰ ♰♰ │
159156
0┤ ♰ ♰ │
160157
└─┬──┬─┬──┬────┬───┬──┬──┬──┬───┬───┬──┬────┬───┬────┬──┬──┬────┬──┬───┬──┬─┘
161-
0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79
162-
131072 bytes
158+
0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79
159+
131072 bytes
163160
path=unknown-file_extract/0-10485760.unknown pid=1963719
164161
2024-10-30 10:52.03 [info ] Extracting unknown chunk chunk=0xc96196-0x1696196 path=unknown-file_extract/13197718-23683478.unknown pid=1963719
165162
2024-10-30 10:52.03 [debug ] Carving chunk path=unknown-file_extract/13197718-23683478.unknown pid=1963719
166163
2024-10-30 10:52.03 [debug ] Calculating randomness for file path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000
167164
2024-10-30 10:52.03 [debug ] Shannon entropy calculated block_size=0x20000 highest=99.99 lowest=99.98 mean=99.98 path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000
168165
2024-10-30 10:52.03 [debug ] Chi square probability calculated block_size=0x20000 highest=99.03 lowest=0.23 mean=42.62 path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000
169166
2024-10-30 10:52.03 [debug ] Entropy chart chart=
170-
Randomness distribution
167+
Randomness distribution
171168
┌───────────────────────────────────────────────────────────────────────────┐
172169
100┤ •• Shannon entropy (%) •••••••••••••••••••••♰••••••••••••••••••••••│
173170
90┤ ♰♰ Chi square probability (%) ♰ ♰♰ ♰ │
@@ -181,8 +178,8 @@ $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution"
181178
10┤ ♰ ♰ ♰ ♰ ♰ ♰ ♰ ♰♰ ♰ ♰♰ ♰♰ ♰♰ ♰ ♰ ♰ │
182179
0┤ ♰ ♰ ♰♰ ♰ ♰♰ │
183180
└─┬──┬─┬──┬────┬───┬──┬──┬──┬───┬───┬──┬────┬───┬────┬──┬──┬────┬──┬───┬──┬─┘
184-
0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79
185-
131072 bytes
181+
0 2 5 7 11 16 20 23 27 30 34 38 42 47 51 56 60 63 68 71 76 79
182+
131072 bytes
186183
```
187184

188185
### Skip extraction with file magic

fuzzing/search_chunks_fuzzer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def test_search_chunks(data):
5959
depth=0,
6060
blob_id="",
6161
)
62-
result = TaskResult(task)
62+
result = TaskResult(task=task)
6363
search_chunks(file, len(data), config.handlers, result)
6464

6565

package.nix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ python3.pkgs.buildPythonApplication {
8686
python3.pkgs.lz4 # shadowed by pkgs.lz4
8787
plotext
8888
pluggy
89+
pydantic
8990
pyfatfs
9091
pymdown-extensions
9192
pyperscan

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ dependencies = [
1616
"lz4>=4.3.2,!=4.4.3", # 4.4.3 doesn't have aarch64 wheels https://github.com/python-lz4/python-lz4/pull/298
1717
"plotext>=4.2.0,<6.0",
1818
"pluggy>=1.3.0",
19+
"pydantic>=2.0",
1920
"pyfatfs>=1.0.5",
2021
"pymdown-extensions>=10.15",
2122
"pyperscan>=0.3.0",

python/unblob/extractors/command.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,22 +64,22 @@ def no_op():
6464
exit_code=res.returncode,
6565
)
6666

67-
logger.error("Extract command failed", **error_report.asdict())
67+
logger.error("Extract command failed", **error_report.model_dump())
6868
raise ExtractError(error_report)
6969
except FileNotFoundError:
7070
error_report = ExtractorDependencyNotFoundReport(
7171
dependencies=self.get_dependencies()
7272
)
7373
logger.error(
7474
"Can't run extract command. Is the extractor installed?",
75-
**error_report.asdict(),
75+
**error_report.model_dump(),
7676
)
7777
raise ExtractError(error_report) from None
7878
except subprocess.TimeoutExpired as e:
7979
error_report = ExtractorTimedOut(cmd=e.cmd, timeout=e.timeout)
8080
logger.error(
8181
"Extract command timed out.",
82-
**error_report.asdict(),
82+
**error_report.model_dump(),
8383
)
8484
raise ExtractError(error_report) from None
8585
finally:

python/unblob/finder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def _calculate_chunk(
6363
)
6464
task_result.add_report(error_report)
6565
logger.error(
66-
"Unhandled Exception during chunk calculation", **error_report.asdict()
66+
"Unhandled Exception during chunk calculation", **error_report.model_dump()
6767
)
6868

6969

python/unblob/models.py

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import Generic, Optional, TypeVar, Union
99

1010
import attrs
11+
from pydantic import BaseModel, TypeAdapter
1112
from structlog import get_logger
1213

1314
from .file_utils import Endian, File, InvalidInputFormat, StructParser
@@ -61,12 +62,11 @@ def __post_init__(self):
6162
self.fully_supported = len(self.limitations) == 0
6263

6364

64-
@attrs.define(frozen=True)
65-
class Task:
65+
class Task(BaseModel):
6666
path: Path
6767
depth: int
6868
blob_id: str
69-
is_multi_file: bool = attrs.field(default=False)
69+
is_multi_file: bool = False
7070

7171

7272
@attrs.define
@@ -228,11 +228,10 @@ def as_report(self, extraction_reports: list[Report]) -> MultiFileReport:
228228
ReportType = TypeVar("ReportType", bound=Report)
229229

230230

231-
@attrs.define
232-
class TaskResult:
231+
class TaskResult(BaseModel):
233232
task: Task
234-
reports: list[Report] = attrs.field(factory=list)
235-
subtasks: list[Task] = attrs.field(factory=list)
233+
reports: list[Report] = []
234+
subtasks: list[Task] = []
236235

237236
def add_report(self, report: Report):
238237
self.reports.append(report)
@@ -244,9 +243,8 @@ def filter_reports(self, report_class: type[ReportType]) -> list[ReportType]:
244243
return [report for report in self.reports if isinstance(report, report_class)]
245244

246245

247-
@attrs.define
248-
class ProcessResult:
249-
results: list[TaskResult] = attrs.field(factory=list)
246+
class ProcessResult(BaseModel):
247+
results: list[TaskResult] = []
250248

251249
@property
252250
def errors(self) -> list[ErrorReport]:
@@ -268,7 +266,9 @@ def register(self, result: TaskResult):
268266
self.results.append(result)
269267

270268
def to_json(self, indent=" "):
271-
return to_json(self.results, indent=indent)
269+
return json.dumps(
270+
[result.model_dump(mode="json") for result in self.results], indent=indent
271+
)
272272

273273
def get_output_dir(self) -> Optional[Path]:
274274
try:
@@ -285,37 +285,20 @@ def get_output_dir(self) -> Optional[Path]:
285285
return None
286286

287287

288-
class _JSONEncoder(json.JSONEncoder):
289-
def default(self, o):
290-
obj = o
291-
if attrs.has(type(obj)):
292-
extend_attr_output = True
293-
attr_output = attrs.asdict(obj, recurse=not extend_attr_output)
294-
attr_output["__typename__"] = obj.__class__.__name__
295-
return attr_output
296-
297-
if isinstance(obj, Enum):
298-
return obj.name
299-
300-
if isinstance(obj, Path):
301-
return str(obj)
302-
303-
if isinstance(obj, bytes):
304-
try:
305-
return obj.decode()
306-
except UnicodeDecodeError:
307-
return str(obj)
288+
ReportModel = list[TaskResult]
289+
ReportModelAdapter = TypeAdapter(ReportModel)
290+
"""Use this for deserialization (import JSON report back into Python
291+
objects) of the JSON report.
308292
309-
logger.error("JSONEncoder met a non-JSON encodable value", obj=obj)
310-
# the usual fail path of custom JSONEncoders is to call the parent and let it fail
311-
# return json.JSONEncoder.default(self, obj)
312-
# instead of failing, just return something usable
313-
return f"Non-JSON encodable value: {obj}"
293+
For example:
314294
295+
with open('report.json', 'r') as f:
296+
data = f.read()
297+
report_data = ReportModelAdapter.validate_json(data)
315298
316-
def to_json(obj, indent=" ") -> str:
317-
"""Encode any UnBlob object as a serialized JSON."""
318-
return json.dumps(obj, cls=_JSONEncoder, indent=indent)
299+
For another example, see:
300+
tests/test_models.py::Test_to_json::test_process_result_deserialization
301+
"""
319302

320303

321304
class ExtractError(Exception):

python/unblob/processing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ def __init__(self, config: ExtractionConfig):
242242
self._get_mime_type = magic.Magic(mime=True).from_file
243243

244244
def process_task(self, task: Task) -> TaskResult:
245-
result = TaskResult(task)
245+
result = TaskResult(task=task)
246246
try:
247247
self._process_task(result, task)
248248
except Exception as exc:
@@ -393,7 +393,7 @@ def _calculate_multifile(
393393
task_result.add_report(error_report)
394394
logger.warning(
395395
"Unhandled Exception during multi file calculation",
396-
**error_report.asdict(),
396+
**error_report.model_dump(),
397397
)
398398

399399
def _check_conflicting_files(

0 commit comments

Comments
 (0)