onekey-sec
diff --git a/‎docs/guide.md‎
Lines changed: 9 additions & 12 deletions b/‎docs/guide.md‎
Lines changed: 9 additions & 12 deletions
diff --git a/‎fuzzing/search_chunks_fuzzer.py‎
Lines changed: 1 addition & 1 deletion b/‎fuzzing/search_chunks_fuzzer.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎package.nix‎
Lines changed: 1 addition & 0 deletions b/‎package.nix‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎python/unblob/extractors/command.py‎
Lines changed: 3 additions & 3 deletions b/‎python/unblob/extractors/command.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎python/unblob/finder.py‎
Lines changed: 1 addition & 1 deletion b/‎python/unblob/finder.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/unblob/models.py‎
Lines changed: 22 additions & 39 deletions b/‎python/unblob/models.py‎
Lines changed: 22 additions & 39 deletions
diff --git a/‎python/unblob/processing.py‎
Lines changed: 2 additions & 2 deletions b/‎python/unblob/processing.py‎
Lines changed: 2 additions & 2 deletions
@@ -71,8 +71,7 @@ $ cat alpine-report.json
     "task": {
       "path": "/home/walkman/Projects/unblob/demo/alpine-minirootfs-3.16.1-x86_64.tar.gz",
       "depth": 0,
-      "chunk_id": "",
-      "__typename__": "Task"
+      "chunk_id": ""
     },
     "reports": [
       {
@@ -104,11 +103,9 @@ $ cat alpine-report.json
       {
         "path": "/home/walkman/Projects/unblob/demo/alpine-minirootfs-3.16.1-x86_64.tar.gz_extract",
         "depth": 1,
-        "chunk_id": "13590:1",
-        "__typename__": "Task"
+        "chunk_id": "13590:1"
       }
-    ],
-    "__typename__": "TaskResult"
+    ]
   },
   ...
 ]
@@ -144,7 +141,7 @@ $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution"
 2024-10-30 10:52.03 [debug    ] Shannon entropy calculated     block_size=0x20000 highest=99.99 lowest=99.98 mean=99.98 path=unknown-file_extract/0-10485760.unknown pid=1963719 size=0xa00000
 2024-10-30 10:52.03 [debug    ] Chi square probability calculated block_size=0x20000 highest=97.88 lowest=3.17 mean=52.76 path=unknown-file_extract/0-10485760.unknown pid=1963719 size=0xa00000
 2024-10-30 10:52.03 [debug    ] Entropy chart                  chart=
-                              Randomness distribution                           
+                              Randomness distribution
    ┌───────────────────────────────────────────────────────────────────────────┐
 100┤ •• Shannon entropy (%)        •••••••••♰••••••••••••••••••••••••••••••••••│
  90┤ ♰♰ Chi square probability (%)   ♰ ♰ ♰♰♰♰                    ♰    ♰  ♰     │
@@ -158,16 +155,16 @@ $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution"
  10┤       ♰      ♰    ♰  ♰  ♰     ♰♰    ♰         ♰                   ♰♰      │
   0┤                                ♰                                   ♰      │
    └─┬──┬─┬──┬────┬───┬──┬──┬──┬───┬───┬──┬────┬───┬────┬──┬──┬────┬──┬───┬──┬─┘
-   0 2  5 7 11   16  20 23 27 30  34  38 42   47  51   56 60 63   68 71  76 79  
-                                   131072 bytes                                 
+   0 2  5 7 11   16  20 23 27 30  34  38 42   47  51   56 60 63   68 71  76 79
+                                   131072 bytes
  path=unknown-file_extract/0-10485760.unknown pid=1963719
 2024-10-30 10:52.03 [info     ] Extracting unknown chunk       chunk=0xc96196-0x1696196 path=unknown-file_extract/13197718-23683478.unknown pid=1963719
 2024-10-30 10:52.03 [debug    ] Carving chunk                  path=unknown-file_extract/13197718-23683478.unknown pid=1963719
 2024-10-30 10:52.03 [debug    ] Calculating randomness for file path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000
 2024-10-30 10:52.03 [debug    ] Shannon entropy calculated     block_size=0x20000 highest=99.99 lowest=99.98 mean=99.98 path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000
 2024-10-30 10:52.03 [debug    ] Chi square probability calculated block_size=0x20000 highest=99.03 lowest=0.23 mean=42.62 path=unknown-file_extract/13197718-23683478.unknown pid=1963719 size=0xa00000
 2024-10-30 10:52.03 [debug    ] Entropy chart                  chart=
-                              Randomness distribution                           
+                              Randomness distribution
    ┌───────────────────────────────────────────────────────────────────────────┐
 100┤ •• Shannon entropy (%)        •••••••••••••••••••••♰••••••••••••••••••••••│
  90┤ ♰♰ Chi square probability (%)         ♰           ♰♰            ♰         │
@@ -181,8 +178,8 @@ $ unblob -vvv unknown-file | grep -C 15 "Entropy distribution"
  10┤     ♰                ♰    ♰       ♰ ♰  ♰ ♰ ♰♰   ♰ ♰♰     ♰♰ ♰♰   ♰  ♰ ♰   │
   0┤                                           ♰ ♰    ♰♰          ♰       ♰♰   │
    └─┬──┬─┬──┬────┬───┬──┬──┬──┬───┬───┬──┬────┬───┬────┬──┬──┬────┬──┬───┬──┬─┘
-   0 2  5 7 11   16  20 23 27 30  34  38 42   47  51   56 60 63   68 71  76 79  
-                                   131072 bytes 
+   0 2  5 7 11   16  20 23 27 30  34  38 42   47  51   56 60 63   68 71  76 79
+                                   131072 bytes
 ```
 
 ### Skip extraction with file magic
 
@@ -59,7 +59,7 @@ def test_search_chunks(data):
             depth=0,
             blob_id="",
         )
-        result = TaskResult(task)
+        result = TaskResult(task=task)
         search_chunks(file, len(data), config.handlers, result)
 
 
 
@@ -86,6 +86,7 @@ python3.pkgs.buildPythonApplication {
     python3.pkgs.lz4 # shadowed by pkgs.lz4
     plotext
     pluggy
+    pydantic
     pyfatfs
     pymdown-extensions
     pyperscan
 
@@ -16,6 +16,7 @@ dependencies = [
   "lz4>=4.3.2,!=4.4.3",        # 4.4.3 doesn't have aarch64 wheels https://github.com/python-lz4/python-lz4/pull/298
   "plotext>=4.2.0,<6.0",
   "pluggy>=1.3.0",
+  "pydantic>=2.0",
   "pyfatfs>=1.0.5",
   "pymdown-extensions>=10.15",
   "pyperscan>=0.3.0",
 
@@ -64,22 +64,22 @@ def no_op():
                     exit_code=res.returncode,
                 )
 
-                logger.error("Extract command failed", **error_report.asdict())
+                logger.error("Extract command failed", **error_report.model_dump())
                 raise ExtractError(error_report)
         except FileNotFoundError:
             error_report = ExtractorDependencyNotFoundReport(
                 dependencies=self.get_dependencies()
             )
             logger.error(
                 "Can't run extract command. Is the extractor installed?",
-                **error_report.asdict(),
+                **error_report.model_dump(),
             )
             raise ExtractError(error_report) from None
         except subprocess.TimeoutExpired as e:
             error_report = ExtractorTimedOut(cmd=e.cmd, timeout=e.timeout)
             logger.error(
                 "Extract command timed out.",
-                **error_report.asdict(),
+                **error_report.model_dump(),
             )
             raise ExtractError(error_report) from None
         finally:
 
@@ -63,7 +63,7 @@ def _calculate_chunk(
         )
         task_result.add_report(error_report)
         logger.error(
-            "Unhandled Exception during chunk calculation", **error_report.asdict()
+            "Unhandled Exception during chunk calculation", **error_report.model_dump()
         )
 
 
 
@@ -8,6 +8,7 @@
 from typing import Generic, Optional, TypeVar, Union
 
 import attrs
+from pydantic import BaseModel, TypeAdapter
 from structlog import get_logger
 
 from .file_utils import Endian, File, InvalidInputFormat, StructParser
@@ -61,12 +62,11 @@ def __post_init__(self):
         self.fully_supported = len(self.limitations) == 0
 
 
-@attrs.define(frozen=True)
-class Task:
+class Task(BaseModel):
     path: Path
     depth: int
     blob_id: str
-    is_multi_file: bool = attrs.field(default=False)
+    is_multi_file: bool = False
 
 
 @attrs.define
@@ -228,11 +228,10 @@ def as_report(self, extraction_reports: list[Report]) -> MultiFileReport:
 ReportType = TypeVar("ReportType", bound=Report)
 
 
-@attrs.define
-class TaskResult:
+class TaskResult(BaseModel):
     task: Task
-    reports: list[Report] = attrs.field(factory=list)
-    subtasks: list[Task] = attrs.field(factory=list)
+    reports: list[Report] = []
+    subtasks: list[Task] = []
 
     def add_report(self, report: Report):
         self.reports.append(report)
@@ -244,9 +243,8 @@ def filter_reports(self, report_class: type[ReportType]) -> list[ReportType]:
         return [report for report in self.reports if isinstance(report, report_class)]
 
 
-@attrs.define
-class ProcessResult:
-    results: list[TaskResult] = attrs.field(factory=list)
+class ProcessResult(BaseModel):
+    results: list[TaskResult] = []
 
     @property
     def errors(self) -> list[ErrorReport]:
@@ -268,7 +266,9 @@ def register(self, result: TaskResult):
         self.results.append(result)
 
     def to_json(self, indent="  "):
-        return to_json(self.results, indent=indent)
+        return json.dumps(
+            [result.model_dump(mode="json") for result in self.results], indent=indent
+        )
 
     def get_output_dir(self) -> Optional[Path]:
         try:
@@ -285,37 +285,20 @@ def get_output_dir(self) -> Optional[Path]:
             return None
 
 
-class _JSONEncoder(json.JSONEncoder):
-    def default(self, o):
-        obj = o
-        if attrs.has(type(obj)):
-            extend_attr_output = True
-            attr_output = attrs.asdict(obj, recurse=not extend_attr_output)
-            attr_output["__typename__"] = obj.__class__.__name__
-            return attr_output
-
-        if isinstance(obj, Enum):
-            return obj.name
-
-        if isinstance(obj, Path):
-            return str(obj)
-
-        if isinstance(obj, bytes):
-            try:
-                return obj.decode()
-            except UnicodeDecodeError:
-                return str(obj)
+ReportModel = list[TaskResult]
+ReportModelAdapter = TypeAdapter(ReportModel)
+"""Use this to deserialize the JSON report, i.e. to import a JSON
+report back into Python objects.
 
-        logger.error("JSONEncoder met a non-JSON encodable value", obj=obj)
-        # the usual fail path of custom JSONEncoders is to call the parent and let it fail
-        #     return json.JSONEncoder.default(self, obj)
-        # instead of failing, just return something usable
-        return f"Non-JSON encodable value: {obj}"
+For example:
 
+with open('report.json', 'r') as f:
+    data = f.read()
+    report_data = ReportModelAdapter.validate_json(data)
 
-def to_json(obj, indent="  ") -> str:
-    """Encode any UnBlob object as a serialized JSON."""
-    return json.dumps(obj, cls=_JSONEncoder, indent=indent)
+For another example see:
+tests/test_models.py::Test_to_json::test_process_result_deserialization
+"""
 
 
 class ExtractError(Exception):
 
@@ -242,7 +242,7 @@ def __init__(self, config: ExtractionConfig):
         self._get_mime_type = magic.Magic(mime=True).from_file
 
     def process_task(self, task: Task) -> TaskResult:
-        result = TaskResult(task)
+        result = TaskResult(task=task)
         try:
             self._process_task(result, task)
         except Exception as exc:
@@ -393,7 +393,7 @@ def _calculate_multifile(
             task_result.add_report(error_report)
             logger.warning(
                 "Unhandled Exception during multi file calculation",
-                **error_report.asdict(),
+                **error_report.model_dump(),
             )
 
     def _check_conflicting_files(
Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,7 @@ def test_search_chunks(data):`
`59`	`59`	`depth=0,`
`60`	`60`	`blob_id="",`
`61`	`61`	`)`
`62`		`- result = TaskResult(task)`
	`62`	`+ result = TaskResult(task=task)`
`63`	`63`	`search_chunks(file, len(data), config.handlers, result)`
`64`	`64`
`65`	`65`
Original file line number	Diff line number	Diff line change
`@@ -63,7 +63,7 @@ def _calculate_chunk(`
`63`	`63`	`)`
`64`	`64`	`task_result.add_report(error_report)`
`65`	`65`	`logger.error(`
`66`		`- "Unhandled Exception during chunk calculation", **error_report.asdict()`
	`66`	`+ "Unhandled Exception during chunk calculation", **error_report.model_dump()`
`67`	`67`	`)`
`68`	`68`
`69`	`69`