Skip to content

Commit e26dc7c

Browse files
authored
Merge pull request #401 from onekey-sec/custom-magic-file
Config option for custom magic file
2 parents 64cade4 + 19d7cd9 commit e26dc7c

File tree

2 files changed

+21
-22
lines changed

2 files changed

+21
-22
lines changed

unblob/processing.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Iterable, List, Optional
77

88
import attr
9+
import magic
910
import plotext as plt
1011
from structlog import get_logger
1112

@@ -67,6 +68,7 @@ class ExtractionConfig:
6768
keep_extracted_chunks: bool = False
6869
extract_suffix: str = "_extract"
6970
handlers: Handlers = BUILTIN_HANDLERS
71+
magic_file: Optional[Path] = None
7072

7173
def get_extract_dir_for(self, path: Path) -> Path:
7274
"""Extraction dir under root with the name of path."""
@@ -202,6 +204,20 @@ def write_json_report(report_file: Path, process_result: ProcessResult):
202204
class Processor:
203205
def __init__(self, config: ExtractionConfig):
204206
self._config = config
207+
# libmagic helpers
208+
# file magic uses a rule-set to guess the file type, however as rules are added they could
209+
# shadow each other. File magic uses rule priorities to determine which is the best matching
210+
# rule, however this could shadow other valid matches as well, which could eventually break
211+
# any further processing that depends on magic.
212+
# By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns
213+
# will be included in the magic string at the cost of being a bit slower, but increasing
214+
# accuracy by not shadowing rules.
215+
self._get_magic = magic.Magic(
216+
keep_going=True, magic_file=config.magic_file
217+
).from_file
218+
self._get_mime_type = magic.Magic(
219+
mime=True, magic_file=config.magic_file
220+
).from_file
205221

206222
def process_task(self, task: Task) -> TaskResult:
207223
result = TaskResult(task)
@@ -251,13 +267,13 @@ def _process_task(self, result: TaskResult, task: Task):
251267
log.debug("Ignoring empty file")
252268
return
253269

254-
magic_report = FileMagicReport.from_path(task.path)
255-
result.add_report(magic_report)
256-
257-
magic = magic_report.magic
258-
270+
magic = self._get_magic(task.path)
271+
mime_type = self._get_mime_type(task.path)
259272
logger.debug("Detected file-magic", magic=magic, path=task.path, _verbosity=2)
260273

274+
magic_report = FileMagicReport(magic=magic, mime_type=mime_type)
275+
result.add_report(magic_report)
276+
261277
should_skip_file = any(
262278
magic.startswith(pattern) for pattern in self._config.skip_magic
263279
)

unblob/report.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from typing import List, Optional, Union
77

88
import attr
9-
import magic
109

1110

1211
@attr.define(kw_only=True, frozen=True)
@@ -140,27 +139,11 @@ def from_path(cls, path: Path):
140139
)
141140

142141

143-
# libmagic helpers
144-
# file magic uses a rule-set to guess the file type, however as rules are added they could
145-
# shadow each other. File magic uses rule priorities to determine which is the best matching
146-
# rule, however this could shadow other valid matches as well, which could eventually break
147-
# any further processing that depends on magic.
148-
# By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns
149-
# will be included in the magic string at the cost of being a bit slower, but increasing
150-
# accuracy by no shadowing rules.
151-
get_magic = magic.Magic(keep_going=True).from_file
152-
get_mime_type = magic.Magic(mime=True).from_file
153-
154-
155142
@attr.define(kw_only=True)
156143
class FileMagicReport(Report):
157144
magic: str
158145
mime_type: str
159146

160-
@classmethod
161-
def from_path(cls, path: Path):
162-
return cls(magic=get_magic(path), mime_type=get_mime_type(path))
163-
164147

165148
@attr.define(kw_only=True)
166149
class ChunkReport(Report):

0 commit comments

Comments
 (0)