Skip to content

Commit 19d7cd9

Browse files
committed
Config option for custom magic file
unblob creates metadata about files, which includes the magic string and MIME type obtained via libmagic. When using unblob as a library, the user might want to control exactly which magic file is used, to get fixed results for these two fields. python-magic supports this with the magic_file init argument.
1 parent 64cade4 commit 19d7cd9

File tree

2 files changed

+21
-22
lines changed

2 files changed

+21
-22
lines changed

unblob/processing.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Iterable, List, Optional
77

88
import attr
9+
import magic
910
import plotext as plt
1011
from structlog import get_logger
1112

@@ -67,6 +68,7 @@ class ExtractionConfig:
6768
keep_extracted_chunks: bool = False
6869
extract_suffix: str = "_extract"
6970
handlers: Handlers = BUILTIN_HANDLERS
71+
magic_file: Optional[Path] = None
7072

7173
def get_extract_dir_for(self, path: Path) -> Path:
7274
"""Extraction dir under root with the name of path."""
@@ -202,6 +204,20 @@ def write_json_report(report_file: Path, process_result: ProcessResult):
202204
class Processor:
203205
def __init__(self, config: ExtractionConfig):
204206
self._config = config
207+
# libmagic helpers
208+
# file magic uses a rule-set to guess the file type, however as rules are added they could
209+
# shadow each other. File magic uses rule priorities to determine which is the best matching
210+
# rule, however this could shadow other valid matches as well, which could eventually break
211+
# any further processing that depends on magic.
212+
# By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns
213+
# will be included in the magic string at the cost of being a bit slower, but increasing
214+
# accuracy by not shadowing rules.
215+
self._get_magic = magic.Magic(
216+
keep_going=True, magic_file=config.magic_file
217+
).from_file
218+
self._get_mime_type = magic.Magic(
219+
mime=True, magic_file=config.magic_file
220+
).from_file
205221

206222
def process_task(self, task: Task) -> TaskResult:
207223
result = TaskResult(task)
@@ -251,13 +267,13 @@ def _process_task(self, result: TaskResult, task: Task):
251267
log.debug("Ignoring empty file")
252268
return
253269

254-
magic_report = FileMagicReport.from_path(task.path)
255-
result.add_report(magic_report)
256-
257-
magic = magic_report.magic
258-
270+
magic = self._get_magic(task.path)
271+
mime_type = self._get_mime_type(task.path)
259272
logger.debug("Detected file-magic", magic=magic, path=task.path, _verbosity=2)
260273

274+
magic_report = FileMagicReport(magic=magic, mime_type=mime_type)
275+
result.add_report(magic_report)
276+
261277
should_skip_file = any(
262278
magic.startswith(pattern) for pattern in self._config.skip_magic
263279
)

unblob/report.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from typing import List, Optional, Union
77

88
import attr
9-
import magic
109

1110

1211
@attr.define(kw_only=True, frozen=True)
@@ -140,27 +139,11 @@ def from_path(cls, path: Path):
140139
)
141140

142141

143-
# libmagic helpers
144-
# file magic uses a rule-set to guess the file type, however as rules are added they could
145-
# shadow each other. File magic uses rule priorities to determine which is the best matching
146-
# rule, however this could shadow other valid matches as well, which could eventually break
147-
# any further processing that depends on magic.
148-
# By enabling keep_going (which eventually enables MAGIC_CONTINUE) all matching patterns
149-
# will be included in the magic string at the cost of being a bit slower, but increasing
150-
# accuracy by no shadowing rules.
151-
get_magic = magic.Magic(keep_going=True).from_file
152-
get_mime_type = magic.Magic(mime=True).from_file
153-
154-
155142
@attr.define(kw_only=True)
156143
class FileMagicReport(Report):
157144
magic: str
158145
mime_type: str
159146

160-
@classmethod
161-
def from_path(cls, path: Path):
162-
return cls(magic=get_magic(path), mime_type=get_mime_type(path))
163-
164147

165148
@attr.define(kw_only=True)
166149
class ChunkReport(Report):

0 commit comments

Comments
 (0)