Skip to content

Commit adffec0

Browse files
committed
add ZSTD compression stream format support.
Support is provided by the zstandard library recommended by the ZSTD format maintainers (see https://python-zstandard.readthedocs.io/en/latest/decompressor.html). The implementation is similar to the one we have for gzip, except that we did not have to monkeypatch it this time. More information about ZSTD can be found at https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md
1 parent d6b0746 commit adffec0

File tree

2 files changed

+86
-1
lines changed

2 files changed

+86
-1
lines changed

unblob/handlers/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from ..models import Handlers
22
from .archive import ar, arc, arj, cab, cpio, dmg, rar, sevenzip, stuffit, tar, zip
3-
from .compression import bzip2, compress, gzip, lz4, lzh, lzip, lzma, lzo, xz
3+
from .compression import bzip2, compress, gzip, lz4, lzh, lzip, lzma, lzo, xz, zstd
44
from .executable import elf
55
from .filesystem import (
66
cramfs,
@@ -62,6 +62,7 @@
6262
lz4.SkippableFrameHandler,
6363
lz4.DefaultFrameHandler,
6464
xz.XZHandler,
65+
zstd.ZSTDHandler,
6566
elf.ELF32Handler,
6667
elf.ELF64Handler,
6768
)
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import io
2+
from typing import Optional
3+
4+
from structlog import get_logger
5+
6+
from unblob.extractors import Command
7+
8+
from ...file_utils import Endian, InvalidInputFormat, convert_int8
9+
from ...models import File, Handler, HexString, ValidChunk
10+
11+
# Module-level structlog logger (not referenced by the handler code visible in this diff).
logger = get_logger()
12+
13+
# Number of magic bytes skipped at the start of a frame; matches the 4-byte search pattern.
MAGIC_LEN = 4
14+
# Size in bytes of each block header read while walking the blocks of a frame.
BLOCK_HEADER_LEN = 3
15+
# Block type values, compared against the 2-bit type field (bits 1-2) of a block header.
RAW_BLOCK = 0
16+
RLE_BLOCK = 1
17+
COMPRESSED_BLOCK = 2
18+
# Dictionary ID field size in bytes, indexed by the 2-bit flag in the low bits of the
# frame header descriptor.
DICT_ID_FIELDSIZE_MAP = [0, 1, 2, 4]
19+
# Frame content size field size in bytes, indexed by the flag taken from the upper bits
# of the frame header descriptor.
FRAME_CONTENT_FIELDSIZE_MAP = [0, 2, 4, 8]
20+
21+
22+
class ZSTDHandler(Handler):
    """Locate Zstandard frames and compute where each one ends.

    A frame is laid out as: 4 magic bytes, a 1-byte frame header descriptor,
    a variable-size remainder of the frame header, a sequence of blocks (each
    introduced by a 3-byte little-endian block header) and an optional 4-byte
    content checksum.
    """

    NAME = "zstd"

    # Zstandard frame magic number (0xFD2FB528 stored little-endian).
    PATTERNS = [HexString("28 B5 2F FD")]

    EXTRACTOR = Command("zstd", "-d", "{inpath}", "-o", "{outdir}/{infile}")

    def get_frame_header_size(self, frame_header_descriptor: int) -> int:
        """Return the number of frame header bytes following the descriptor.

        The descriptor byte encodes:

        * bits 7-6: Frame_Content_Size flag (field is 0/2/4/8 bytes wide, or
          1 byte when the flag is 0 and the single-segment bit is set),
        * bit 5: Single_Segment flag (when clear, a 1-byte window descriptor
          is present),
        * bits 1-0: Dictionary_ID flag (field is 0/1/2/4 bytes wide).

        :param frame_header_descriptor: value of the descriptor byte (0-255).
        :return: combined size of the window descriptor, dictionary ID and
            frame content size fields, in bytes.
        """
        single_segment = (frame_header_descriptor >> 5) & 0b1
        dictionary_id_flag = frame_header_descriptor & 0b11
        # The Frame_Content_Size flag is two bits wide; masking with a single
        # bit would mis-size headers carrying a 4- or 8-byte content size.
        frame_content_size_flag = (frame_header_descriptor >> 6) & 0b11

        window_descriptor_size = int(not single_segment)
        # With flag 0 the content size field is normally absent, but a
        # single-segment frame still stores it in one byte.
        single_byte_content_size = int(
            bool(single_segment) and frame_content_size_flag == 0
        )
        return (
            window_descriptor_size
            + DICT_ID_FIELDSIZE_MAP[dictionary_id_flag]
            + FRAME_CONTENT_FIELDSIZE_MAP[frame_content_size_flag]
            + single_byte_content_size
        )

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Walk one Zstandard frame and return the chunk it occupies.

        :param file: seekable binary file containing the candidate frame.
        :param start_offset: offset of the frame's magic bytes within *file*.
        :return: a ``ValidChunk`` spanning from *start_offset* to the end of
            the frame (including the content checksum when present).
        :raises InvalidInputFormat: if the unused or reserved descriptor bits
            are set, a block has the reserved block type, or the data ends
            before the frame is complete.
        """
        file.seek(start_offset, io.SEEK_SET)
        file.seek(MAGIC_LEN, io.SEEK_CUR)

        raw_descriptor = file.read(1)
        if not raw_descriptor:
            raise InvalidInputFormat("Premature end of ZSTD stream.")
        frame_header_descriptor = convert_int8(raw_descriptor, Endian.LITTLE)

        unused_bit = (frame_header_descriptor >> 4) & 0b1
        reserved_bit = (frame_header_descriptor >> 3) & 0b1

        # these values MUST be zero per the standard
        if unused_bit != 0 or reserved_bit != 0:
            raise InvalidInputFormat("Invalid frame header format.")

        # Bit 2 announces a 4-byte checksum trailing the last block.
        content_checksum_flag = (frame_header_descriptor >> 2) & 0b1
        content_checksum_size = 4 if content_checksum_flag else 0

        frame_header_size = self.get_frame_header_size(frame_header_descriptor)
        file.seek(frame_header_size, io.SEEK_CUR)

        last_block = False
        while not last_block:
            raw_block_header = file.read(BLOCK_HEADER_LEN)
            # A short read means the data ended mid-frame. Without this check
            # an empty read decodes as a non-last, zero-sized raw block and
            # the loop would never terminate.
            if len(raw_block_header) < BLOCK_HEADER_LEN:
                raise InvalidInputFormat("Premature end of ZSTD stream.")

            block_header = int.from_bytes(raw_block_header, byteorder="little")
            last_block = bool(block_header & 0b1)
            block_type = (block_header >> 1) & 0b11

            if block_type in (RAW_BLOCK, COMPRESSED_BLOCK):
                block_size = block_header >> 3
            elif block_type == RLE_BLOCK:
                # An RLE block stores a single byte regardless of the size
                # recorded in its header.
                block_size = 1
            else:
                raise InvalidInputFormat("Invalid block type")
            file.seek(block_size, io.SEEK_CUR)

        file.seek(content_checksum_size, io.SEEK_CUR)

        return ValidChunk(
            start_offset=start_offset,
            end_offset=file.tell(),
        )

0 commit comments

Comments
 (0)