Skip to content

Commit 96a4aff

Browse files
committed
fix(handlers): add support for unix-compatible (aka v7) tar files.
v7 tar headers do not have the 'ustar' magic that we match on for modern tar files. In order to match on those v7 archives, we build a regular expression that matches on the mode, uid, gid, mtime, and size fields given their properties: - fixed size (e.g. 8 for mode) - optionally prepended by whitespace - suffixed by null bytes (null terminated) - ASCII encoded octal digits (0x30 to 0x37) In order to build a pattern that can be handled by hyperscan without using a notation such as '[\w]{1,99}' for the path name (see below for a detailed explanation), we rely on a utility function to build a regular expression that matches all possible combinations using the OR (|) operator. Note: hyperscan will yield a "Pattern is too large" exception when trying to use the '{1,99}' notation. Even though we found out that using '.*' works, it would have an important performance impact on pattern matching. That's why we decided to go with the OR operator approach with combinations. See https://intel.github.io/hyperscan/dev-reference/compilation.html for more information about this.
1 parent aee811b commit 96a4aff

File tree

9 files changed

+174
-14
lines changed

9 files changed

+174
-14
lines changed

tests/handlers/archive/test_tar.py

Lines changed: 77 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
from helpers import unhex
33

44
from unblob.file_utils import File
5-
from unblob.handlers.archive.tar import TarHandler, _get_tar_end_offset
5+
from unblob.handlers.archive.tar import (
6+
TarUnixHandler,
7+
TarUstarHandler,
8+
_get_tar_end_offset,
9+
)
610

711
GNU_TAR_CONTENTS = unhex(
812
"""\
@@ -120,6 +124,58 @@
120124
"""
121125
)
122126

127+
UNIX_TAR_CONTENT = unhex(
128+
"""\
129+
00000000 66 72 75 69 74 73 2f 00 00 00 00 00 00 00 00 00 |fruits/.........|
130+
00000010 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
131+
*
132+
00000060 00 00 00 00 30 30 30 30 37 37 35 00 30 30 30 31 |....0000775.0001|
133+
00000070 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
134+
00000080 30 30 30 30 30 30 30 00 31 34 35 30 34 32 36 32 |0000000.14504262|
135+
00000090 30 37 37 00 30 30 37 34 30 34 00 20 35 00 00 00 |077.007404. 5...|
136+
000000a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
137+
*
138+
00000140 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
139+
00000150 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
140+
00000160 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
141+
*
142+
00000200 66 72 75 69 74 73 2f 61 70 70 6c 65 2e 74 78 74 |fruits/apple.txt|
143+
00000210 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
144+
*
145+
00000260 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001|
146+
00000270 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
147+
00000280 30 30 30 30 30 30 36 00 31 34 35 30 34 32 36 32 |0000006.14504262|
148+
00000290 30 37 31 00 30 31 31 31 35 34 00 20 00 00 00 00 |071.011154. ....|
149+
000002a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
150+
*
151+
00000340 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
152+
00000350 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
153+
00000360 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
154+
*
155+
00000400 61 70 70 6c 65 0a 00 00 00 00 00 00 00 00 00 00 |apple...........|
156+
00000410 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
157+
*
158+
00000600 66 72 75 69 74 73 2f 63 68 65 72 72 79 2e 74 78 |fruits/cherry.tx|
159+
00000610 74 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |t...............|
160+
00000620 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
161+
*
162+
00000660 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001|
163+
00000670 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
164+
00000680 30 30 30 30 30 30 37 00 31 34 35 30 34 32 36 32 |0000007.14504262|
165+
00000690 30 37 37 00 30 31 31 33 35 36 00 20 00 00 00 00 |077.011356. ....|
166+
000006a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
167+
*
168+
00000740 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
169+
00000750 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
170+
00000760 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
171+
*
172+
00000800 63 68 65 72 72 79 0a 00 00 00 00 00 00 00 00 00 |cherry..........|
173+
00000810 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
174+
*
175+
00002800
176+
"""
177+
)
178+
123179
PADDING_TO_DEFAULT_BLOCKING_FACTOR = unhex(
124180
"""\
125181
00000400 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
@@ -303,12 +359,30 @@ def test_different_blocking_factor():
303359
pytest.param(b"some prefix ", id="nonzero-prefix"),
304360
],
305361
)
306-
def test_calculate_chunk(prefix):
362+
def test_calculate_chunk_ustar(prefix):
307363
tar_file = File.from_bytes(prefix + GNU_TAR_CONTENTS)
308-
handler = TarHandler()
364+
handler = TarUstarHandler()
309365

310366
chunk = handler.calculate_chunk(tar_file, len(prefix))
311367

312368
assert chunk is not None
313369
assert chunk.start_offset == len(prefix)
314370
assert chunk.end_offset == len(prefix) + len(GNU_TAR_CONTENTS)
371+
372+
373+
@pytest.mark.parametrize(
    "prefix",
    [
        pytest.param(b"", id="zero-prefix"),
        pytest.param(b"some prefix ", id="nonzero-prefix"),
    ],
)
def test_calculate_chunk_unix(prefix):
    """Check that a v7 (unix) tar is carved exactly, with or without a prefix.

    The archive bytes are placed right after ``prefix``; the chunk reported by
    the handler must start at the end of the prefix and span the whole archive.
    """
    start = len(prefix)
    archive = File.from_bytes(prefix + UNIX_TAR_CONTENT)

    chunk = TarUnixHandler().calculate_chunk(archive, start)

    assert chunk is not None
    assert chunk.start_offset == start
    assert chunk.end_offset == start + len(UNIX_TAR_CONTENT)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:f799945e335bcd22cae7f2a53033781b8a181a266a12bd04d6ab3ed3f5ba1fd2
3+
size 10240
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:7592083f2355ad7e207557efabb3594bf62c9e39677298e8265766a37d835c39
3+
size 8
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:bb6f92894363eceff37d58287d2b8b37bb17a00b320312ed59924a8ec07004a6
3+
size 8
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:b4d09282b5ac47cc58aa5dc4fe3e8d6829ac737adfe49ee32efee9cf4bf6cdf3
3+
size 8
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:7843a44cca6d57497113ed505d027b9f5ffac78f1de7809f81ae9b314d943e79
3+
size 8

unblob/extractors/command.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ def _make_extract_command(self, inpath: Path, outdir: Path):
9696
raise InvalidCommandTemplate("Invalid template placeholder", t) from k
9797
except ValueError as v:
9898
raise InvalidCommandTemplate("The template is malformed", t) from v
99-
10099
return args
101100

102101
def get_dependencies(self) -> List[str]:

unblob/handlers/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@
7070
arc.ARCHandler,
7171
arj.ARJHandler,
7272
cab.CABHandler,
73-
tar.TarHandler,
73+
tar.TarUstarHandler,
74+
tar.TarUnixHandler,
7475
cpio.PortableASCIIHandler,
7576
cpio.PortableASCIIWithCRCHandler,
7677
cpio.PortableOldASCIIHandler,

unblob/handlers/archive/tar.py

Lines changed: 80 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
ExtractResult,
1313
File,
1414
HexString,
15+
Regex,
1516
StructHandler,
1617
ValidChunk,
1718
)
@@ -98,17 +99,10 @@ def extract(self, inpath: Path, outdir: Path):
9899
return ExtractResult(reports=tarfile.reports)
99100

100101

101-
class TarHandler(StructHandler):
102+
class _TarHandler(StructHandler):
102103
NAME = "tar"
103104

104-
PATTERNS = [
105-
HexString("75 73 74 61 72 20 20 00"),
106-
HexString("75 73 74 61 72 00 30 30"),
107-
]
108-
109-
# Since the magic is at 257, we have to subtract that from the match offset
110-
# to get to the start of the file.
111-
PATTERN_MATCH_OFFSET = -MAGIC_OFFSET
105+
PATTERNS = []
112106

113107
C_DEFINITIONS = r"""
114108
typedef struct posix_header
@@ -146,3 +140,80 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
146140
if end_offset == -1:
147141
return None
148142
return ValidChunk(start_offset=start_offset, end_offset=end_offset)
143+
144+
145+
class TarUstarHandler(_TarHandler):
    """Tar handler that is triggered by the ``ustar`` magic bytes."""

    # The magic lives 257 bytes into the header, so the match offset is moved
    # back by that amount to point at the start of the file.
    PATTERN_MATCH_OFFSET = -MAGIC_OFFSET

    # Both patterns start with the ASCII bytes "ustar" (75 73 74 61 72) and
    # differ only in the three bytes that follow.
    PATTERNS = [
        HexString("75 73 74 61 72 20 20 00"),
        HexString("75 73 74 61 72 00 30 30"),
    ]
154+
155+
156+
def _re_frame(regexp: str):
157+
"""Wrap regexp to ensure its integrity from concatenation.
158+
159+
E.g.: when the regex
160+
a|b
161+
is naively appended by regex c, the result
162+
a|bc
163+
will not match "ac", while
164+
(a|b)c
165+
will match "ac" as intended.
166+
"""
167+
return f"({regexp})"
168+
169+
170+
def _re_alternatives(regexps):
    """Return one framed regex that matches any of the given ``regexps``.

    Each alternative is framed on its own before being joined with ``|``, and
    the resulting alternation is framed again so it can be safely concatenated
    with other patterns.
    """
    framed_options = map(_re_frame, regexps)
    return _re_frame("|".join(framed_options))
172+
173+
174+
def _padded_field(
    re_content_char: str,
    size: int,
    leftpad_re: str = " ",
    rightpad_re: str = r"[ \x00]",
) -> str:
    r"""Build a regex matching a fixed-width, optionally padded field.

    The field is exactly ``size`` characters wide and consists of, in order:
    zero or more ``leftpad_re`` characters, one or more ``re_content_char``
    characters, and zero or more ``rightpad_re`` characters.  Every possible
    split of the width between the three parts is spelled out as its own
    alternative, so the result only uses exact repetition counts (``{n}``)
    rather than ranges (``{m,n}``).

    Args:
        re_content_char: Regex matching a single content character.
        size: Total width of the field in characters.
        leftpad_re: Regex matching a single left-padding character.
        rightpad_re: Regex matching a single right-padding character.  The
            default accepts a space or a NUL byte.

    Returns:
        A parenthesized alternation covering every layout of the field.
    """
    # NOTE: the right-padding default used to be spelled r"[ \0x00]", which is
    # parsed as the octal escape "\0" followed by the literals "x", "0", "0" --
    # so "x" and "0" were wrongly accepted as padding.  r"[ \x00]" is the
    # intended "space or NUL" class.
    field_regexes = []

    # padsize is the total amount of padding; it stops at size - 1 so that at
    # least one content character is always required.
    for padsize in range(size):
        content_re = f"{re_content_char}{{{size - padsize}}}"

        # Distribute the padding between the left and right side in every
        # possible way.
        for leftpadsize in range(padsize + 1):
            rightpadsize = padsize - leftpadsize

            # A "{0}" repetition is omitted entirely instead of being emitted.
            left_re = f"{leftpad_re}{{{leftpadsize}}}" if leftpadsize else ""
            right_re = f"{rightpad_re}{{{rightpadsize}}}" if rightpadsize else ""

            field_regexes.append(f"{left_re}{content_re}{right_re}")

    return _re_alternatives(field_regexes)
189+
190+
191+
class TarUnixHandler(_TarHandler):
    """Tar handler that matches on the numeric header fields following the name.

    The name field itself is not part of the pattern (it would be too big), so
    the match offset is moved back over it.
    """

    PATTERN_MATCH_OFFSET = -100  # go back to beginning of skipped name

    PATTERNS = [
        Regex(
            # (pattern would be too big) char name[100]
            "".join(
                _padded_field(r"[0-7]", width)
                for width in (
                    8,  # char mode[8]
                    8,  # char uid[8]
                    8,  # char gid[8]
                    12,  # char size[12]
                    12,  # char mtime[12]
                    8,  # char chksum[8]
                )
            )
            + r"[0-7\x00]"  # char typeflag[1] - no extensions
            # Extending/dropping typeflag pattern would cover all tar formats,
            # r"[0-7xgA-Z\x00]" would probably match all current major implementations.
            # Info on the values for typeflag:
            #  - https://en.wikipedia.org/wiki/Tar_(computing)
            #  - https://www.gnu.org/software/tar/manual/html_node/Standard.html
            #  - https://github.com/openbsd/src/blob/master/bin/pax/tar.h
            #  - https://codebrowser.dev/glibc/glibc/posix/tar.h.html
            #  - https://www.ibm.com/docs/el/aix/7.2?topic=files-tarh-file
            # Values 'A'-'Z' are reserved for custom implementations.
            # All other values are reserved for future POSIX.1 revisions.
            # Several places mention custom extensions and how they extract it,
            # e.g. the IBM link above is quite explicit.
            # Since its possible values are somewhat vague,
            # it might be better still to not include this field in the pattern at all.
        ),
    ]

0 commit comments

Comments
 (0)