Skip to content

Commit ed3e303

Browse files
authored
Merge pull request #655 from onekey-sec/654-tar-handler
fix(handlers): add support for unix-compatible (aka v7) tar files.
2 parents aee811b + 0166d74 commit ed3e303

File tree

9 files changed

+205
-14
lines changed

9 files changed

+205
-14
lines changed

tests/handlers/archive/test_tar.py

Lines changed: 77 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
from helpers import unhex
33

44
from unblob.file_utils import File
5-
from unblob.handlers.archive.tar import TarHandler, _get_tar_end_offset
5+
from unblob.handlers.archive.tar import (
6+
TarUnixHandler,
7+
TarUstarHandler,
8+
_get_tar_end_offset,
9+
)
610

711
GNU_TAR_CONTENTS = unhex(
812
"""\
@@ -120,6 +124,58 @@
120124
"""
121125
)
122126

127+
UNIX_TAR_CONTENT = unhex(
128+
"""\
129+
00000000 66 72 75 69 74 73 2f 00 00 00 00 00 00 00 00 00 |fruits/.........|
130+
00000010 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
131+
*
132+
00000060 00 00 00 00 30 30 30 30 37 37 35 00 30 30 30 31 |....0000775.0001|
133+
00000070 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
134+
00000080 30 30 30 30 30 30 30 00 31 34 35 30 34 32 36 32 |0000000.14504262|
135+
00000090 30 37 37 00 30 30 37 34 30 34 00 20 35 00 00 00 |077.007404. 5...|
136+
000000a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
137+
*
138+
00000140 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
139+
00000150 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
140+
00000160 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
141+
*
142+
00000200 66 72 75 69 74 73 2f 61 70 70 6c 65 2e 74 78 74 |fruits/apple.txt|
143+
00000210 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
144+
*
145+
00000260 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001|
146+
00000270 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
147+
00000280 30 30 30 30 30 30 36 00 31 34 35 30 34 32 36 32 |0000006.14504262|
148+
00000290 30 37 31 00 30 31 31 31 35 34 00 20 00 00 00 00 |071.011154. ....|
149+
000002a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
150+
*
151+
00000340 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
152+
00000350 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
153+
00000360 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
154+
*
155+
00000400 61 70 70 6c 65 0a 00 00 00 00 00 00 00 00 00 00 |apple...........|
156+
00000410 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
157+
*
158+
00000600 66 72 75 69 74 73 2f 63 68 65 72 72 79 2e 74 78 |fruits/cherry.tx|
159+
00000610 74 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |t...............|
160+
00000620 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
161+
*
162+
00000660 00 00 00 00 30 30 30 30 36 36 34 00 30 30 30 31 |....0000664.0001|
163+
00000670 37 35 30 00 30 30 30 31 37 35 30 00 30 30 30 30 |750.0001750.0000|
164+
00000680 30 30 30 30 30 30 37 00 31 34 35 30 34 32 36 32 |0000007.14504262|
165+
00000690 30 37 37 00 30 31 31 33 35 36 00 20 00 00 00 00 |077.011356. ....|
166+
000006a0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
167+
*
168+
00000740 00 00 00 00 00 00 00 00 00 30 30 30 30 30 30 30 |.........0000000|
169+
00000750 00 30 30 30 30 30 30 30 00 00 00 00 00 00 00 00 |.0000000........|
170+
00000760 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
171+
*
172+
00000800 63 68 65 72 72 79 0a 00 00 00 00 00 00 00 00 00 |cherry..........|
173+
00000810 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
174+
*
175+
00002800
176+
"""
177+
)
178+
123179
PADDING_TO_DEFAULT_BLOCKING_FACTOR = unhex(
124180
"""\
125181
00000400 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................|
@@ -303,12 +359,30 @@ def test_different_blocking_factor():
303359
pytest.param(b"some prefix ", id="nonzero-prefix"),
304360
],
305361
)
306-
def test_calculate_chunk(prefix):
362+
def test_calculate_chunk_ustar(prefix):
307363
tar_file = File.from_bytes(prefix + GNU_TAR_CONTENTS)
308-
handler = TarHandler()
364+
handler = TarUstarHandler()
309365

310366
chunk = handler.calculate_chunk(tar_file, len(prefix))
311367

312368
assert chunk is not None
313369
assert chunk.start_offset == len(prefix)
314370
assert chunk.end_offset == len(prefix) + len(GNU_TAR_CONTENTS)
371+
372+
373+
@pytest.mark.parametrize(
    "prefix",
    [
        pytest.param(b"", id="zero-prefix"),
        pytest.param(b"some prefix ", id="nonzero-prefix"),
    ],
)
def test_calculate_chunk_unix(prefix):
    """A unix (v7) tar is found right after the prefix and spans the whole fixture."""
    archive_start = len(prefix)
    expected_end = archive_start + len(UNIX_TAR_CONTENT)
    tar_file = File.from_bytes(prefix + UNIX_TAR_CONTENT)

    chunk = TarUnixHandler().calculate_chunk(tar_file, archive_start)

    assert chunk is not None
    assert chunk.start_offset == archive_start
    assert chunk.end_offset == expected_end
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:f799945e335bcd22cae7f2a53033781b8a181a266a12bd04d6ab3ed3f5ba1fd2
3+
size 10240
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:7592083f2355ad7e207557efabb3594bf62c9e39677298e8265766a37d835c39
3+
size 8
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:bb6f92894363eceff37d58287d2b8b37bb17a00b320312ed59924a8ec07004a6
3+
size 8
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:b4d09282b5ac47cc58aa5dc4fe3e8d6829ac737adfe49ee32efee9cf4bf6cdf3
3+
size 8
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:7843a44cca6d57497113ed505d027b9f5ffac78f1de7809f81ae9b314d943e79
3+
size 8

unblob/extractors/command.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,6 @@ def _make_extract_command(self, inpath: Path, outdir: Path):
9696
raise InvalidCommandTemplate("Invalid template placeholder", t) from k
9797
except ValueError as v:
9898
raise InvalidCommandTemplate("The template is malformed", t) from v
99-
10099
return args
101100

102101
def get_dependencies(self) -> List[str]:

unblob/handlers/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,8 @@
7070
arc.ARCHandler,
7171
arj.ARJHandler,
7272
cab.CABHandler,
73-
tar.TarHandler,
73+
tar.TarUstarHandler,
74+
tar.TarUnixHandler,
7475
cpio.PortableASCIIHandler,
7576
cpio.PortableASCIIWithCRCHandler,
7677
cpio.PortableOldASCIIHandler,

unblob/handlers/archive/tar.py

Lines changed: 111 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
ExtractResult,
1313
File,
1414
HexString,
15+
Regex,
1516
StructHandler,
1617
ValidChunk,
1718
)
@@ -98,17 +99,10 @@ def extract(self, inpath: Path, outdir: Path):
9899
return ExtractResult(reports=tarfile.reports)
99100

100101

101-
class TarHandler(StructHandler):
102+
class _TarHandler(StructHandler):
102103
NAME = "tar"
103104

104-
PATTERNS = [
105-
HexString("75 73 74 61 72 20 20 00"),
106-
HexString("75 73 74 61 72 00 30 30"),
107-
]
108-
109-
# Since the magic is at 257, we have to subtract that from the match offset
110-
# to get to the start of the file.
111-
PATTERN_MATCH_OFFSET = -MAGIC_OFFSET
105+
PATTERNS = []
112106

113107
C_DEFINITIONS = r"""
114108
typedef struct posix_header
@@ -142,7 +136,115 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
142136
header_size = snull(header.size)
143137
decode_int(header_size, 8)
144138

139+
def signed_sum(octets) -> int:
    """Sum ``octets`` with every byte interpreted as a signed 8-bit value.

    Used for the "signed" variant of the tar header checksum, where bytes
    0x80-0xFF count as negative numbers (``b - 256``) instead of ``b``.
    """
    # Two's complement: 0x80..0xFF map to -128..-1, everything else is unchanged.
    return sum(b if b < 128 else b - 256 for b in octets)
141+
142+
if header.chksum[6:8] not in (b"\x00 ", b" \x00"):
143+
logger.error(
144+
"Invalid checksum format",
145+
actual_last_2_bytes=header.chksum[6:8],
146+
handler=self.NAME,
147+
)
148+
return None
149+
checksum = decode_int(header.chksum[:6], 8)
150+
header_bytes_for_checksum = (
151+
file[start_offset : start_offset + 148]
152+
+ b" " * 8 # chksum field is replaced with "blanks"
153+
+ file[start_offset + 156 : start_offset + 257]
154+
)
155+
extended_header_bytes = file[start_offset + 257 : start_offset + 500]
156+
calculated_checksum_unsigned = sum(header_bytes_for_checksum)
157+
calculated_checksum_signed = signed_sum(header_bytes_for_checksum)
158+
checksums = (
159+
calculated_checksum_unsigned,
160+
calculated_checksum_unsigned + sum(extended_header_bytes),
161+
# signed is of historical interest, calculating for the extended header is not needed
162+
calculated_checksum_signed,
163+
)
164+
if checksum not in checksums:
165+
logger.error(
166+
"Tar header checksum mismatch", expected=str(checksum), actual=checksums
167+
)
168+
return None
169+
145170
end_offset = _get_tar_end_offset(file, start_offset)
146171
if end_offset == -1:
147172
return None
148173
return ValidChunk(start_offset=start_offset, end_offset=end_offset)
174+
175+
176+
class TarUstarHandler(_TarHandler):
    """Tar handler that is triggered by the ``ustar`` magic inside the header."""

    # The magic sits at offset 257 of the header, so the match position has to
    # be moved back by that amount to point at the start of the file.
    PATTERN_MATCH_OFFSET = -MAGIC_OFFSET

    PATTERNS = [
        HexString("75 73 74 61 72 20 20 00"),  # "ustar" + two spaces + NUL
        HexString("75 73 74 61 72 00 30 30"),  # "ustar" + NUL + "00"
    ]
185+
186+
187+
def _re_frame(regexp: str):
188+
"""Wrap regexp to ensure its integrity from concatenation.
189+
190+
E.g.: when the regex
191+
a|b
192+
is naively appended by regex c, the result
193+
a|bc
194+
will not match "ac", while
195+
(a|b)c
196+
will match "ac" as intended.
197+
"""
198+
return f"({regexp})"
199+
200+
201+
def _re_alternatives(regexps):
    """Return a single framed regex that matches any one of ``regexps``.

    Every alternative is framed on its own before being joined with ``|``,
    and the resulting alternation is framed once more so that it can be
    safely concatenated with other patterns.
    """
    framed_options = [_re_frame(option) for option in regexps]
    return _re_frame("|".join(framed_options))
203+
204+
205+
def _padded_field(re_content_char, size, leftpad_re=" ", rightpad_re=r"[ \x00]"):
    """Build a regex for a fixed-width field whose content may be padded.

    The field is always exactly ``size`` characters wide: at least one
    character matching ``re_content_char``, optionally preceded by
    ``leftpad_re`` characters and followed by ``rightpad_re`` characters.
    Every split of the padding between the left and the right side is
    spelled out as a separate alternative, which yields
    ``size * (size + 1) / 2`` alternatives in total.

    :param re_content_char: regex matching a single content character
    :param size: total width of the field in characters
    :param leftpad_re: regex matching a single left padding character
    :param rightpad_re: regex matching a single right padding character,
        by default a space or a NUL byte
    :return: framed alternation of all content/padding layouts
    """
    field_regexes = []

    # padsize is the total number of padding characters; it stops at
    # size - 1 so the content is never empty.
    for padsize in range(size):
        content_re = f"{re_content_char}{{{size - padsize}}}"

        # Distribute the padding between the left and the right side.
        for leftpadsize in range(padsize + 1):
            rightpadsize = padsize - leftpadsize

            left_re = f"{leftpad_re}{{{leftpadsize}}}" if leftpadsize else ""
            right_re = f"{rightpad_re}{{{rightpadsize}}}" if rightpadsize else ""

            field_regexes.append(f"{left_re}{content_re}{right_re}")

    return _re_alternatives(field_regexes)
220+
221+
222+
class TarUnixHandler(_TarHandler):
    """Tar handler for unix-compatible (aka v7) archives.

    The search pattern describes the numeric header fields that follow the
    100 byte name field; the name itself is not part of the pattern.
    """

    PATTERNS = [
        Regex(
            "".join(
                (
                    # char name[100] is skipped, its pattern would be too big
                    _padded_field(r"[0-7]", 8),  # char mode[8]
                    _padded_field(r"[0-7]", 8),  # char uid[8]
                    _padded_field(r"[0-7]", 8),  # char gid[8]
                    _padded_field(r"[0-7]", 12),  # char size[12]
                    _padded_field(r"[0-7]", 12),  # char mtime[12]
                    _padded_field(r"[0-7]", 8),  # char chksum[8]
                    r"[0-7\x00]",  # char typeflag[1] - no extensions
                )
            )
            # Widening or dropping the typeflag pattern would cover all tar
            # formats; r"[0-7xgA-Z\x00]" would probably match all current
            # major implementations.
            # Information on the typeflag values:
            # - https://en.wikipedia.org/wiki/Tar_(computing)
            # - https://www.gnu.org/software/tar/manual/html_node/Standard.html
            # - https://github.com/openbsd/src/blob/master/bin/pax/tar.h
            # - https://codebrowser.dev/glibc/glibc/posix/tar.h.html
            # - https://www.ibm.com/docs/el/aix/7.2?topic=files-tarh-file
            # Values 'A'-'Z' are reserved for custom implementations, all
            # other values are reserved for future POSIX.1 revisions.
            # Several places mention custom extensions and how they are
            # extracted, e.g. the IBM link above is quite explicit.
            # As the set of possible values is somewhat vague, it might be
            # better still to leave this field out of the pattern entirely.
        ),
    ]
    # The pattern starts after the skipped name field, so go back 100 bytes
    # to reach the beginning of the header.
    PATTERN_MATCH_OFFSET = -100

0 commit comments

Comments
 (0)