Merge pull request #655 from onekey-sec/654-tar-handler

qkaiser · web-flow · commit ed3e30309074 · 2023-10-23T11:53:31.000+02:00
fix(handlers): add support for unix-compatible (aka v7) tar files.
diff --git a/tests/handlers/archive/test_tar.py b/tests/handlers/archive/test_tar.py
@@ -2,7 +2,11 @@
 from helpers import unhex
 
 from unblob.file_utils import File
-from unblob.handlers.archive.tar import TarHandler, _get_tar_end_offset
+from unblob.handlers.archive.tar import (
+    TarUnixHandler,
+    TarUstarHandler,
+    _get_tar_end_offset,
+)
 
 GNU_TAR_CONTENTS = unhex(
     """\
@@ -120,6 +124,58 @@
 """
 )
 
+UNIX_TAR_CONTENT = unhex(
+    """\
+00000000  66 72 75 69 74 73 2f 00  00 00 00 00 00 00 00 00  |fruits/.........|
+00000010  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000060  00 00 00 00 30 30 30 30  37 37 35 00 30 30 30 31  |....0000775.0001|
+00000070  37 35 30 00 30 30 30 31  37 35 30 00 30 30 30 30  |750.0001750.0000|
+00000080  30 30 30 30 30 30 30 00  31 34 35 30 34 32 36 32  |0000000.14504262|
+00000090  30 37 37 00 30 30 37 34  30 34 00 20 35 00 00 00  |077.007404. 5...|
+000000a0  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000140  00 00 00 00 00 00 00 00  00 30 30 30 30 30 30 30  |.........0000000|
+00000150  00 30 30 30 30 30 30 30  00 00 00 00 00 00 00 00  |.0000000........|
+00000160  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000200  66 72 75 69 74 73 2f 61  70 70 6c 65 2e 74 78 74  |fruits/apple.txt|
+00000210  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000260  00 00 00 00 30 30 30 30  36 36 34 00 30 30 30 31  |....0000664.0001|
+00000270  37 35 30 00 30 30 30 31  37 35 30 00 30 30 30 30  |750.0001750.0000|
+00000280  30 30 30 30 30 30 36 00  31 34 35 30 34 32 36 32  |0000006.14504262|
+00000290  30 37 31 00 30 31 31 31  35 34 00 20 00 00 00 00  |071.011154. ....|
+000002a0  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000340  00 00 00 00 00 00 00 00  00 30 30 30 30 30 30 30  |.........0000000|
+00000350  00 30 30 30 30 30 30 30  00 00 00 00 00 00 00 00  |.0000000........|
+00000360  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000400  61 70 70 6c 65 0a 00 00  00 00 00 00 00 00 00 00  |apple...........|
+00000410  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000600  66 72 75 69 74 73 2f 63  68 65 72 72 79 2e 74 78  |fruits/cherry.tx|
+00000610  74 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |t...............|
+00000620  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000660  00 00 00 00 30 30 30 30  36 36 34 00 30 30 30 31  |....0000664.0001|
+00000670  37 35 30 00 30 30 30 31  37 35 30 00 30 30 30 30  |750.0001750.0000|
+00000680  30 30 30 30 30 30 37 00  31 34 35 30 34 32 36 32  |0000007.14504262|
+00000690  30 37 37 00 30 31 31 33  35 36 00 20 00 00 00 00  |077.011356. ....|
+000006a0  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000740  00 00 00 00 00 00 00 00  00 30 30 30 30 30 30 30  |.........0000000|
+00000750  00 30 30 30 30 30 30 30  00 00 00 00 00 00 00 00  |.0000000........|
+00000760  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00000800  63 68 65 72 72 79 0a 00  00 00 00 00 00 00 00 00  |cherry..........|
+00000810  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
+*
+00002800
+"""
+)
+
 PADDING_TO_DEFAULT_BLOCKING_FACTOR = unhex(
     """\
 00000400  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|
@@ -303,12 +359,30 @@ def test_different_blocking_factor():
         pytest.param(b"some prefix ", id="nonzero-prefix"),
     ],
 )
-def test_calculate_chunk(prefix):
+def test_calculate_chunk_ustar(prefix):
     tar_file = File.from_bytes(prefix + GNU_TAR_CONTENTS)
-    handler = TarHandler()
+    handler = TarUstarHandler()
 
     chunk = handler.calculate_chunk(tar_file, len(prefix))
 
     assert chunk is not None
     assert chunk.start_offset == len(prefix)
     assert chunk.end_offset == len(prefix) + len(GNU_TAR_CONTENTS)
+
+
+@pytest.mark.parametrize(
+    "prefix",
+    [
+        pytest.param(b"", id="zero-prefix"),
+        pytest.param(b"some prefix ", id="nonzero-prefix"),
+    ],
+)
+def test_calculate_chunk_unix(prefix):
+    """The v7 archive placed after `prefix` is reported as one chunk covering all its bytes."""
+    archive_start = len(prefix)
+    blob = File.from_bytes(prefix + UNIX_TAR_CONTENT)
+
+    carved = TarUnixHandler().calculate_chunk(blob, archive_start)
+
+    assert carved is not None
+    assert carved.start_offset == archive_start
+    assert carved.end_offset == archive_start + len(UNIX_TAR_CONTENT)
diff --git a/tests/integration/archive/tar/__input__/cherry.v7.tar b/tests/integration/archive/tar/__input__/cherry.v7.tar
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f799945e335bcd22cae7f2a53033781b8a181a266a12bd04d6ab3ed3f5ba1fd2
+size 10240
diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry1.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry1.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7592083f2355ad7e207557efabb3594bf62c9e39677298e8265766a37d835c39
+size 8
diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry2.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry2.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bb6f92894363eceff37d58287d2b8b37bb17a00b320312ed59924a8ec07004a6
+size 8
diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry3.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry3.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4d09282b5ac47cc58aa5dc4fe3e8d6829ac737adfe49ee32efee9cf4bf6cdf3
+size 8
diff --git a/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry4.txt b/tests/integration/archive/tar/__output__/cherry.v7.tar_extract/fruits/cherry4.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7843a44cca6d57497113ed505d027b9f5ffac78f1de7809f81ae9b314d943e79
+size 8
diff --git a/unblob/extractors/command.py b/unblob/extractors/command.py
@@ -96,7 +96,6 @@ def _make_extract_command(self, inpath: Path, outdir: Path):
                 raise InvalidCommandTemplate("Invalid template placeholder", t) from k
             except ValueError as v:
                 raise InvalidCommandTemplate("The template is malformed", t) from v
-
         return args
 
     def get_dependencies(self) -> List[str]:
diff --git a/unblob/handlers/__init__.py b/unblob/handlers/__init__.py
@@ -70,7 +70,8 @@
     arc.ARCHandler,
     arj.ARJHandler,
     cab.CABHandler,
-    tar.TarHandler,
+    tar.TarUstarHandler,
+    tar.TarUnixHandler,
     cpio.PortableASCIIHandler,
     cpio.PortableASCIIWithCRCHandler,
     cpio.PortableOldASCIIHandler,
diff --git a/unblob/handlers/archive/tar.py b/unblob/handlers/archive/tar.py
@@ -12,6 +12,7 @@
     ExtractResult,
     File,
     HexString,
+    Regex,
     StructHandler,
     ValidChunk,
 )
@@ -98,17 +99,10 @@ def extract(self, inpath: Path, outdir: Path):
         return ExtractResult(reports=tarfile.reports)
 
 
-class TarHandler(StructHandler):
+class _TarHandler(StructHandler):
     NAME = "tar"
 
-    PATTERNS = [
-        HexString("75 73 74 61 72 20 20 00"),
-        HexString("75 73 74 61 72 00 30 30"),
-    ]
-
-    # Since the magic is at 257, we have to subtract that from the match offset
-    # to get to the start of the file.
-    PATTERN_MATCH_OFFSET = -MAGIC_OFFSET
+    PATTERNS = []
 
     C_DEFINITIONS = r"""
         typedef struct posix_header
@@ -142,7 +136,115 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
         header_size = snull(header.size)
         decode_int(header_size, 8)
 
+        def signed_sum(octets) -> int:
+            # Sum the bytes as *signed* chars: values >= 128 are negative (b - 256).
+            return sum(b if b < 128 else b - 256 for b in octets)
+
+        if header.chksum[6:8] not in (b"\x00 ", b" \x00"):
+            logger.error(
+                "Invalid checksum format",
+                actual_last_2_bytes=header.chksum[6:8],
+                handler=self.NAME,
+            )
+            return None
+        checksum = decode_int(header.chksum[:6], 8)
+        header_bytes_for_checksum = (
+            file[start_offset : start_offset + 148]
+            + b" " * 8  # chksum field is replaced with "blanks"
+            + file[start_offset + 156 : start_offset + 257]
+        )
+        extended_header_bytes = file[start_offset + 257 : start_offset + 500]
+        calculated_checksum_unsigned = sum(header_bytes_for_checksum)
+        calculated_checksum_signed = signed_sum(header_bytes_for_checksum)
+        checksums = (
+            calculated_checksum_unsigned,
+            calculated_checksum_unsigned + sum(extended_header_bytes),
+            # signed is of historical interest, calculating for the extended header is not needed
+            calculated_checksum_signed,
+        )
+        if checksum not in checksums:
+            logger.error(
+                "Tar header checksum mismatch", expected=str(checksum), actual=checksums
+            )
+            return None
+
         end_offset = _get_tar_end_offset(file, start_offset)
         if end_offset == -1:
             return None
         return ValidChunk(start_offset=start_offset, end_offset=end_offset)
+
+
+class TarUstarHandler(_TarHandler):
+    """Tar handler located through the "ustar" magic byte patterns below."""
+
+    PATTERNS = [
+        HexString("75 73 74 61 72 20 20 00"),  # ASCII "ustar", two spaces, NUL
+        HexString("75 73 74 61 72 00 30 30"),  # ASCII "ustar", NUL, "00"
+    ]
+
+    # Since the magic is at 257, we have to subtract that from the match offset
+    # to get to the start of the file.
+    PATTERN_MATCH_OFFSET = -MAGIC_OFFSET
+
+
+def _re_frame(regexp: str):
+    """Parenthesize regexp so it stays one unit when concatenated with others.
+
+    Without the group, appending regex c to the alternation
+      a|b
+    yields
+      a|bc
+    which fails to match "ac"; the framed form
+      (a|b)c
+    matches "ac" as intended.
+    """
+    return "({})".format(regexp)
+
+
+def _re_alternatives(regexps):
+    """Return one framed regex matching any of regexps, each framed individually."""
+    return _re_frame("|".join(map(_re_frame, regexps)))
+
+
+def _padded_field(re_content_char, size, leftpad_re=" ", rightpad_re=r"[ \x00]"):
+    """Regex for a size-wide field: left pad, then 1..size content chars, then right pad."""
+    # The right pad must be written \x00: inside a character class "\0x00" means
+    # NUL (octal \0) plus the literal characters 'x' and '0', which are not padding.
+    field_regexes = []
+    for padsize in range(size):
+        content_re = f"{re_content_char}{{{size-padsize}}}"
+        # Try every split of the padsize padding characters between left and right.
+        for leftpadsize in range(padsize + 1):
+            rightpadsize = padsize - leftpadsize
+            # A zero-length pad is left out instead of being emitted as "x{0}".
+            left_re = f"{leftpad_re}{{{leftpadsize}}}" if leftpadsize else ""
+            right_re = f"{rightpad_re}{{{rightpadsize}}}" if rightpadsize else ""
+            field_regexes.append(f"{left_re}{content_re}{right_re}")
+    return _re_alternatives(field_regexes)
+
+
+class TarUnixHandler(_TarHandler):
+    """Tar handler matched on the shape of the numeric header fields.
+
+    The pattern covers mode, uid, gid, size, mtime, chksum and typeflag. The
+    leading 100-byte name field is left out of the pattern, so the match
+    offset is moved back by 100 to reach the start of the header.
+    """
+
+    PATTERNS = [
+        Regex(
+            r""
+            #  (pattern would be too big)   char name[100]
+            + _padded_field(r"[0-7]", 8)  # char mode[8]
+            + _padded_field(r"[0-7]", 8)  # char uid[8]
+            + _padded_field(r"[0-7]", 8)  # char gid[8]
+            + _padded_field(r"[0-7]", 12)  # char size[12]
+            + _padded_field(r"[0-7]", 12)  # char mtime[12]
+            + _padded_field(r"[0-7]", 8)  # char chksum[8]
+            + r"[0-7\x00]"  # char typeflag[1] - no extensions
+            # Extending/dropping typeflag pattern would cover all tar formats,
+            # r"[0-7xgA-Z\x00]" would probably match all current major implementations.
+            # Info on the values for typeflag:
+            #  - https://en.wikipedia.org/wiki/Tar_(computing)
+            #  - https://www.gnu.org/software/tar/manual/html_node/Standard.html
+            #  - https://github.com/openbsd/src/blob/master/bin/pax/tar.h
+            #  - https://codebrowser.dev/glibc/glibc/posix/tar.h.html
+            #  - https://www.ibm.com/docs/el/aix/7.2?topic=files-tarh-file
+            # Values 'A'-'Z' are reserved for custom implementations.
+            # All other values are reserved for future POSIX.1 revisions.
+            # Several places mention custom extensions and how they extract it,
+            # e.g. the IBM link above is quite explicit.
+            # Since its possible values are somewhat vague,
+            # it might be better still to not include this field in the pattern at all.
+        ),
+    ]
+    PATTERN_MATCH_OFFSET = -100  # go back to beginning of skipped name

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:f799945e335bcd22cae7f2a53033781b8a181a266a12bd04d6ab3ed3f5ba1fd2`
	`3`	`+size 10240`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:7592083f2355ad7e207557efabb3594bf62c9e39677298e8265766a37d835c39`
	`3`	`+size 8`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:bb6f92894363eceff37d58287d2b8b37bb17a00b320312ed59924a8ec07004a6`
	`3`	`+size 8`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:b4d09282b5ac47cc58aa5dc4fe3e8d6829ac737adfe49ee32efee9cf4bf6cdf3`
	`3`	`+size 8`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+version https://git-lfs.github.com/spec/v1`
	`2`	`+oid sha256:7843a44cca6d57497113ed505d027b9f5ffac78f1de7809f81ae9b314d943e79`
	`3`	`+size 8`