|
12 | 12 | ExtractResult, |
13 | 13 | File, |
14 | 14 | HexString, |
| 15 | + Regex, |
15 | 16 | StructHandler, |
16 | 17 | ValidChunk, |
17 | 18 | ) |
@@ -98,17 +99,10 @@ def extract(self, inpath: Path, outdir: Path): |
98 | 99 | return ExtractResult(reports=tarfile.reports) |
99 | 100 |
|
100 | 101 |
|
101 | | -class TarHandler(StructHandler): |
| 102 | +class _TarHandler(StructHandler): |
102 | 103 | NAME = "tar" |
103 | 104 |
|
104 | | - PATTERNS = [ |
105 | | - HexString("75 73 74 61 72 20 20 00"), |
106 | | - HexString("75 73 74 61 72 00 30 30"), |
107 | | - ] |
108 | | - |
109 | | - # Since the magic is at 257, we have to subtract that from the match offset |
110 | | - # to get to the start of the file. |
111 | | - PATTERN_MATCH_OFFSET = -MAGIC_OFFSET |
| 105 | + PATTERNS = [] |
112 | 106 |
|
113 | 107 | C_DEFINITIONS = r""" |
114 | 108 | typedef struct posix_header |
@@ -142,7 +136,115 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk] |
142 | 136 | header_size = snull(header.size) |
143 | 137 | decode_int(header_size, 8) |
144 | 138 |
|
def signed_sum(octets) -> int:
    """Return the sum of ``octets`` with each byte read as a signed 8-bit value.

    Bytes 0..127 count as themselves; bytes 128..255 count as ``b - 256``
    (i.e. -128..-1), which is how a signed ``char`` accumulator sees them.
    This is the historical "signed" variant of the header checksum.
    """
    # Two's-complement reinterpretation: 0xFF -> -1, 0x80 -> -128.
    return sum(b if b < 128 else b - 256 for b in octets)
| 141 | + |
| 142 | + if header.chksum[6:8] not in (b"\x00 ", b" \x00"): |
| 143 | + logger.error( |
| 144 | + "Invalid checksum format", |
| 145 | + actual_last_2_bytes=header.chksum[6:8], |
| 146 | + handler=self.NAME, |
| 147 | + ) |
| 148 | + return None |
| 149 | + checksum = decode_int(header.chksum[:6], 8) |
| 150 | + header_bytes_for_checksum = ( |
| 151 | + file[start_offset : start_offset + 148] |
| 152 | + + b" " * 8 # chksum field is replaced with "blanks" |
| 153 | + + file[start_offset + 156 : start_offset + 257] |
| 154 | + ) |
| 155 | + extended_header_bytes = file[start_offset + 257 : start_offset + 500] |
| 156 | + calculated_checksum_unsigned = sum(header_bytes_for_checksum) |
| 157 | + calculated_checksum_signed = signed_sum(header_bytes_for_checksum) |
| 158 | + checksums = ( |
| 159 | + calculated_checksum_unsigned, |
| 160 | + calculated_checksum_unsigned + sum(extended_header_bytes), |
| 161 | + # signed is of historical interest, calculating for the extended header is not needed |
| 162 | + calculated_checksum_signed, |
| 163 | + ) |
| 164 | + if checksum not in checksums: |
| 165 | + logger.error( |
| 166 | + "Tar header checksum mismatch", expected=str(checksum), actual=checksums |
| 167 | + ) |
| 168 | + return None |
| 169 | + |
145 | 170 | end_offset = _get_tar_end_offset(file, start_offset) |
146 | 171 | if end_offset == -1: |
147 | 172 | return None |
148 | 173 | return ValidChunk(start_offset=start_offset, end_offset=end_offset) |
| 174 | + |
| 175 | + |
class TarUstarHandler(_TarHandler):
    """Tar handler that is triggered by the ``ustar`` magic inside the header."""

    # Both signatures start with ASCII "ustar" and differ in the trailing bytes.
    PATTERNS = [
        HexString(signature)
        for signature in (
            "75 73 74 61 72 20 20 00",  # "ustar", two spaces, NUL
            "75 73 74 61 72 00 30 30",  # "ustar", NUL, "00"
        )
    ]

    # The patterns hit the magic rather than the start of the header, so the
    # match offset is moved back by MAGIC_OFFSET to land on the start of the file.
    PATTERN_MATCH_OFFSET = -MAGIC_OFFSET
| 185 | + |
| 186 | + |
| 187 | +def _re_frame(regexp: str): |
| 188 | + """Wrap regexp to ensure its integrity from concatenation. |
| 189 | +
|
| 190 | + E.g.: when the regex |
| 191 | + a|b |
| 192 | + is naively appended by regex c, the result |
| 193 | + a|bc |
| 194 | + will not match "ac", while |
| 195 | + (a|b)c |
| 196 | + will match "ac" as intended. |
| 197 | + """ |
| 198 | + return f"({regexp})" |
| 199 | + |
| 200 | + |
def _re_alternatives(regexps):
    """Return one framed regex matching any of ``regexps``.

    Every alternative is framed on its own before being joined with ``|``,
    and the resulting alternation is framed once more as a whole.
    """
    framed_choices = [_re_frame(choice) for choice in regexps]
    alternation = "|".join(framed_choices)
    return _re_frame(alternation)
| 203 | + |
| 204 | + |
def _padded_field(
    re_content_char: str,
    size: int,
    leftpad_re: str = " ",
    rightpad_re: str = r"[ \x00]",
) -> str:
    """Build a regex for a fixed-width field with optional left/right padding.

    The field is always exactly ``size`` characters wide and holds at least
    one content character.  Every split of the remaining width between
    left padding and right padding is enumerated as a separate alternative.

    Args:
        re_content_char: regex matching a single content character.
        size: total width of the field in characters.
        leftpad_re: regex matching a single left-padding character.
        rightpad_re: regex matching a single right-padding character; by
            default a space or a NUL byte.

    Returns:
        A framed alternation covering every content/padding layout.
    """
    # NOTE: the right-pad default used to be r"[ \0x00]", which is parsed as
    # the set {space, NUL, "x", "0"} ("\0" is an octal escape followed by the
    # literal characters "x00"), so "x" and "0" were accepted as padding.
    field_regexes = []

    for padsize in range(size):
        # Whatever is not padding must be content; content is never empty.
        content_re = f"{re_content_char}{{{size - padsize}}}"

        for leftpadsize in range(padsize + 1):
            rightpadsize = padsize - leftpadsize

            # Omit zero-width padding instead of emitting a "{0}" quantifier.
            left_re = f"{leftpad_re}{{{leftpadsize}}}" if leftpadsize else ""
            right_re = f"{rightpad_re}{{{rightpadsize}}}" if rightpadsize else ""

            field_regexes.append(f"{left_re}{content_re}{right_re}")

    return _re_alternatives(field_regexes)
| 220 | + |
| 221 | + |
class TarUnixHandler(_TarHandler):
    """Tar handler that recognises a header by the layout of its numeric fields.

    The pattern covers the padded octal fields that follow the 100-byte name
    and the typeflag byte after them; the name itself is not part of the pattern.
    """

    PATTERNS = [
        Regex(
            "".join(
                (
                    # char name[100] is left out (pattern would be too big)
                    _padded_field(r"[0-7]", 8),  # char mode[8]
                    _padded_field(r"[0-7]", 8),  # char uid[8]
                    _padded_field(r"[0-7]", 8),  # char gid[8]
                    _padded_field(r"[0-7]", 12),  # char size[12]
                    _padded_field(r"[0-7]", 12),  # char mtime[12]
                    _padded_field(r"[0-7]", 8),  # char chksum[8]
                    r"[0-7\x00]",  # char typeflag[1] - no extensions
                )
            )
            # Extending or dropping the typeflag pattern would cover all tar
            # formats; r"[0-7xgA-Z\x00]" would probably match all current major
            # implementations.
            # Info on the values for typeflag:
            # - https://en.wikipedia.org/wiki/Tar_(computing)
            # - https://www.gnu.org/software/tar/manual/html_node/Standard.html
            # - https://github.com/openbsd/src/blob/master/bin/pax/tar.h
            # - https://codebrowser.dev/glibc/glibc/posix/tar.h.html
            # - https://www.ibm.com/docs/el/aix/7.2?topic=files-tarh-file
            # Values 'A'-'Z' are reserved for custom implementations.
            # All other values are reserved for future POSIX.1 revisions.
            # Several places mention custom extensions and how they extract
            # them, e.g. the IBM link above is quite explicit.
            # Since the possible values are somewhat vague, it might be better
            # still to not include this field in the pattern at all.
        ),
    ]

    # The match starts right after the skipped name field, so step back over
    # its 100 bytes to reach the beginning of the header.
    PATTERN_MATCH_OFFSET = -100
0 commit comments