Skip to content

Commit 321ef6f

Browse files
Properly handle 7z archives
Co-authored-by: catileptic <alex.stefanescu@pm.me>
1 parent fa740d2 commit 321ef6f

File tree

8 files changed

+77
-3
lines changed

8 files changed

+77
-3
lines changed

ingestors/packages/__init__.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import tarfile
55
from pathlib import PurePath
66

7+
import py7zr
8+
from py7zr.exceptions import ArchiveError
9+
710
from ingestors.ingestor import Ingestor
811
from ingestors.support.package import PackageSupport
912
from ingestors.support.shell import ShellSupport
@@ -24,9 +27,11 @@ def unpack(self, file_path, entity, temp_dir):
2427
*pure_file_path.parts[1:-1], reconstructed_filename
2528
)
2629

27-
self.exec_command(
28-
"7z", "x", str(pure_file_path), "-y", "-r", "-bb0", "-bd", f"-oc:{temp_dir}"
29-
)
30+
try:
31+
with py7zr.SevenZipFile(str(pure_file_path), mode="r") as z:
32+
z.extractall(path=temp_dir)
33+
except ArchiveError as e:
34+
raise ProcessingException(f"Error: {e}")
3035

3136

3237
class SingleFilePackageIngestor(PackageSupport, Ingestor):

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ numpy<2.0.0 # pinned because otherwise spacy requires an incompatible numpy
1414
fingerprints==1.1.1
1515
fasttext==0.9.2
1616
pika==1.3.2
17+
py7zr==1.0.0
1718

1819
# Development
1920
pytest==8.2.0

tests/fixtures/bad7zip.7z

170 Bytes
Binary file not shown.

tests/fixtures/badrar.rar

115 Bytes
Binary file not shown.

tests/fixtures/badtar.tar

10 KB
Binary file not shown.

tests/fixtures/badzip.zip

206 Bytes
Binary file not shown.

tests/fixtures/secret.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is a secret!

tests/test_packages.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# -*- coding: utf-8 -*-
22
from pprint import pprint # noqa
3+
from pathlib import Path
34

45
from .support import TestCase
56

@@ -11,14 +12,80 @@ def test_zip(self):
1112
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
1213
self.assertEqual(entity.schema.name, "Package")
1314

15+
def test_zip_symlink_escape(self):
16+
fixture_path, entity = self.fixture("badzip.zip")
17+
18+
# Ensure that the symlink target exists
19+
target = Path("/ingestors/tests/fixtures/secret.txt")
20+
assert target.read_text() == "This is a secret!"
21+
22+
self.manager.ingest(fixture_path, entity)
23+
24+
# Python’s zipfile handles symlinks that point to files outside of the archive root
25+
# by treating them as normal files
26+
assert len(self.manager.entities) == 2
27+
assert self.manager.entities[0].first("fileName") == "secret.txt"
28+
assert (
29+
self.manager.entities[0].first("bodyText")
30+
== "/ingestors/tests/fixtures/secret.txt"
31+
)
32+
assert self.manager.entities[1].first("fileName") == "badzip.zip"
33+
1434
def test_rar(self):
1535
fixture_path, entity = self.fixture("test-documents.rar")
1636
self.manager.ingest(fixture_path, entity)
1737
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
1838
self.assertEqual(entity.schema.name, "Package")
1939

40+
def test_rar_symlink_escape(self):
41+
fixture_path, entity = self.fixture("badrar.rar")
42+
43+
# Ensure that the symlink target exists
44+
target = Path("/ingestors/tests/fixtures/secret.txt")
45+
assert target.read_text() == "This is a secret!"
46+
47+
self.manager.ingest(fixture_path, entity)
48+
49+
# rarfile handles symlinks that point to files outside of the archive root
50+
# by treating them as normal files
51+
assert len(self.manager.entities) == 2
52+
assert self.manager.entities[0].first("fileName") == "secret.txt"
53+
assert (
54+
self.manager.entities[0].first("bodyText")
55+
== "/ingestors/tests/fixtures/secret.txt"
56+
)
57+
assert self.manager.entities[1].first("fileName") == "badrar.rar"
58+
2059
def test_tar(self):
2160
fixture_path, entity = self.fixture("test-documents.tar")
2261
self.manager.ingest(fixture_path, entity)
2362
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
2463
self.assertEqual(entity.schema.name, "Package")
64+
65+
def test_tar_symlink_escape(self):
66+
fixture_path, entity = self.fixture("badtar.tar")
67+
68+
# Ensure that the symlink target exists
69+
target = Path("/ingestors/tests/fixtures/secret.txt")
70+
assert target.read_text() == "This is a secret!"
71+
72+
self.manager.ingest(fixture_path, entity)
73+
74+
# Python’s tarfile ignores symlinks that point to files outside of the archive root
75+
assert len(self.manager.entities) == 1
76+
assert self.manager.entities[0].first("fileName") == "badtar.tar"
77+
78+
def test_7zip_symlink_escape(self):
79+
fixture_path, entity = self.fixture("bad7zip.7z")
80+
81+
# Ensure that the symlink target exists
82+
target = Path("/ingestors/tests/fixtures/secret.txt")
83+
assert target.read_text() == "This is a secret!"
84+
85+
self.manager.ingest(fixture_path, entity)
86+
87+
# py7zr raises an exception if it encounters a symlink that points to a file
88+
# outside of the archive root
89+
assert len(self.manager.entities) == 1
90+
assert self.manager.entities[0].first("fileName") == "bad7zip.7z"
91+
assert self.manager.entities[0].first("processingStatus") == "failure"

0 commit comments

Comments
 (0)