Skip to content

Commit 8f85e29

Browse files
authored
Merge pull request #137 from ferstar/master
Restore reader position after retrieving signature bytes
2 parents b586bb5 + 03ebcda commit 8f85e29

File tree

6 files changed: 34 additions and 15 deletions.

filetype/types/document.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,10 @@ def match_document(self, buf):
6767
return
6868

6969
# Loop through next 3 files and check if they match
70+
# NOTE: OpenOffice/LibreOffice order ZIP entries differently, so also check the 4th file
71+
# https://github.com/h2non/filetype/blob/d730d98ad5c990883148485b6fd5adbdd378364a/matchers/document.go#L134
7072
idx = 0
71-
for i in range(3):
73+
for i in range(4):
7274
# Search for next file header
7375
idx = self.search_signature(buf, idx + 4, 6000)
7476
if idx == -1:
@@ -110,7 +112,7 @@ def __init__(self):
110112

111113
def match(self, buf):
112114
if len(buf) > 515 and buf[0:8] == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1":
113-
if buf[512:515] == b"\xEC\xA5\xC1\x00":
115+
if buf[512:516] == b"\xEC\xA5\xC1\x00":
114116
return True
115117
if (
116118
len(buf) > 2142

filetype/utils.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# -*- coding: utf-8 -*-
22

3+
from io import BufferedIOBase
34
# Python 2.7 workaround
45
try:
56
import pathlib
@@ -48,7 +49,8 @@ def get_bytes(obj):
4849
returning a sliced bytearray.
4950
5051
Args:
51-
obj: path to readable, file-like object(with read() method), bytes, bytearray or memoryview
52+
obj: path to readable, file-like object(with read() method), bytes,
53+
bytearray or memoryview
5254
5355
Returns:
5456
First 8192 bytes of the file content as bytearray type.
@@ -71,6 +73,13 @@ def get_bytes(obj):
7173
if isinstance(obj, pathlib.PurePath):
7274
return get_signature_bytes(obj)
7375

76+
if isinstance(obj, BufferedIOBase):
77+
start_pos = obj.tell()
78+
obj.seek(0)
79+
magic_bytes = obj.read(_NUM_SIGNATURE_BYTES)
80+
obj.seek(start_pos) # restore reader position
81+
return get_bytes(magic_bytes)
82+
7483
if hasattr(obj, 'read'):
7584
return get_bytes(obj.read(_NUM_SIGNATURE_BYTES))
7685

tests/fixtures/sample_1.doc

13.5 KB
Binary file not shown.

tests/fixtures/sample_1.docx

12.7 KB
Binary file not shown.

tests/test_types.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,14 @@
1414
class TestFileType(unittest.TestCase):
1515
def test_guess_jpeg(self):
1616
img_path = FIXTURES + '/sample.jpg'
17-
for obj in (img_path, open(img_path, 'rb')):
18-
kind = filetype.guess(obj)
17+
with open(img_path, 'rb') as fp:
18+
for obj in (img_path, fp):
19+
kind = filetype.guess(obj)
20+
self.assertTrue(kind is not None)
21+
self.assertEqual(kind.mime, 'image/jpeg')
22+
self.assertEqual(kind.extension, 'jpg')
23+
# reset reader position test
24+
kind = filetype.guess(fp)
1925
self.assertTrue(kind is not None)
2026
self.assertEqual(kind.mime, 'image/jpeg')
2127
self.assertEqual(kind.extension, 'jpg')
@@ -70,16 +76,18 @@ def test_guess_zstd(self):
7076
self.assertEqual(kind.extension, 'zst')
7177

7278
def test_guess_doc(self):
73-
kind = filetype.guess(FIXTURES + '/sample.doc')
74-
self.assertIsNotNone(kind)
75-
self.assertEqual(kind.mime, 'application/msword')
76-
self.assertEqual(kind.extension, 'doc')
79+
for name in 'sample.doc', 'sample_1.doc':
80+
kind = filetype.guess(os.path.join(FIXTURES, name))
81+
self.assertIsNotNone(kind)
82+
self.assertEqual(kind.mime, 'application/msword')
83+
self.assertEqual(kind.extension, 'doc')
7784

7885
def test_guess_docx(self):
79-
kind = filetype.guess(FIXTURES + '/sample.docx')
80-
self.assertTrue(kind is not None)
81-
self.assertEqual(kind.mime, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')
82-
self.assertEqual(kind.extension, 'docx')
86+
for name in 'sample.docx', 'sample_1.docx':
87+
kind = filetype.guess(os.path.join(FIXTURES, name))
88+
self.assertTrue(kind is not None)
89+
self.assertEqual(kind.mime, 'application/vnd.openxmlformats-officedocument.wordprocessingml.document')
90+
self.assertEqual(kind.extension, 'docx')
8391

8492
def test_guess_odt(self):
8593
kind = filetype.guess(FIXTURES + '/sample.odt')

tox.ini

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ envlist = py{27,35,36,37,38,39}, lint, doc, clean
88
skip_missing_interpreters = true
99

1010
[testenv:test]
11-
deps = pytest-benchmark
12-
commands = pytest
11+
deps = pytest
12+
commands = pytest --ignore=tests/test_benchmark.py
1313

1414
[testenv:lint]
1515
basepython = python3

0 commit comments

Comments
 (0)