Skip to content

Commit 4985242

Browse files
authored
Merge pull request #350 from dantecalderon/feature/support-open-protected-files-by-password
feat: Added support for opening password-protected files
2 parents 3b5b3dc + 4fcca50 commit 4985242

File tree

3 files changed

+24
-3
lines changed

3 files changed

+24
-3
lines changed

py_pdf_parser/loaders.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def load_file(
4444
def load(
4545
pdf_file: IO,
4646
pdf_file_path: Optional[str] = None,
47+
password: Optional[str] = None,
4748
la_params: Optional[Dict] = None,
4849
**kwargs: Any,
4950
) -> PDFDocument:
@@ -52,13 +53,15 @@ def load(
5253
5354
Args:
5455
pdf_file (io): The PDF file.
56+
pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
57+
for `PDFDocument`.
58+
password (str, optional): Password for the encrypted PDF. Required if the
59+
PDF is encrypted.
5560
la_params (dict, optional): The layout parameters passed to PDFMiner for analysis. See
5661
the PDFMiner documentation here:
5762
https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams.
5863
Note that py_pdf_parser will re-order the elements it receives from PDFMiner
5964
so options relating to element ordering will have no effect.
60-
pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
61-
for `PDFDocument`.
6265
kwargs: Passed to `PDFDocument`. See the documentation for `PDFDocument`.
6366
6467
Returns:
@@ -69,7 +72,9 @@ def load(
6972
la_params = {**DEFAULT_LA_PARAMS, **la_params}
7073

7174
pages: Dict[int, Page] = {}
72-
for page in extract_pages(pdf_file, laparams=LAParams(**la_params)):
75+
for page in extract_pages(
76+
pdf_file, laparams=LAParams(**la_params), password=password
77+
):
7378
elements = [element for element in page if isinstance(element, LTTextBox)]
7479

7580
# If all_texts=True then we may get some text from inside figures

tests/data/pdfs/test_protected.pdf

35.7 KB
Binary file not shown.

tests/test_loaders.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import os
22
from unittest import TestCase
33

4+
from pdfminer.pdfdocument import PDFPasswordIncorrect
5+
46
from py_pdf_parser.components import PDFDocument
57
from py_pdf_parser.loaders import load, load_file
68

@@ -11,6 +13,20 @@ def test_load_file(self):
1113
document = load_file(file_path)
1214
self.assertIsInstance(document, PDFDocument)
1315

16+
def test_load_protected_file(self):
17+
file_path = os.path.join(
18+
os.path.dirname(__file__), "data", "pdfs", "test_protected.pdf"
19+
)
20+
document = load_file(file_path, password="p4ssword")
21+
self.assertIsInstance(document, PDFDocument)
22+
23+
def test_load_protected_file_wrong_password(self):
24+
file_path = os.path.join(
25+
os.path.dirname(__file__), "data", "pdfs", "test_protected.pdf"
26+
)
27+
with self.assertRaises(PDFPasswordIncorrect):
28+
load_file(file_path, password="wrong_password")
29+
1430
def test_load(self):
1531
file_path = os.path.join(os.path.dirname(__file__), "data", "pdfs", "test.pdf")
1632
with open(file_path, "rb") as in_file:

0 commit comments

Comments
 (0)