Merge pull request #350 from dantecalderon/feature/support-open-protected-files-by-password

jstockwin · web-flow · commit 4985242848c4 · 2023-11-10T17:13:07.000Z
feat: Added support for opening password-protected files
diff --git a/py_pdf_parser/loaders.py b/py_pdf_parser/loaders.py
@@ -44,6 +44,7 @@ def load_file(
 def load(
     pdf_file: IO,
     pdf_file_path: Optional[str] = None,
+    password: Optional[str] = None,
     la_params: Optional[Dict] = None,
     **kwargs: Any,
 ) -> PDFDocument:
@@ -52,13 +53,15 @@ def load(
 
     Args:
         pdf_file (io): The PDF file.
+        pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
+            for `PDFDocument`.
+        password (str, optional): Password used to decrypt the PDF. Only needed
+            when the PDF is password-protected.
         la_params (dict): The layout parameters passed to PDF Miner for analysis. See
             the PDFMiner documentation here:
             https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams.
             Note that py_pdf_parser will re-order the elements it receives from PDFMiner
             so options relating to element ordering will have no effect.
-        pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
-            for `PDFDocument`.
         kwargs: Passed to `PDFDocument`. See the documentation for `PDFDocument`.
 
     Returns:
@@ -69,7 +72,9 @@ def load(
     la_params = {**DEFAULT_LA_PARAMS, **la_params}
 
     pages: Dict[int, Page] = {}
-    for page in extract_pages(pdf_file, laparams=LAParams(**la_params)):
+    for page in extract_pages(
+        pdf_file, laparams=LAParams(**la_params), password=password
+    ):
         elements = [element for element in page if isinstance(element, LTTextBox)]
 
         # If all_texts=True then we may get some text from inside figures
diff --git a/tests/data/pdfs/test_protected.pdf b/tests/data/pdfs/test_protected.pdf
diff --git a/tests/test_loaders.py b/tests/test_loaders.py
@@ -1,6 +1,8 @@
 import os
 from unittest import TestCase
 
+from pdfminer.pdfdocument import PDFPasswordIncorrect
+
 from py_pdf_parser.components import PDFDocument
 from py_pdf_parser.loaders import load, load_file
 
@@ -11,6 +13,20 @@ def test_load_file(self):
         document = load_file(file_path)
         self.assertIsInstance(document, PDFDocument)
 
+    def test_load_protected_file(self):
+        file_path = os.path.join(
+            os.path.dirname(__file__), "data", "pdfs", "test_protected.pdf"
+        )
+        document = load_file(file_path, password="p4ssword")
+        self.assertIsInstance(document, PDFDocument)
+
+    def test_load_protected_file_wrong_password(self):
+        file_path = os.path.join(
+            os.path.dirname(__file__), "data", "pdfs", "test_protected.pdf"
+        )
+        with self.assertRaises(PDFPasswordIncorrect):
+            load_file(file_path, password="wrong_password")
+
     def test_load(self):
         file_path = os.path.join(os.path.dirname(__file__), "data", "pdfs", "test.pdf")
         with open(file_path, "rb") as in_file: