Skip to content

Commit bd816e3

Browse files
committed
save and load parsed pdf
1 parent 0f6e178 commit bd816e3

File tree

1 file changed

+31
-3
lines changed

1 file changed

+31
-3
lines changed

src/axiomatic/client.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import base64
22
import requests
3+
import os
34
from typing import Dict
45

56
from .base_client import BaseClient, AsyncBaseClient
@@ -9,27 +10,31 @@
910
class Axiomatic(BaseClient):
    """Client built on ``BaseClient`` that also exposes a ``DocumentHelper``."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``BaseClient``, defaulting ``timeout`` to 600.

        A ``timeout`` keyword supplied by the caller is passed through untouched.
        """
        # setdefault only fills the key when the caller did not provide one.
        kwargs.setdefault("timeout", 600)
        super().__init__(*args, **kwargs)

        self.document_helper = DocumentHelper(self)
1518

1619

1720
class DocumentHelper:
1821

22+
_ax_client: Axiomatic
23+
1924
def __init__(self, ax_client: Axiomatic):
    """Keep a reference to the client that the helper methods call into."""
    self._ax_client = ax_client
2126

2227
def pdf_from_url(self, url: str) -> MdResponse:
    """Download a PDF document from a URL and parse it into a Markdown response.

    Args:
        url: Address the PDF is fetched from with an HTTP GET.

    Returns:
        The ``content`` attribute of the client's parse response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    download = requests.get(url)
    # Fail fast on HTTP errors so an error page is never submitted to the
    # parser as if it were the PDF.
    download.raise_for_status()
    response = self._ax_client.document.parse(file=download.content)
    return response.content
2732

2833
def pdf_from_file(self, path: str) -> MdResponse:
    """Read a PDF from ``path`` as bytes and parse it into a Markdown response.

    Returns the ``content`` attribute of the client's parse response.
    """
    with open(path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    parsed = self._ax_client.document.parse(file=pdf_bytes)
    return parsed.content
3439

3540
def plot_b64_images(self, images: Dict[str, str]):
@@ -60,5 +65,28 @@ def navigate_image(change):
6065
display(layout)
6166
display_base64_image(current_index[0])
6267

68+
def save_parsed_pdf(self, response: MdResponse, path: str):
69+
"""Save a parsed PDF response to a file."""
70+
os.makedirs(path, exist_ok=True)
71+
for img_name, img in response.images.items():
72+
with open(os.path.join(path, f"{img_name}.png"), "wb") as f:
73+
f.write(base64.b64decode(img))
74+
75+
with open(os.path.join(path, "text.md"), "w") as f:
76+
f.write(response.markdown)
77+
78+
def load_parsed_pdf(self, path: str) -> MdResponse:
79+
"""Load a parsed PDF response from a file."""
80+
with open(os.path.join(path, "text.md"), "r") as f:
81+
markdown = f.read()
82+
83+
images = {}
84+
for img_name in os.listdir(path):
85+
if img_name.endswith((".png")):
86+
with open(os.path.join(path, img_name), "rb") as img_file:
87+
images[img_name] = base64.b64encode(img_file.read()).decode("utf-8")
88+
89+
return MdResponse(markdown=markdown, images=images)
90+
6391

6492
class AsyncAxiomatic(AsyncBaseClient):
    """Subclass of ``AsyncBaseClient`` that currently adds no behavior of its own."""

0 commit comments

Comments
 (0)