|
1 | 1 | import base64 |
2 | 2 | import requests |
| 3 | +import os |
3 | 4 | from typing import Dict |
4 | 5 |
|
5 | 6 | from .base_client import BaseClient, AsyncBaseClient |
|
class Axiomatic(BaseClient):
    """Client built on ``BaseClient`` that adds a default timeout and document helpers."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``BaseClient``, defaulting ``timeout`` to 600.

        A ``timeout`` keyword supplied by the caller is passed through unchanged.
        """
        # Only fills the key in when the caller did not provide one.
        kwargs.setdefault("timeout", 600)
        super().__init__(*args, **kwargs)

        # Convenience wrappers around this client, reachable as ``client.document_helper``.
        self.document_helper = DocumentHelper(self)
15 | 18 |
|
16 | 19 |
|
17 | 20 | class DocumentHelper: |
18 | 21 |
|
| 22 | + _ax_client: Axiomatic |
| 23 | + |
def __init__(self, ax_client: Axiomatic):
    """Keep a reference to the client that the helper methods delegate to.

    Args:
        ax_client: The ``Axiomatic`` client used for the document parsing calls.
    """
    self._ax_client = ax_client
21 | 26 |
|
def pdf_from_url(self, url: str) -> MdResponse:
    """Download a PDF document from a URL and parse it into a Markdown response.

    Args:
        url: Address the PDF is fetched from with an HTTP GET.

    Returns:
        The ``content`` attribute of the result of ``document.parse``.

    Raises:
        requests.HTTPError: If the download answers with a 4xx/5xx status.
    """
    download = requests.get(url)
    # Fail here instead of handing an HTTP error page to the parser as if
    # it were the PDF.
    download.raise_for_status()
    response = self._ax_client.document.parse(file=download.content)
    return response.content
27 | 32 |
|
def pdf_from_file(self, path: str) -> MdResponse:
    """Read the PDF stored at ``path`` and parse it into a Markdown response.

    Args:
        path: Location of the PDF file; it is read in binary mode.

    Returns:
        The ``content`` attribute of the result of ``document.parse``.
    """
    with open(path, "rb") as pdf_handle:
        pdf_bytes = pdf_handle.read()
    parsed = self._ax_client.document.parse(file=pdf_bytes)
    return parsed.content
34 | 39 |
|
35 | 40 | def plot_b64_images(self, images: Dict[str, str]): |
@@ -60,5 +65,28 @@ def navigate_image(change): |
60 | 65 | display(layout) |
61 | 66 | display_base64_image(current_index[0]) |
62 | 67 |
|
def save_parsed_pdf(self, response: MdResponse, path: str):
    """Save a parsed PDF response to a directory.

    Every entry of ``response.images`` is base64-decoded and written to
    ``<path>/<image name>.png``; ``response.markdown`` is written to
    ``<path>/text.md`` as UTF-8.

    Args:
        response: Parsed document providing ``images`` (a mapping of image
            name to base64 string) and ``markdown``.
        path: Target directory; created, including parents, when missing.
    """
    os.makedirs(path, exist_ok=True)
    # NOTE(review): ``images`` is assumed to be possibly None for documents
    # without pictures; an empty mapping then writes no image files.
    for img_name, img in (response.images or {}).items():
        with open(os.path.join(path, f"{img_name}.png"), "wb") as f:
            f.write(base64.b64decode(img))

    # Pin the encoding: the platform default (e.g. cp1252 on Windows) cannot
    # encode every character Markdown text may contain.
    with open(os.path.join(path, "text.md"), "w", encoding="utf-8") as f:
        f.write(response.markdown)
| 77 | + |
def load_parsed_pdf(self, path: str) -> MdResponse:
    """Load a parsed PDF response from a directory.

    Reads ``<path>/text.md`` as the Markdown text and every ``*.png`` file in
    ``path`` as an image, re-encoded to a base64 string.

    Args:
        path: Directory holding ``text.md`` and the ``.png`` image files.

    Returns:
        An ``MdResponse`` built from the Markdown text and a mapping of image
        name (file name without the ``.png`` suffix) to base64 string.
    """
    with open(os.path.join(path, "text.md"), "r", encoding="utf-8") as f:
        markdown = f.read()

    suffix = ".png"
    images = {}
    for file_name in os.listdir(path):
        if not file_name.endswith(suffix):
            continue
        with open(os.path.join(path, file_name), "rb") as img_file:
            encoded = base64.b64encode(img_file.read()).decode("utf-8")
        # Drop the suffix so the key is the image name rather than the file
        # name; otherwise writing the result back out as "<key>.png" would
        # produce "name.png.png".
        images[file_name[: -len(suffix)]] = encoded

    return MdResponse(markdown=markdown, images=images)
| 90 | + |
63 | 91 |
|
class AsyncAxiomatic(AsyncBaseClient):
    """Placeholder subclass of ``AsyncBaseClient``; it adds no behavior of its own."""
0 commit comments