|
137 | 137 | }, |
138 | 138 | "outputs": [], |
139 | 139 | "source": [ |
140 | | - "from langchain_community.chat_models import BedrockChat\n", |
| 140 | + "from langchain_aws import ChatBedrock\n", |
141 | 141 | "from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler" |
142 | 142 | ] |
143 | 143 | }, |
|
150 | 150 | }, |
151 | 151 | "outputs": [], |
152 | 152 | "source": [ |
153 | | - "llm_text = BedrockChat(\n", |
| 153 | + "llm_text = ChatBedrock(\n", |
154 | 154 | " model_id=bedrock_info.get_model_id(model_name=\"Claude-V3-Sonnet\"),\n", |
155 | 155 | " client=boto3_bedrock,\n", |
156 | 156 | " streaming=True,\n", |
|
504 | 504 | }, |
505 | 505 | "outputs": [], |
506 | 506 | "source": [ |
507 | | - "table_by_llama_parse = True" |
| 507 | + "table_by_llama_parse = False" |
508 | 508 | ] |
509 | 509 | }, |
510 | 510 | { |
|
517 | 517 | "outputs": [], |
518 | 518 | "source": [ |
519 | 519 | "def api_key():\n", |
520 | | - " os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"llx-R4G3Pzu5IZIdq5AoAFILW1PPaVZxrVRN937R6f3cItBvPs1U\"\n", |
| 520 | + " os.environ[\"LLAMA_CLOUD_API_KEY\"] = \"<your key>\"\n", |
521 | 521 | " nest_asyncio.apply()\n", |
522 | 522 | " load_dotenv()" |
523 | 523 | ] |
|
617 | 617 | }, |
618 | 618 | "outputs": [], |
619 | 619 | "source": [ |
620 | | - "table_by_pymupdf = True" |
| 620 | + "table_by_pymupdf = False" |
621 | 621 | ] |
622 | 622 | }, |
623 | 623 | { |
|
803 | 803 | "# tables_camleot[0].df.to_markdown()" |
804 | 804 | ] |
805 | 805 | }, |
806 | | - { |
807 | | - "cell_type": "code", |
808 | | - "execution_count": null, |
809 | | - "id": "825d1b1f-5208-4a1d-8ee9-971a493d7bfb", |
810 | | - "metadata": { |
811 | | - "tags": [] |
812 | | - }, |
813 | | - "outputs": [], |
814 | | - "source": [ |
815 | | - "# for image in images:\n", |
816 | | - " \n", |
817 | | - "# img = cv2.imread(image) \n", |
818 | | - "# width, height, _ = img.shape\n", |
819 | | - "# image_token = width*height/750\n", |
820 | | - "# print (f'image: {image}, shape: {img.shape}, image_token_for_claude3: {image_token}' )\n", |
821 | | - " \n", |
822 | | - "# if image_token > 1500:\n", |
823 | | - "# resize_img = cv2.resize(img, (0, 0), fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)\n", |
824 | | - "# print(\" - resize_img.shape = {0}\".format(resize_img.shape))\n", |
825 | | - "# table_image_path = image.replace(\".jpg\", \"-resize.jpg\")\n", |
826 | | - "# cv2.imwrite(table_image_path, resize_img)\n", |
827 | | - "# os.remove(image)\n", |
828 | | - " \n", |
829 | | - "# images = glob(os.path.join(image_path, \"*\"))" |
830 | | - ] |
831 | | - }, |
832 | 806 | { |
833 | 807 | "cell_type": "markdown", |
834 | 808 | "id": "fb16fd2a-1983-462d-94d8-4c62e81ef28c", |
|
1102 | 1076 | "summarize_chain = {\"table\": lambda x:x} | prompt | llm_text | StrOutputParser()" |
1103 | 1077 | ] |
1104 | 1078 | }, |
1105 | | - { |
1106 | | - "cell_type": "code", |
1107 | | - "execution_count": null, |
1108 | | - "id": "7a0bc0a5-7f5e-4bb8-9f0f-bbaf31622448", |
1109 | | - "metadata": { |
1110 | | - "tags": [] |
1111 | | - }, |
1112 | | - "outputs": [], |
1113 | | - "source": [ |
1114 | | - "len(tables), len(docs_table_pymupdf), len(docs_table_llamaparse)" |
1115 | | - ] |
1116 | | - }, |
1117 | 1079 | { |
1118 | 1080 | "cell_type": "code", |
1119 | 1081 | "execution_count": null, |
|
1223 | 1185 | "#tables_preprocessed, images_preprocessed\n" |
1224 | 1186 | ] |
1225 | 1187 | }, |
1226 | | - { |
1227 | | - "cell_type": "code", |
1228 | | - "execution_count": null, |
1229 | | - "id": "d3504e83-d62e-40a9-9cdc-d5fbea2d7029", |
1230 | | - "metadata": { |
1231 | | - "tags": [] |
1232 | | - }, |
1233 | | - "outputs": [], |
1234 | | - "source": [ |
1235 | | - "tables_preprocessed[0]" |
1236 | | - ] |
1237 | | - }, |
1238 | 1188 | { |
1239 | 1189 | "cell_type": "code", |
1240 | 1190 | "execution_count": null, |
|
1314 | 1264 | "outputs": [], |
1315 | 1265 | "source": [ |
1316 | 1266 | "#index_name = \"kb_complex_doc\"\n", |
1317 | | - "index_name = \"summit-workshop-index-unstructured-pymupdf-llama\" #summit-workshop-index-unstructured, #summit-workshop-index-unstructured-pymupdf" |
| 1267 | + "index_name = \"summit-workshop-index\" #summit-workshop-index-unstructured, #summit-workshop-index-unstructured-pymupdf, summit-workshop-index-unstructured-pymupdf-llama" |
1318 | 1268 | ] |
1319 | 1269 | }, |
1320 | 1270 | { |
|
0 commit comments