Skip to content
This repository was archived by the owner on Oct 25, 2024. It is now read-only.

Commit 6042826

Browse files
xmx-521ClarkChin08lvliang-intelpre-commit-ci[bot]
authored
[NeuralChat] Support pptx format for RAG (#1447)
* enable rag for pptx format file Signed-off-by: Chen Xi <xi2.chen@intel.com> Signed-off-by: Manxin Xu <1426356297@qq.com> --------- Signed-off-by: Chen Xi <xi2.chen@intel.com> Signed-off-by: lvliang-intel <liang1.lv@intel.com> Signed-off-by: Manxin Xu <1426356297@qq.com> Co-authored-by: Chen Xi <xi2.chen@intel.com> Co-authored-by: lvliang-intel <liang1.lv@intel.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent cfb19e6 commit 6042826

File tree

5 files changed

+38
-3
lines changed

5 files changed

+38
-3
lines changed
Binary file not shown.

intel_extension_for_transformers/neural_chat/pipeline/plugins/retrieval/parser/context_utils.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import pandas as pd
2020
import re, json
2121
from langchain_community.document_loaders import UnstructuredMarkdownLoader
22+
from langchain_community.document_loaders import UnstructuredPowerPointLoader
2223
from docx import Document as DDocument
2324
from bs4 import BeautifulSoup
2425
import fitz
@@ -112,6 +113,13 @@ def read_md(md_path):
112113
return text
113114

114115

116+
def read_pptx(pptx_path):
117+
"""Read pptx file."""
118+
loader = UnstructuredPowerPointLoader(pptx_path)
119+
text = loader.load()[0].page_content
120+
return text
121+
122+
115123
def load_json(input, process, max_length, min_length):
116124
"""Load and process json file."""
117125
data = []
@@ -226,6 +234,8 @@ def load_unstructured_data(input):
226234
text = read_txt(input)
227235
elif input.endswith("md"):
228236
text = read_md(input)
237+
elif input.endswith("pptx"):
238+
text = read_pptx(input)
229239

230240
text = text.replace('\n', ' ')
231241
text = text.replace('\n\n', ' ')

intel_extension_for_transformers/neural_chat/pipeline/plugins/retrieval/parser/parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def load(self, input, **kwargs):
7070
def parse_document(self, input):
7171
"""Parse the uploaded file."""
7272
if input.endswith("pdf") or input.endswith("docx") or input.endswith("html") \
73-
or input.endswith("txt") or input.endswith("md"):
73+
or input.endswith("txt") or input.endswith("md") or input.endswith("pptx"):
7474
content = load_unstructured_data(input)
7575
if self.process:
7676
chuck = get_chuck_data(content, self.max_chuck_size, self.min_chuck_size, input)
@@ -110,7 +110,7 @@ def batch_parse_document(self, input):
110110
for dirpath, dirnames, filenames in os.walk(input):
111111
for filename in filenames:
112112
if filename.endswith("pdf") or filename.endswith("docx") or filename.endswith("html") \
113-
or filename.endswith("txt") or filename.endswith("md"):
113+
or filename.endswith("txt") or filename.endswith("md") or filename.endswith("pptx"):
114114
content = load_unstructured_data(os.path.join(dirpath, filename))
115115
if self.process:
116116
chuck = get_chuck_data(content, self.max_chuck_size, self.min_chuck_size, input)

intel_extension_for_transformers/neural_chat/tests/ci/plugins/retrieval/test_rag.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,31 @@ def test_retrieval_docx(self):
9898
self.assertIsNotNone(response)
9999
plugins.retrieval.enable = False
100100

101+
class TestChatbotBuilder_pptx(unittest.TestCase):
102+
def setUp(self):
103+
if os.path.exists("test_pptx"):
104+
shutil.rmtree("test_pptx", ignore_errors=True)
105+
return super().setUp()
106+
107+
def tearDown(self) -> None:
108+
if os.path.exists("test_pptx"):
109+
shutil.rmtree("test_pptx", ignore_errors=True)
110+
return super().tearDown()
111+
112+
def test_retrieval_pptx(self):
113+
plugins.retrieval.enable = True
114+
plugins.retrieval.args["input_path"] = "../assets/docs/sample.pptx"
115+
plugins.retrieval.args["persist_directory"] = "./test_pptx"
116+
plugins.retrieval.args["retrieval_type"] = 'default'
117+
config = PipelineConfig(model_name_or_path="facebook/opt-125m",
118+
plugins=plugins)
119+
chatbot = build_chatbot(config)
120+
response = chatbot.predict("How many cores does the Intel Xeon Platinum 8480+ Processor have in total?")
121+
print(response)
122+
plugins.retrieval.args["persist_directory"] = "./output"
123+
self.assertIsNotNone(response)
124+
plugins.retrieval.enable = False
125+
101126
class TestChatbotBuilder_xlsx(unittest.TestCase):
102127
def setUp(self):
103128
if os.path.exists("test_xlsx"):

intel_extension_for_transformers/neural_chat/tests/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ torchvision==0.18.0
8585
tqdm
8686
transformers==4.38.0
8787
transformers_stream_generator
88-
unstructured
88+
unstructured[all-docs]
8989
urllib3
9090
uvicorn
9191
vector_quantize_pytorch

0 commit comments

Comments
 (0)