77from llmstack .common .blocks .data .store .vectorstore import Document
88from llmstack .common .utils .splitter import SpacyTextSplitter
99from llmstack .common .utils .text_extract import ExtraParams , extract_text_from_url
10- from llmstack .common .utils .utils import extract_urls_from_sitemap
1110from llmstack .datasources .handlers .datasource_processor import (
1211 WEAVIATE_SCHEMA ,
1312 DataSourceEntryItem ,
@@ -116,25 +115,13 @@ def get_url_data(
116115
117116 def validate_and_process (self , data : dict ) -> List [DataSourceEntryItem ]:
118117 entry = URLSchema (** data )
119- sitemap_urls = []
120118 # Split urls by newline and then by comma
121119 urls = entry .urls .split ("\n " )
122120 urls = [url .strip ().rstrip () for url_list in [url .split ("," ) for url in urls ] for url in url_list ]
123121 # Filter out empty urls
124122 urls = list (set (list (filter (lambda url : url != "" , urls ))))
125- sitemap_xmls = list (
126- filter (lambda url : url .endswith (".xml" ), urls ),
127- )
128123 # Filter out sitemap.xml
129124 urls = list (filter (lambda url : not url .endswith (".xml" ), urls ))
130- # If sitemap.xml is present, scrape the site to extract urls
131- try :
132- for sitemap_xml in sitemap_xmls :
133- sitmap_xml_urls = extract_urls_from_sitemap (sitemap_xml )
134- for sitmap_xml_url in sitmap_xml_urls :
135- sitemap_urls .append (sitmap_xml_url )
136- except BaseException :
137- logger .exception ("Error in extracting urls from sitemap" )
138125
139126 return list (
140127 map (
@@ -145,7 +132,7 @@ def validate_and_process(self, data: dict) -> List[DataSourceEntryItem]:
145132 "connection_id" : entry .connection_id ,
146133 },
147134 ),
148- urls + sitemap_urls ,
135+ urls ,
149136 ),
150137 )
151138
0 commit comments