Skip to content

Commit 4c2d767

Browse files
kartikpersistent, aashipandya
authored and committed
wikipedia URL input (#424)
* accept only wikipedia links
* added wikipedia link
* added wikilink regex
* wikipedia single url only
* changed the alert message
* wording change
* pushed validation state persist error

Co-authored-by: aashipandya <156318202+aashipandya@users.noreply.github.com>
1 parent 51754e3 commit 4c2d767

File tree

6 files changed

+89
-60
lines changed

6 files changed

+89
-60
lines changed

backend/src/main.py

Lines changed: 24 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -127,34 +127,31 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type
127127
success_count=0
128128
failed_count=0
129129
lst_file_name=[]
130-
queries_list = wiki_query.split(',')
131-
wiki_query_ids, languages = check_url_source(source_type=source_type, queries_list=queries_list)
132-
for query,language in zip(wiki_query_ids, languages):
133-
logging.info(f"Creating source node for {query.strip()}, {language}")
134-
pages = WikipediaLoader(query=query.strip(), lang=language, load_max_docs=1, load_all_available_meta=True).load()
135-
try:
136-
if not pages:
137-
failed_count+=1
138-
continue
139-
obj_source_node = sourceNode()
140-
obj_source_node.file_name = query.strip()
141-
obj_source_node.file_type = 'text'
142-
obj_source_node.file_source = source_type
143-
obj_source_node.file_size = sys.getsizeof(pages[0].page_content)
144-
obj_source_node.total_pages = len(pages)
145-
obj_source_node.model = model
146-
obj_source_node.url = urllib.parse.unquote(pages[0].metadata['source'])
147-
obj_source_node.created_at = datetime.now()
148-
obj_source_node.language = language
149-
graphDb_data_Access = graphDBdataAccess(graph)
150-
graphDb_data_Access.create_source_node(obj_source_node)
151-
success_count+=1
152-
lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url, 'language':obj_source_node.language, 'status':'Success'})
153-
except Exception as e:
154-
failed_count+=1
155-
lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url, 'language':obj_source_node.language, 'status':'Failed'})
130+
#queries_list = wiki_query.split(',')
131+
wiki_query_id, language = check_url_source(source_type=source_type, wiki_query=wiki_query)
132+
logging.info(f"Creating source node for {wiki_query_id.strip()}, {language}")
133+
pages = WikipediaLoader(query=wiki_query_id.strip(), lang=language, load_max_docs=1, load_all_available_meta=True).load()
134+
if pages==None or len(pages)==0:
135+
failed_count+=1
136+
message = f"Unable to read data for given Wikipedia url : {wiki_query}"
137+
raise Exception(message)
138+
else:
139+
obj_source_node = sourceNode()
140+
obj_source_node.file_name = wiki_query_id.strip()
141+
obj_source_node.file_type = 'text'
142+
obj_source_node.file_source = source_type
143+
obj_source_node.file_size = sys.getsizeof(pages[0].page_content)
144+
obj_source_node.total_pages = len(pages)
145+
obj_source_node.model = model
146+
obj_source_node.url = urllib.parse.unquote(pages[0].metadata['source'])
147+
obj_source_node.created_at = datetime.now()
148+
obj_source_node.language = language
149+
graphDb_data_Access = graphDBdataAccess(graph)
150+
graphDb_data_Access.create_source_node(obj_source_node)
151+
success_count+=1
152+
lst_file_name.append({'fileName':obj_source_node.file_name,'fileSize':obj_source_node.file_size,'url':obj_source_node.url, 'language':obj_source_node.language, 'status':'Success'})
156153
return lst_file_name,success_count,failed_count
157-
154+
158155
def extract_graph_from_file_local_file(graph, model, fileName, merged_file_path, allowedNodes, allowedRelationship):
159156

160157
logging.info(f'Process file name :{fileName}')

backend/src/shared/common_fn.py

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,37 +19,36 @@
1919

2020
#watch("neo4j")
2121

22-
23-
def check_url_source(source_type, yt_url:str=None, queries_list:List[str]=None):
24-
languages=[]
22+
def check_url_source(source_type, yt_url:str=None, wiki_query:str=None):
    """Validate a YouTube or Wikipedia URL and extract the parts the loaders use.

    Args:
        source_type: 'youtube' or 'Wikipedia' (compared case-sensitively).
        yt_url: URL to validate when ``source_type`` is 'youtube'.
        wiki_query: URL to validate when ``source_type`` is 'Wikipedia'.
            Exactly one URL is accepted; bare keywords are rejected.

    Returns:
        For 'youtube': ``(value returned by create_youtube_url, '')``.
        For 'Wikipedia': ``(article id, language code)``, both captured from
        the URL, e.g. ``('Albert_Einstein', 'en')``.
        For any other ``source_type`` the function falls through and returns
        ``None``.

    Raises:
        Exception: if the URL is missing or does not match the expected
            pattern. The underlying error is logged and chained as the cause.
    """
    language = ''
    try:
        # Log the URL that is actually relevant for this source type; logging
        # yt_url unconditionally printed "None" for every Wikipedia request.
        incoming_url = wiki_query if source_type == 'Wikipedia' else yt_url
        logging.info(f"incoming URL: {incoming_url}")

        if source_type == 'youtube':
            # Treat a missing URL like an empty one so the caller gets the
            # validation message instead of an AttributeError on None.strip().
            candidate = (yt_url or '').strip()
            # Raw string: the pattern contains regex escapes (\/ \. \w) that
            # are invalid string escapes in a normal literal.
            youtube_url_regex = r'(?:https?:\/\/)?(?:www\.)?youtu\.?be(?:\.com)?\/?.*(?:watch|embed)?(?:.*v=|v\/|\/)([\w\-_]+)\&?'
            if not re.match(youtube_url_regex, candidate):
                raise Exception('Incoming URL is not youtube URL')
            youtube_url = create_youtube_url(candidate)
            logging.info(youtube_url)
            return youtube_url, language

        elif source_type == 'Wikipedia':
            candidate = (wiki_query or '').strip()
            # group(2) is the language sub-domain, group(3) the article id.
            wikipedia_url_regex = r'https?:\/\/(www\.)?([a-zA-Z]{2,3})\.wikipedia\.org\/wiki\/(.*)'
            match = re.search(wikipedia_url_regex, candidate)
            # An empty article id ("…/wiki/") cannot be loaded, so reject it
            # here rather than sending an empty query to the loader.
            if not match or not match.group(3):
                raise Exception(f'Not a valid wikipedia url: {wiki_query} ')
            language = match.group(2)
            wiki_query_id = match.group(3)

            logging.info(f"wikipedia query id = {wiki_query_id}")
            return wiki_query_id, language

        # NOTE(review): any other source_type falls through and returns None,
        # as before; callers that tuple-unpack the result should confirm they
        # only pass the two supported values.
    except Exception as e:
        logging.error(f"Error in recognize URL: {e}")
        # Same exception type and message as before, with the cause chained.
        raise Exception(e) from e

frontend/src/App.css

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -278,23 +278,34 @@
278278
width: 100%;
279279
}
280280

281-
@container (min-height:500px) and (max-height:600px) {
281+
@container (min-height:500px) and (max-height:700px) {
282282
.outline-dashed img {
283-
width: 45px;
283+
width: 40px;
284+
height: auto;
284285
}
285286

286287
.outline-dashed h6 {
287288
font-size: 14px;
288289
}
289290

290291
}
291-
292-
@container (min-height:300px) and (max-height:500px) {
292+
@container (min-height:400px) and (max-height:500px) {
293293
.outline-dashed img {
294294
width: 35px;
295295
height: auto;
296296
}
297297

298+
.outline-dashed h6 {
299+
font-size: 14px;
300+
}
301+
302+
}
303+
@container (max-height:300px) {
304+
.outline-dashed img {
305+
width: 30px;
306+
height: auto;
307+
}
308+
298309
.outline-dashed h6 {
299310
font-size: 12px;
300311
}
@@ -308,4 +319,7 @@
308319
.imageBg >div{
309320
padding: 5px;
310321
}
322+
}
323+
.ndl-dropzone .ndl-dropzone-header{
324+
margin-bottom: 0 !important;
311325
}

frontend/src/components/DropZone.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ const DropZone: FunctionComponent = () => {
178178
return {
179179
...curfile,
180180
status: 'Failed',
181-
type: curfile.type?.split('/')[1]?.toUpperCase() ?? 'PDF',
181+
type: `${file.name.substring(file.name.lastIndexOf('.') + 1, file.name.length).toUpperCase()}`,
182182
};
183183
}
184184
return curfile;
@@ -228,7 +228,7 @@ const DropZone: FunctionComponent = () => {
228228
className='!bg-none dropzoneContainer'
229229
supportedFilesDescription={
230230
<Typography variant='body-small'>
231-
<Flex>
231+
<Flex gap='0'>
232232
<span>Documents, Images, Unstructured</span>
233233
<div className='align-self-center'>
234234
<IconButtonWithToolTip

frontend/src/components/WikipediaModal.tsx

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,25 @@ import { useFileContext } from '../context/UsersFiles';
66
import { v4 as uuidv4 } from 'uuid';
77
import { useCredentials } from '../context/UserCredentials';
88
import { urlScanAPI } from '../services/URLScan';
9+
import { wikiValidation } from '../utils/Utils';
910

1011
const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
1112
const [wikiQuery, setwikiQuery] = useState<string>('');
1213
const [statusMessage, setStatusMessage] = useState<string>('');
1314
const [status, setStatus] = useState<'unknown' | 'success' | 'info' | 'warning' | 'danger'>('unknown');
1415
const { setFilesData, model, filesData } = useFileContext();
1516
const { userCredentials } = useCredentials();
17+
const [isFocused, setisFocused] = useState<boolean>(false);
18+
const [isValid, setValid] = useState<boolean>(false);
1619
const onClose = useCallback(() => {
1720
hideModal();
1821
setwikiQuery('');
1922
setStatus('unknown');
23+
setValid(false)
24+
setisFocused(false)
2025
}, []);
2126

22-
const submitHandler = async () => {
27+
const submitHandler = async (url: string) => {
2328
const defaultValues: CustomFileBase = {
2429
processing: 0,
2530
status: 'New',
@@ -30,7 +35,10 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
3035
fileSource: 'Wikipedia',
3136
processingProgress: undefined,
3237
};
33-
if (wikiQuery.length) {
38+
if (url.trim() != '') {
39+
setValid(wikiValidation(url) && isFocused);
40+
}
41+
if (isValid) {
3442
try {
3543
setStatus('info');
3644
setStatusMessage('Scanning...');
@@ -47,6 +55,8 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
4755
setTimeout(() => {
4856
setStatus('unknown');
4957
setwikiQuery('');
58+
setValid(false)
59+
setisFocused(false)
5060
hideModal();
5161
}, 5000);
5262
return;
@@ -56,13 +66,13 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
5666
if (apiResCheck) {
5767
setStatus('info');
5868
setStatusMessage(
59-
`Successfully Created Source Nodes for ${apiResponse.data.success_count} and Failed for ${apiResponse.data.failed_count} Wikipedia Sources`
69+
`Successfully Created Source Node for ${apiResponse.data.success_count} and Failed for ${apiResponse.data.failed_count} Wikipedia Link`
6070
);
6171
} else if (apiResponse?.data?.success_count) {
62-
setStatusMessage(`Successfully Created Source Nodes for ${apiResponse.data.success_count} Wikipedia Sources`);
72+
setStatusMessage(`Successfully Created Source Node for ${apiResponse.data.success_count} Wikipedia Link`);
6373
} else {
6474
setStatus('danger');
65-
setStatusMessage(`Failed to Create Source Nodes for ${apiResponse.data.failed_count} Wikipedia Sources`);
75+
setStatusMessage(`Failed to Create Source Node for ${apiResponse.data.failed_count} Wikipedia Link`);
6676
}
6777

6878
const copiedFilesData: CustomFile[] = [...filesData];
@@ -96,13 +106,15 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
96106
});
97107
setFilesData(copiedFilesData);
98108
setwikiQuery('');
109+
setValid(false);
110+
setisFocused(false);
99111
} catch (error) {
100112
setStatus('danger');
101113
setStatusMessage('Some Error Occurred or Please Check your Instance Connection');
102114
}
103115
} else {
104116
setStatus('danger');
105-
setStatusMessage('Please Fill the Wikipedia source');
117+
setStatusMessage('Please Fill the Wikipedia Link');
106118
setTimeout(() => {
107119
setStatus('unknown');
108120
}, 5000);
@@ -111,30 +123,34 @@ const WikipediaModal: React.FC<WikipediaModalTypes> = ({ hideModal, open }) => {
111123
setTimeout(() => {
112124
setStatus('unknown');
113125
hideModal();
114-
}, 500);
126+
}, 1000);
115127
};
116128
return (
117129
<CustomModal
118130
open={open}
119131
onClose={onClose}
120132
statusMessage={statusMessage}
121133
setStatus={setStatus}
122-
submitHandler={submitHandler}
134+
submitHandler={() => submitHandler(wikiQuery)}
123135
status={status}
124136
submitLabel='Submit'
125137
>
126138
<div className='w-full inline-block'>
127139
<TextInput
140+
type='url'
128141
id='keyword'
129142
value={wikiQuery}
130143
disabled={false}
131-
label='Wikipedia Keywords'
132-
aria-label='Wikipedia Keywords'
133-
placeholder='Albert Einstein ,Isaac Newton'
144+
label='Wikipedia Link'
145+
aria-label='Wikipedia Link'
146+
placeholder='https://en.wikipedia.org/wiki/Albert_Einstein'
134147
autoFocus
135148
fluid
136149
required
150+
onBlur={() => setValid(wikiValidation(wikiQuery) && isFocused)}
151+
errorText={!isValid && isFocused && 'Please Fill The Valid URL'}
137152
onChange={(e) => {
153+
setisFocused(true);
138154
setwikiQuery(e.target.value);
139155
}}
140156
/>

frontend/src/utils/Utils.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ export const validation = (url: string) => {
1616
return url.trim() != '' && /^s3:\/\/([^/]+)\/?$/.test(url) != false;
1717
};
1818

19+
/**
 * Returns true when `url` is an https link to a single Wikipedia article,
 * e.g. `https://en.wikipedia.org/wiki/Albert_Einstein`.
 *
 * Surrounding whitespace is ignored. The pattern is anchored at both ends so
 * that a string which merely *contains* a Wikipedia link (for example as a
 * query parameter of another site) is rejected, and the article title must be
 * non-empty. No `g` flag is used: a global regex keeps `lastIndex` state
 * between `.test()` calls and is not needed for a yes/no check.
 */
export const wikiValidation = (url: string): boolean => {
  const candidate = url.trim();
  return /^https:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\/wiki\/.+$/.test(candidate);
};
1922
// Status indicator icons to status column
2023
export const statusCheck = (status: string) => {
2124
switch (status) {

0 commit comments

Comments
 (0)