@@ -46,46 +46,58 @@ def gcs_loader_func(file_path):
     return loader
 
 def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None):
-    nltk.download('punkt')
-    nltk.download('averaged_perceptron_tagger')
-    if gcs_bucket_folder is not None and gcs_bucket_folder.strip() != "":
-        if gcs_bucket_folder.endswith('/'):
-            blob_name = gcs_bucket_folder + gcs_blob_filename
+
+    nltk.data.path.append("/usr/local/nltk_data")
+    nltk.data.path.append(os.path.expanduser("~/.nltk_data"))
+    try:
+        nltk.data.find("tokenizers/punkt")
+    except LookupError:
+        for resource in ["punkt", "averaged_perceptron_tagger"]:
+            try:
+                nltk.data.find(f"tokenizers/{resource}" if resource == "punkt" else f"taggers/{resource}")
+            except LookupError:
+                logging.info(f"Downloading NLTK resource: {resource}")
+                nltk.download(resource, download_dir=os.path.expanduser("~/.nltk_data"))
+
+    logging.info("NLTK resources downloaded successfully.")
+    if gcs_bucket_folder is not None and gcs_bucket_folder.strip() != "":
+        if gcs_bucket_folder.endswith('/'):
+            blob_name = gcs_bucket_folder + gcs_blob_filename
+        else:
+            blob_name = gcs_bucket_folder + '/' + gcs_blob_filename
     else:
-            blob_name = gcs_bucket_folder + '/' + gcs_blob_filename
-    else:
-        blob_name = gcs_blob_filename
-
-    logging.info(f"GCS project_id : {gcs_project_id}")
-
-    if access_token is None:
-        storage_client = storage.Client(project=gcs_project_id)
-        bucket = storage_client.bucket(gcs_bucket_name)
-        blob = bucket.blob(blob_name)
+        blob_name = gcs_blob_filename
 
-        if blob.exists():
-            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
-            pages = loader.load()
-        else:
-            raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
-    else:
-        creds = Credentials(access_token)
-        storage_client = storage.Client(project=gcs_project_id, credentials=creds)
+    logging.info(f"GCS project_id : {gcs_project_id}")
 
-        bucket = storage_client.bucket(gcs_bucket_name)
-        blob = bucket.blob(blob_name)
-        if blob.exists():
-            content = blob.download_as_bytes()
-            pdf_file = io.BytesIO(content)
-            pdf_reader = PdfReader(pdf_file)
-            # Extract text from all pages
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text()
-            pages = [Document(page_content=text)]
+    if access_token is None:
+        storage_client = storage.Client(project=gcs_project_id)
+        bucket = storage_client.bucket(gcs_bucket_name)
+        blob = bucket.blob(blob_name)
+
+        if blob.exists():
+            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func)
+            pages = loader.load()
+        else:
+            raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.')
     else:
-        raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}')
-    return gcs_blob_filename, pages
+        creds = Credentials(access_token)
+        storage_client = storage.Client(project=gcs_project_id, credentials=creds)
+
+        bucket = storage_client.bucket(gcs_bucket_name)
+        blob = bucket.blob(blob_name)
+        if blob.exists():
+            content = blob.download_as_bytes()
+            pdf_file = io.BytesIO(content)
+            pdf_reader = PdfReader(pdf_file)
+            # Extract text from all pages
+            text = ""
+            for page in pdf_reader.pages:
+                text += page.extract_text()
+            pages = [Document(page_content=text)]
+        else:
+            raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}')
+    return gcs_blob_filename, pages
 
 def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name, folder_name_sha1_hashed):
     try:
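The key behavioral change in the hunk above: the old code called `nltk.download()` unconditionally on every invocation, while the new code probes `nltk.data.find()` first and only downloads on a `LookupError`, after adding `/usr/local/nltk_data` and `~/.nltk_data` to the search path. A minimal standalone sketch of that check-then-download pattern, for trying the behavior outside the app (the `ensure_nltk_resources` wrapper is illustrative, not part of the PR; resource names and the `download_dir` mirror the diff):

```python
import os
import nltk

def ensure_nltk_resources():
    # Search a user-writable location in addition to the defaults.
    nltk.data.path.append(os.path.expanduser("~/.nltk_data"))
    # Probe each resource; download only the ones that are missing.
    for lookup_path, resource in [
        ("tokenizers/punkt", "punkt"),
        ("taggers/averaged_perceptron_tagger", "averaged_perceptron_tagger"),
    ]:
        try:
            nltk.data.find(lookup_path)  # raises LookupError if not installed
        except LookupError:
            nltk.download(resource, download_dir=os.path.expanduser("~/.nltk_data"))

ensure_nltk_resources()
```

This avoids a network round trip per request and works in containers where the default NLTK data directory is read-only but the home directory is writable.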
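The refactor also separates the two fetch paths by auth mode: with no `access_token`, the function loads via LangChain's `GCSFileLoader` using the project's default credentials; with a token, it builds a `storage.Client` from `Credentials(access_token)`, downloads the blob bytes, and extracts text with `PdfReader`. A hypothetical call that exercises the default-credentials path (all project, bucket, and file names below are placeholders, and the call requires working GCP credentials):

```python
# Placeholder values, not taken from the PR.
file_name, pages = get_documents_from_gcs(
    gcs_project_id="my-gcp-project",
    gcs_bucket_name="my-bucket",
    gcs_bucket_folder="uploads/",   # trailing '/' exercises the endswith branch
    gcs_blob_filename="report.pdf",
    access_token=None,              # None -> GCSFileLoader path; a token -> PyPDF path
)
print(file_name, len(pages))
```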