@@ -325,6 +325,90 @@ def bulk_update(
325325 logger .info (f"Bulk update complete: { json .dumps (summary_results )} " )
326326
327327
328+ # Bulk update existing records with embeddings commands
329+
330+
@main.command()
@click.option(
    "-i",
    "--index",
    help="Name of the index where the bulk update to add embeddings is performed.",
)
@click.option(
    "-s",
    "--source",
    type=click.Choice(VALID_SOURCES),
    help=(
        "Source whose primary-aliased index will receive the bulk updated "
        "records with embeddings."
    ),
)
@click.option("-d", "--run-date", help="Run date, formatted as YYYY-MM-DD.")
@click.option("-rid", "--run-id", help="Run ID.")
@click.argument("dataset_path", type=click.Path())
@click.pass_context
def bulk_update_embeddings(
    ctx: click.Context,
    index: str,
    source: str,
    run_date: str,
    run_id: str,
    dataset_path: str,
) -> None:
    """Bulk update existing records in an index with embeddings.

    Records are read from the TIMDEX dataset at DATASET_PATH, filtered by the
    given run date and run ID, and applied as partial updates to the target
    index.
    \f
    Everything after the form feed above is hidden from the Click help output.

    Args:
        ctx: Click context; ``ctx.obj["CLIENT"]`` is passed to the bulk update.
        index: Value of ``--index``; may be unset when ``--source`` is used.
        source: Value of ``--source``; may be unset when ``--index`` is used.
        run_date: Run date used to filter the dataset read.
        run_id: Run ID used to filter the dataset read.
        dataset_path: Location handed to ``TIMDEXDataset``.

    The final target index is whatever ``helpers.validate_bulk_cli_options``
    returns for (index, source, client). A ``BulkIndexingError`` raised by the
    bulk update is logged and not re-raised; the summary line is logged either
    way, with zeroed counters when the update failed.
    """
    client = ctx.obj["CLIENT"]
    index = helpers.validate_bulk_cli_options(index, source, client)

    logger.info(
        f"Bulk updating records with embeddings from dataset '{dataset_path}' "
        f"into '{index}'"
    )

    # Defaults reported in the summary if the bulk update fails part-way.
    update_results = {"updated": 0, "errors": 0, "total": 0}

    td = TIMDEXDataset(location=dataset_path)

    # TODO @ghukill: https://mitlibraries.atlassian.net/browse/USE-143 # noqa: FIX002
    # Remove temporary code and replace with TDA
    # method to read embeddings
    # ==== START TEMPORARY CODE ====
    # The code below reads transformed records from
    # the TIMDEX dataset. To simulate embeddings,
    # which are added to the record post-creation, an iterator
    # of dicts containing only the 'timdex_record_id' and
    # the new field (i.e., what would be the embedding fields)
    # is created. For simulation purposes, the 'alternate_titles'
    # field represents the new field as it is already added
    # to the OpenSearch mapping in config/opensearch_mappings.json.
    # When testing, the user is expected to pass in a source that
    # does not set this field (e.g., libguides).
    # Once TDA has been updated to read/write embeddings
    # from/to the TIMDEX dataset, this code should be replaced
    # with a simple call to read vector embeddings, which should
    # return an iter of dicts representing the embeddings.
    transformed_records = td.read_transformed_records_iter(
        run_date=run_date,
        run_id=run_id,
        action="index",
    )

    # A generator expression keeps the read lazy: update documents are built
    # one at a time as the bulk helper consumes them, instead of materializing
    # a list covering the whole dataset up front.
    records_to_update = (
        {
            "timdex_record_id": record["timdex_record_id"],
            "alternate_titles": [{"kind": "Test", "value": "Test Alternate Title"}],
        }
        for record in transformed_records
    )
    # ==== END TEMPORARY CODE ====
    try:
        update_results.update(tim_os.bulk_update(client, index, records_to_update))
    except BulkIndexingError as exception:
        # A failed bulk update is an error condition, so log it as one rather
        # than at INFO where it reads like routine output.
        logger.error(f"Bulk update with embeddings failed: {exception}")

    logger.info(f"Bulk update with embeddings complete: {json.dumps(update_results)}")
410+
411+
328412@main .command ()
329413@click .option (
330414 "-s" ,
@@ -340,7 +424,12 @@ def bulk_update(
340424 help = "Alias to promote the index to in addition to the primary alias. May "
341425 "be repeated to promote the index to multiple aliases at once." ,
342426)
343- @click .argument ("dataset_path" , type = click .Path ())
427+ @click .argument (
428+ "dataset_path" ,
429+ type = click .Path (),
430+ help = "Location of TIMDEX parquet dataset from which transformed records are read."
431+ "This value can be a local filepath or an S3 URI." ,
432+ )
344433@click .pass_context
345434def reindex_source (
346435 ctx : click .Context ,
@@ -395,3 +484,7 @@ def reindex_source(
395484
396485 summary_results = {"index" : index_results }
397486 logger .info (f"Reindex source complete: { json .dumps (summary_results )} " )
487+
488+
# Run the ``main`` CLI entry point (the object the commands in this file are
# registered on) when the module is executed directly rather than imported.
if __name__ == "__main__":
    main()
0 commit comments