
Commit 128be85

[wip]
1 parent 142f055 commit 128be85

4 files changed: +91 -25 lines changed


tests/test_opensearch.py

Lines changed: 4 additions & 4 deletions
@@ -459,7 +459,7 @@ def test_bulk_index_creates_records(
     test_opensearch_client, five_valid_index_libguides_records
 ):
     assert tim_os.bulk_index(
-        test_opensearch_client, "test-index", five_valid_index_libguides_records
+        test_opensearch_client, "test-index", five_valid_index_libguides_records, "index"
     ) == {
         "created": 5,
         "updated": 0,
@@ -474,22 +474,22 @@ def test_bulk_index_updates_records(
 ):
     monkeypatch.setenv("STATUS_UPDATE_INTERVAL", "5")
     assert tim_os.bulk_index(
-        test_opensearch_client, "test-index", five_valid_index_libguides_records
+        test_opensearch_client, "test-index", five_valid_index_libguides_records, "index"
     ) == {
         "created": 0,
         "updated": 5,
         "errors": 0,
         "total": 5,
     }
-    assert "Status update: 5 records indexed so far!" in caplog.text
+    assert "Status update: 5 records processed so far!" in caplog.text


 @my_vcr.use_cassette("opensearch/bulk_index_record_mapper_parsing_error.yaml")
 def test_bulk_index_logs_mapper_parsing_errors(
     caplog, test_opensearch_client, one_invalid_index_libguides_records
 ):
     assert tim_os.bulk_index(
-        test_opensearch_client, "test-index", one_invalid_index_libguides_records
+        test_opensearch_client, "test-index", one_invalid_index_libguides_records, "index"
     ) == {
         "created": 0,
         "updated": 0,

tim/cli.py

Lines changed: 65 additions & 9 deletions
@@ -308,7 +308,9 @@ def bulk_update(
         action="index",
     )
     try:
-        index_results.update(tim_os.bulk_index(client, index, records_to_index))
+        index_results.update(
+            tim_os.bulk_index(client, index, records_to_index, action="index")
+        )
     except BulkIndexingError as exception:
         logger.info(f"Bulk indexing failed: {exception}")

@@ -343,29 +345,72 @@ def bulk_update(
         "records with embeddings."
     ),
 )
+@click.option("-d", "--run-date", help="Run date, formatted as YYYY-MM-DD.")
 @click.option("-rid", "--run-id", help="Run ID.")
 @click.argument("dataset_path", type=click.Path())
 @click.pass_context
 def bulk_update_embeddings(
-    ctx: click.Context, index: str, source: str, run_id: str, dataset_path: str
-):
+    ctx: click.Context,
+    index: str,
+    source: str,
+    run_date: str,
+    run_id: str,
+    dataset_path: str,
+) -> None:
     client = ctx.obj["CLIENT"]
     index = helpers.validate_bulk_cli_options(index, source, client)

     logger.info(
-        f"Bulk updating records with embeddings from dataset '{dataset_path}' into '{index}'"
+        f"Bulk updating records with embeddings from dataset '{dataset_path}' "
+        f"into '{index}'"
     )

     update_results = {"updated": 0, "errors": 0, "total": 0}

     td = TIMDEXDataset(location=dataset_path)

-    # TODO: update TDA to read embeddings
+    # TODO @ghukill: https://mitlibraries.atlassian.net/browse/USE-143  # noqa: FIX002
+    # Remove temporary code and replace with TDA method to read embeddings.
+    # ==== START TEMPORARY CODE ====
+    # The code below reads transformed records from the TIMDEX dataset. To
+    # simulate embeddings, which are added to the record post-creation, a
+    # list of dicts containing only the 'timdex_record_id' and the new field
+    # (i.e., what would be the embedding fields) is created. For simulation
+    # purposes, the 'alternate_titles' field represents the new field, as it
+    # is already added to the OpenSearch mapping in
+    # config/opensearch_mappings.json. When testing, the user is expected to
+    # pass in a source that does not set this field (e.g., libguides).
+    # Once TDA has been updated to read/write embeddings from/to the TIMDEX
+    # dataset, this code should be replaced with a simple call to read vector
+    # embeddings, which should return an iter of dicts representing the
+    # embeddings.
+    transformed_records = td.read_transformed_records_iter(
+        run_date=run_date,
+        run_id=run_id,
+        action="index",
+    )

+    records_to_update = iter(
+        [
+            {
+                "timdex_record_id": record["timdex_record_id"],
+                "alternate_titles": [{"kind": "Test", "value": "Test Alternate Title"}],
+            }
+            for record in transformed_records
+        ]
+    )
+    # ==== END TEMPORARY CODE ====
     try:
-        update_results.update(tim_os.bulk_index(client, index, records_to_index))
+        update_results.update(
+            tim_os.bulk_index(client, index, records_to_update, action="update")
+        )
     except BulkIndexingError as exception:
-        logger.info(f"Bulk indexing failed: {exception}")
+        logger.info(f"Bulk update with embeddings failed: {exception}")
+
+    logger.info(f"Bulk update with embeddings complete: {json.dumps(update_results)}")


 @main.command()
@@ -383,7 +428,12 @@ def bulk_update_embeddings(
     help="Alias to promote the index to in addition to the primary alias. May "
     "be repeated to promote the index to multiple aliases at once.",
 )
-@click.argument("dataset_path", type=click.Path())
+@click.argument(
+    "dataset_path",
+    type=click.Path(),
+    help="Location of TIMDEX parquet dataset from which transformed records are read. "
+    "This value can be a local filepath or an S3 URI.",
+)
 @click.pass_context
 def reindex_source(
     ctx: click.Context,
@@ -432,9 +482,15 @@ def reindex_source(
         action="index",
     )
     try:
-        index_results.update(tim_os.bulk_index(client, index, records_to_index))
+        index_results.update(
+            tim_os.bulk_index(client, index, records_to_index, action="index")
+        )
     except BulkIndexingError as exception:
         logger.info(f"Bulk indexing failed: {exception}")

     summary_results = {"index": index_results}
     logger.info(f"Reindex source complete: {json.dumps(summary_results)}")
+
+
+if __name__ == "__main__":
+    main()
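
For reference, a hypothetical invocation of the new command (Click derives the dash-separated command name from the function name; the --index and --source options are defined just above the visible hunk, and all values here are illustrative):

tim bulk-update-embeddings --index test-index --source libguides \
    --run-date 2024-06-01 --run-id abc123 s3://timdex-dataset/dataset/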

tim/helpers.py

Lines changed: 7 additions & 2 deletions
@@ -50,8 +50,13 @@ def generate_bulk_actions(
             "_index": index,
             "_id": record["timdex_record_id"],
         }
-        if action != "delete":
-            doc["_source"] = record
+
+        match action:
+            case "update":
+                doc["doc"] = record
+            case _ if action != "delete":
+                doc["_source"] = record
+
         yield doc
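
For illustration, the bulk action documents this generator yields would look roughly like the following; the _op_type key is assumed from the opensearch-py bulk helpers convention and is not visible in this hunk, and the record content is made up:

record = {"timdex_record_id": "abc123", "title": "Example"}

# action="index": the full record is sent as the document source
{"_op_type": "index", "_index": "test-index", "_id": "abc123", "_source": record}

# action="update": the record is sent as a partial document under "doc",
# so only the supplied fields change on the existing document
{"_op_type": "update", "_index": "test-index", "_id": "abc123", "doc": record}

# action="delete": metadata only, no document body
{"_op_type": "delete", "_index": "test-index", "_id": "abc123"}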

tim/opensearch.py

Lines changed: 15 additions & 10 deletions
@@ -359,22 +359,26 @@ def bulk_delete(
     return result


-def bulk_index(client: OpenSearch, index: str, records: Iterator[dict]) -> dict[str, int]:
+def bulk_index(
+    client: OpenSearch, index: str, records: Iterator[dict], action: str
+) -> dict[str, int]:
     """Indexes records into an existing index using the streaming bulk helper.

-    This action function uses the OpenSearch "index" action, which is a
-    combination of create and update: if a record with the same _id exists in the
-    index, it will be updated. If it does not exist, the record will be indexed as a
-    new document.
+    This method uses the OpenSearch "index" and "update" operations.
+    - Setting `action` to "index" will either create or update a record.
+      If a record with the same _id exists in the index, it will be updated;
+      if it does not exist, the record will be added as a new document.
+    - Setting `action` to "update" will update a document only if it exists
+      in the index. Otherwise, an error is raised.

-    If an error occurs during record indexing, it will be logged and bulk indexing will
-    continue until all records have been processed.
+    If an error occurs during the operation, it will be logged, and the bulk
+    operation will continue until all records have been processed.

     Returns total sums of: records created, records updated, errors, and total records
     processed.
     """
     result = {"created": 0, "updated": 0, "errors": 0, "total": 0}
-    actions = helpers.generate_bulk_actions(index, records, "index")
+    actions = helpers.generate_bulk_actions(index, records, action)
     responses = streaming_bulk(
         client,
         actions,
@@ -400,13 +404,14 @@ def bulk_index(
             result["updated"] += 1
         else:
             logger.error(
-                "Something unexpected happened during ingest. Bulk index response: %s",
+                "Something unexpected happened during ingest. "
+                f"Bulk {action} response: %s",
                 json.dumps(response),
             )
             result["errors"] += 1
         result["total"] += 1
         if result["total"] % int(os.getenv("STATUS_UPDATE_INTERVAL", "1000")) == 0:
-            logger.info("Status update: %s records indexed so far!", result["total"])
+            logger.info("Status update: %s records processed so far!", result["total"])
     logger.info("All records ingested, refreshing index.")
     response = client.indices.refresh(
         index=index,
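
A minimal usage sketch of the updated bulk_index signature, assuming an existing OpenSearch client and index (record contents and result counts are illustrative):

records = iter([{"timdex_record_id": "abc123", "title": "Example"}])

# action="index": creates the document if absent, updates it if present
results = tim_os.bulk_index(client, "test-index", records, action="index")
# e.g. {"created": 1, "updated": 0, "errors": 0, "total": 1}

# action="update": modifies only documents that already exist; a record
# whose _id is not in the index surfaces as an error in the result counts
partial = iter([{"timdex_record_id": "abc123", "alternate_titles": []}])
results = tim_os.bulk_index(client, "test-index", partial, action="update")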
