Update Incremental Indexing to new embeddings workflow (#1359)

AlonsoGuevara · web-flow · commit 80c0c7bdd12d · 2024-11-05T16:54:02.000-06:00
diff --git a/.semversioner/next-release/patch-20241105223157965625.json b/.semversioner/next-release/patch-20241105223157965625.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "Update Incremental Indexing to new embeddings workflow"
+}
diff --git a/graphrag/index/run/run.py b/graphrag/index/run/run.py
@@ -153,13 +153,15 @@ async def run_pipeline_with_config(
         ):
             tables_dict[table.workflow] = table.result
 
+        progress_reporter.success("Finished running workflows on new documents.")
         await update_dataframe_outputs(
             dataframe_dict=tables_dict,
             storage=storage,
             update_storage=update_index_storage,
             config=config,
             cache=cache,
             callbacks=NoopVerbCallbacks(),
+            progress_reporter=progress_reporter,
         )
 
     else:
diff --git a/graphrag/index/run/workflow.py b/graphrag/index/run/workflow.py
@@ -119,7 +119,7 @@ async def _process_workflow(
 
 
 def _find_workflow_config(
-    config: PipelineConfig, workflow_name: str, step: str
+    config: PipelineConfig, workflow_name: str, step: str | None = None
 ) -> dict:
     """Find a workflow in the pipeline configuration.
 
@@ -147,8 +147,6 @@ def _find_workflow_config(
         )
         raise ValueError(error_message) from err
 
-    return (
-        workflow.config.get(step, {})
-        if workflow.config and step in workflow.config
-        else {}
-    )
+    if not workflow.config:
+        return {}
+    return workflow.config if not step else workflow.config.get(step, {})
diff --git a/graphrag/index/update/entities.py b/graphrag/index/update/entities.py
@@ -11,7 +11,6 @@
 
 from graphrag.index.cache.pipeline_cache import PipelineCache
 from graphrag.index.config.pipeline import PipelineConfig
-from graphrag.index.operations.embed_text import embed_text
 from graphrag.index.operations.summarize_descriptions.strategies import (
     run_graph_intelligence as run_entity_summarization,
 )
@@ -67,8 +66,6 @@ def _group_and_resolve_entities(
             "description": lambda x: list(x.astype(str)),  # Ensure str
             # Concatenate nd.array into a single list
             "text_unit_ids": lambda x: ",".join(str(i) for j in x.tolist() for i in j),
-            # Keep only descriptions where the original value wasn't modified
-            "description_embedding": lambda x: x.iloc[0] if len(x) == 1 else np.nan,
         })
         .reset_index()
     )
@@ -87,7 +84,6 @@ def _group_and_resolve_entities(
             "human_readable_id",
             "graph_embedding",
             "text_unit_ids",
-            "description_embedding",
         ],
     ]
 
@@ -141,48 +137,3 @@ async def process_row(row):
     entities_df["description"] = results
 
     return entities_df
-
-
-async def _run_entity_description_embedding(
-    entities_df: pd.DataFrame,
-    config: PipelineConfig,
-    cache: PipelineCache,
-    callbacks: VerbCallbacks,
-) -> pd.DataFrame:
-    """Run entity description embedding.
-
-    Parameters
-    ----------
-    entities_df : pd.DataFrame
-        The entities dataframe.
-    config : PipelineConfig
-        The pipeline configuration.
-    cache : PipelineCache
-        Pipeline cache used during the embedding process.
-    callbacks : WorkflowCallbacks
-        The workflow callbacks.
-
-    Returns
-    -------
-    pd.DataFrame
-        The updated entities dataframe with description embeddings.
-    """
-    embed_config = _find_workflow_config(
-        config, "create_final_entities", "entity_name_description_embed"
-    )
-
-    # Concatenate name and description for embedding
-    entities_df["name_description"] = (
-        entities_df["name"] + ":" + entities_df["description"]
-    )
-
-    # Run embedding
-    entities_df["description_embedding"] = await embed_text(
-        entities_df,
-        callbacks,
-        cache,
-        embed_column="name_description",
-        embedding_name="entity.description",
-        strategy=embed_config.get("strategy", {}),
-    )
-    return entities_df.drop(columns=["name_description"])
diff --git a/graphrag/index/update/incremental_index.py b/graphrag/index/update/incremental_index.py

-Original file line number
+Diff line change
@@ -0,0 +1,4 @@
 +{
 +  "type": "patch",
 +  "description": "Update Incremental Indexing to new embeddings workflow"
 +}