Skip to content

Commit 80c0c7b

Browse files
Update Incremental Indexing to new embeddings workflow (#1359)
1 parent 83bd5ce commit 80c0c7b

File tree

5 files changed

+201
-105
lines changed

5 files changed

+201
-105
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "Update Incremental Indexing to new embeddings workflow"
4+
}

graphrag/index/run/run.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,13 +153,15 @@ async def run_pipeline_with_config(
153153
):
154154
tables_dict[table.workflow] = table.result
155155

156+
progress_reporter.success("Finished running workflows on new documents.")
156157
await update_dataframe_outputs(
157158
dataframe_dict=tables_dict,
158159
storage=storage,
159160
update_storage=update_index_storage,
160161
config=config,
161162
cache=cache,
162163
callbacks=NoopVerbCallbacks(),
164+
progress_reporter=progress_reporter,
163165
)
164166

165167
else:

graphrag/index/run/workflow.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ async def _process_workflow(
119119

120120

121121
def _find_workflow_config(
122-
config: PipelineConfig, workflow_name: str, step: str
122+
config: PipelineConfig, workflow_name: str, step: str | None = None
123123
) -> dict:
124124
"""Find a workflow in the pipeline configuration.
125125
@@ -147,8 +147,6 @@ def _find_workflow_config(
147147
)
148148
raise ValueError(error_message) from err
149149

150-
return (
151-
workflow.config.get(step, {})
152-
if workflow.config and step in workflow.config
153-
else {}
154-
)
150+
if not workflow.config:
151+
return {}
152+
return workflow.config if not step else workflow.config.get(step, {})

graphrag/index/update/entities.py

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111

1212
from graphrag.index.cache.pipeline_cache import PipelineCache
1313
from graphrag.index.config.pipeline import PipelineConfig
14-
from graphrag.index.operations.embed_text import embed_text
1514
from graphrag.index.operations.summarize_descriptions.strategies import (
1615
run_graph_intelligence as run_entity_summarization,
1716
)
@@ -67,8 +66,6 @@ def _group_and_resolve_entities(
6766
"description": lambda x: list(x.astype(str)), # Ensure str
6867
# Concatenate nd.array into a single list
6968
"text_unit_ids": lambda x: ",".join(str(i) for j in x.tolist() for i in j),
70-
# Keep only descriptions where the original value wasn't modified
71-
"description_embedding": lambda x: x.iloc[0] if len(x) == 1 else np.nan,
7269
})
7370
.reset_index()
7471
)
@@ -87,7 +84,6 @@ def _group_and_resolve_entities(
8784
"human_readable_id",
8885
"graph_embedding",
8986
"text_unit_ids",
90-
"description_embedding",
9187
],
9288
]
9389

@@ -141,48 +137,3 @@ async def process_row(row):
141137
entities_df["description"] = results
142138

143139
return entities_df
144-
145-
146-
async def _run_entity_description_embedding(
147-
entities_df: pd.DataFrame,
148-
config: PipelineConfig,
149-
cache: PipelineCache,
150-
callbacks: VerbCallbacks,
151-
) -> pd.DataFrame:
152-
"""Run entity description embedding.
153-
154-
Parameters
155-
----------
156-
entities_df : pd.DataFrame
157-
The entities dataframe.
158-
config : PipelineConfig
159-
The pipeline configuration.
160-
cache : PipelineCache
161-
Pipeline cache used during the embedding process.
162-
callbacks : WorkflowCallbacks
163-
The workflow callbacks.
164-
165-
Returns
166-
-------
167-
pd.DataFrame
168-
The updated entities dataframe with description embeddings.
169-
"""
170-
embed_config = _find_workflow_config(
171-
config, "create_final_entities", "entity_name_description_embed"
172-
)
173-
174-
# Concatenate name and description for embedding
175-
entities_df["name_description"] = (
176-
entities_df["name"] + ":" + entities_df["description"]
177-
)
178-
179-
# Run embedding
180-
entities_df["description_embedding"] = await embed_text(
181-
entities_df,
182-
callbacks,
183-
cache,
184-
embed_column="name_description",
185-
embedding_name="entity.description",
186-
strategy=embed_config.get("strategy", {}),
187-
)
188-
return entities_df.drop(columns=["name_description"])

0 commit comments

Comments
 (0)