Skip to content

Commit 61a309b

Browse files
authored
Incremental model alignment (#1766)
* Used shared schema lists for all final columns
* Semver
1 parent 0144b3f commit 61a309b

File tree

17 files changed

+154
-162
lines changed

17 files changed

+154
-162
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "Use shared schema for final outputs."
4+
}

docs/examples_notebooks/index_migration_to_v2.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
},
6666
{
6767
"cell_type": "code",
68-
"execution_count": 4,
68+
"execution_count": null,
6969
"metadata": {},
7070
"outputs": [],
7171
"source": [
@@ -96,7 +96,7 @@
9696
" final_nodes.loc[:, [\"id\", \"degree\", \"x\", \"y\"]].groupby(\"id\").first().reset_index()\n",
9797
")\n",
9898
"final_entities = final_entities.merge(graph_props, on=\"id\", how=\"left\")\n",
99-
"# we're also persistint the frequency column\n",
99+
"# we're also persisting the frequency column\n",
100100
"final_entities[\"frequency\"] = final_entities[\"text_unit_ids\"].count()\n",
101101
"\n",
102102
"\n",

graphrag/data_model/schemas.py

Lines changed: 112 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,20 @@
11
# Copyright (c) 2024 Microsoft Corporation.
22
# Licensed under the MIT License
3-
"""Common field name definitions for community reports."""
3+
"""Common field name definitions for data frames."""
44

55
ID = "id"
66
SHORT_ID = "human_readable_id"
77
TITLE = "title"
88
DESCRIPTION = "description"
99

10+
TYPE = "type"
11+
1012
# POST-PREP NODE TABLE SCHEMA
1113
NODE_DEGREE = "degree"
1214
NODE_FREQUENCY = "frequency"
1315
NODE_DETAILS = "node_details"
14-
15-
NODE_PARENT_COMMUNITY = "parent_community"
16+
NODE_X = "x"
17+
NODE_Y = "y"
1618

1719
# POST-PREP EDGE TABLE SCHEMA
1820
EDGE_SOURCE = "source"
@@ -23,13 +25,11 @@
2325

2426
# POST-PREP CLAIM TABLE SCHEMA
2527
CLAIM_SUBJECT = "subject_id"
26-
CLAIM_TYPE = "type"
2728
CLAIM_STATUS = "status"
2829
CLAIM_DETAILS = "claim_details"
2930

3031
# COMMUNITY HIERARCHY TABLE SCHEMA
3132
SUB_COMMUNITY = "sub_community"
32-
COMMUNITY_LEVEL = "level"
3333

3434
# COMMUNITY CONTEXT TABLE SCHEMA
3535
ALL_CONTEXT = "all_context"
@@ -40,6 +40,8 @@
4040
# COMMUNITY REPORT TABLE SCHEMA
4141
COMMUNITY_ID = "community"
4242
COMMUNITY_LEVEL = "level"
43+
COMMUNITY_PARENT = "parent"
44+
COMMUNITY_CHILDREN = "children"
4345
TITLE = "title"
4446
SUMMARY = "summary"
4547
FINDINGS = "findings"
@@ -48,9 +50,114 @@
4850
FULL_CONTENT = "full_content"
4951
FULL_CONTENT_JSON = "full_content_json"
5052

53+
ENTITY_IDS = "entity_ids"
54+
RELATIONSHIP_IDS = "relationship_ids"
5155
TEXT_UNIT_IDS = "text_unit_ids"
56+
COVARIATE_IDS = "covariate_ids"
57+
DOCUMENT_IDS = "document_ids"
58+
59+
PERIOD = "period"
60+
SIZE = "size"
5261

5362
# text units
5463
ENTITY_DEGREE = "entity_degree"
5564
ALL_DETAILS = "all_details"
5665
TEXT = "text"
66+
N_TOKENS = "n_tokens"
67+
68+
CREATION_DATE = "creation_date"
69+
METADATA = "metadata"
70+
71+
# the following lists define the final content and ordering of columns in the data model parquet outputs
72+
ENTITIES_FINAL_COLUMNS = [
73+
ID,
74+
SHORT_ID,
75+
TITLE,
76+
TYPE,
77+
DESCRIPTION,
78+
TEXT_UNIT_IDS,
79+
NODE_FREQUENCY,
80+
NODE_DEGREE,
81+
NODE_X,
82+
NODE_Y,
83+
]
84+
85+
RELATIONSHIPS_FINAL_COLUMNS = [
86+
ID,
87+
SHORT_ID,
88+
EDGE_SOURCE,
89+
EDGE_TARGET,
90+
DESCRIPTION,
91+
EDGE_WEIGHT,
92+
EDGE_DEGREE,
93+
TEXT_UNIT_IDS,
94+
]
95+
96+
COMMUNITIES_FINAL_COLUMNS = [
97+
ID,
98+
SHORT_ID,
99+
COMMUNITY_ID,
100+
COMMUNITY_LEVEL,
101+
COMMUNITY_PARENT,
102+
COMMUNITY_CHILDREN,
103+
TITLE,
104+
ENTITY_IDS,
105+
RELATIONSHIP_IDS,
106+
TEXT_UNIT_IDS,
107+
PERIOD,
108+
SIZE,
109+
]
110+
111+
COMMUNITY_REPORTS_FINAL_COLUMNS = [
112+
ID,
113+
SHORT_ID,
114+
COMMUNITY_ID,
115+
COMMUNITY_LEVEL,
116+
COMMUNITY_PARENT,
117+
COMMUNITY_CHILDREN,
118+
TITLE,
119+
SUMMARY,
120+
FULL_CONTENT,
121+
RATING,
122+
EXPLANATION,
123+
FINDINGS,
124+
FULL_CONTENT_JSON,
125+
PERIOD,
126+
SIZE,
127+
]
128+
129+
COVARIATES_FINAL_COLUMNS = [
130+
ID,
131+
SHORT_ID,
132+
"covariate_type",
133+
TYPE,
134+
DESCRIPTION,
135+
"subject_id",
136+
"object_id",
137+
"status",
138+
"start_date",
139+
"end_date",
140+
"source_text",
141+
"text_unit_id",
142+
]
143+
144+
TEXT_UNITS_FINAL_COLUMNS = [
145+
ID,
146+
SHORT_ID,
147+
TEXT,
148+
N_TOKENS,
149+
DOCUMENT_IDS,
150+
ENTITY_IDS,
151+
RELATIONSHIP_IDS,
152+
COVARIATE_IDS,
153+
]
154+
155+
DOCUMENTS_FINAL_COLUMNS = [
156+
ID,
157+
SHORT_ID,
158+
TITLE,
159+
TEXT,
160+
TEXT_UNIT_IDS,
161+
CREATION_DATE,
162+
METADATA,
163+
]

graphrag/index/operations/finalize_community_reports.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
import pandas as pd
99

10+
from graphrag.data_model.schemas import COMMUNITY_REPORTS_FINAL_COLUMNS
11+
1012

1113
def finalize_community_reports(
1214
reports: pd.DataFrame,
@@ -27,21 +29,5 @@ def finalize_community_reports(
2729

2830
return community_reports.loc[
2931
:,
30-
[
31-
"id",
32-
"human_readable_id",
33-
"community",
34-
"level",
35-
"parent",
36-
"children",
37-
"title",
38-
"summary",
39-
"full_content",
40-
"rank",
41-
"rank_explanation",
42-
"findings",
43-
"full_content_json",
44-
"period",
45-
"size",
46-
],
32+
COMMUNITY_REPORTS_FINAL_COLUMNS,
4733
]

graphrag/index/operations/finalize_entities.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1111
from graphrag.config.models.embed_graph_config import EmbedGraphConfig
12+
from graphrag.data_model.schemas import ENTITIES_FINAL_COLUMNS
1213
from graphrag.index.operations.compute_degree import compute_degree
1314
from graphrag.index.operations.create_graph import create_graph
1415
from graphrag.index.operations.embed_graph.embed_graph import embed_graph
@@ -52,16 +53,5 @@ def finalize_entities(
5253
)
5354
return final_entities.loc[
5455
:,
55-
[
56-
"id",
57-
"human_readable_id",
58-
"title",
59-
"type",
60-
"description",
61-
"text_unit_ids",
62-
"frequency",
63-
"degree",
64-
"x",
65-
"y",
66-
],
56+
ENTITIES_FINAL_COLUMNS,
6757
]

graphrag/index/operations/finalize_relationships.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import pandas as pd
99

10+
from graphrag.data_model.schemas import RELATIONSHIPS_FINAL_COLUMNS
1011
from graphrag.index.operations.compute_degree import compute_degree
1112
from graphrag.index.operations.compute_edge_combined_degree import (
1213
compute_edge_combined_degree,
@@ -39,14 +40,5 @@ def finalize_relationships(
3940

4041
return final_relationships.loc[
4142
:,
42-
[
43-
"id",
44-
"human_readable_id",
45-
"source",
46-
"target",
47-
"description",
48-
"weight",
49-
"combined_degree",
50-
"text_unit_ids",
51-
],
43+
RELATIONSHIPS_FINAL_COLUMNS,
5244
]

graphrag/index/operations/summarize_communities/strategies.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ async def _run_extractor(
7878
level=level,
7979
rank=report.rating,
8080
title=report.title,
81-
rank_explanation=report.rating_explanation,
81+
rating_explanation=report.rating_explanation,
8282
summary=report.summary,
8383
findings=[
8484
Finding(explanation=f.explanation, summary=f.summary)

graphrag/index/operations/summarize_communities/typing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class CommunityReport(TypedDict):
3636
full_content_json: str
3737
rank: float
3838
level: int
39-
rank_explanation: str
39+
rating_explanation: str
4040
findings: list[Finding]
4141

4242

graphrag/index/update/communities.py

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,11 @@
55

66
import pandas as pd
77

8+
from graphrag.data_model.schemas import (
9+
COMMUNITIES_FINAL_COLUMNS,
10+
COMMUNITY_REPORTS_FINAL_COLUMNS,
11+
)
12+
813

914
def _update_and_merge_communities(
1015
old_communities: pd.DataFrame,
@@ -76,19 +81,7 @@ def _update_and_merge_communities(
7681

7782
merged_communities = merged_communities.loc[
7883
:,
79-
[
80-
"id",
81-
"human_readable_id",
82-
"community",
83-
"parent",
84-
"level",
85-
"title",
86-
"entity_ids",
87-
"relationship_ids",
88-
"text_unit_ids",
89-
"period",
90-
"size",
91-
],
84+
COMMUNITIES_FINAL_COLUMNS,
9285
]
9386
return merged_communities, community_id_mapping
9487

@@ -155,22 +148,4 @@ def _update_and_merge_community_reports(
155148
"community"
156149
]
157150

158-
return merged_community_reports.loc[
159-
:,
160-
[
161-
"id",
162-
"human_readable_id",
163-
"community",
164-
"parent",
165-
"level",
166-
"title",
167-
"summary",
168-
"full_content",
169-
"rank",
170-
"rank_explanation",
171-
"findings",
172-
"full_content_json",
173-
"period",
174-
"size",
175-
],
176-
]
151+
return merged_community_reports.loc[:, COMMUNITY_REPORTS_FINAL_COLUMNS]

graphrag/index/update/entities.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from graphrag.cache.pipeline_cache import PipelineCache
1313
from graphrag.callbacks.workflow_callbacks import WorkflowCallbacks
1414
from graphrag.config.models.graph_rag_config import GraphRagConfig
15+
from graphrag.data_model.schemas import ENTITIES_FINAL_COLUMNS
1516
from graphrag.index.operations.summarize_descriptions.graph_intelligence_strategy import (
1617
run_graph_intelligence as run_entity_summarization,
1718
)
@@ -79,21 +80,7 @@ def _group_and_resolve_entities(
7980
resolved: pd.DataFrame = pd.DataFrame(aggregated)
8081

8182
# Modify column order to keep consistency
82-
resolved = resolved.loc[
83-
:,
84-
[
85-
"id",
86-
"human_readable_id",
87-
"title",
88-
"type",
89-
"description",
90-
"text_unit_ids",
91-
"frequency",
92-
"degree",
93-
"x",
94-
"y",
95-
],
96-
]
83+
resolved = resolved.loc[:, ENTITIES_FINAL_COLUMNS]
9784

9885
return resolved, id_mapping
9986

0 commit comments

Comments
 (0)