Skip to content

Commit 2ddee65

Browse files
authored
Read/write files as binary utf-8 (#639)
1 parent 54f48d6 commit 2ddee65

17 files changed

+305
-277
lines changed

.github/workflows/python-ci.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
python-ci:
2222
strategy:
2323
matrix:
24-
python-version: ["3.10", "3.11", "3.12"]
24+
python-version: ["3.10", "3.11"] # add 3.12 once gensim supports it. TODO: watch this issue - https://github.com/piskvorky/gensim/issues/3510
2525
os: [ubuntu-latest, windows-latest]
2626
env:
2727
DEBUG: 1
@@ -79,7 +79,10 @@ jobs:
7979

8080
- name: Install dependencies
8181
shell: bash
82-
run: poetry self add setuptools && poetry run python -m pip install gensim && poetry install
82+
run: |
83+
poetry self add setuptools wheel
84+
poetry run python -m pip install gensim
85+
poetry install
8386
8487
- name: Check Semversioner
8588
run: |
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "use binary io processing for all file io operations"
4+
}

graphrag/config/models/claim_extraction_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ def resolved_strategy(self, root_dir: str) -> dict:
4343
"type": ExtractClaimsStrategyType.graph_intelligence,
4444
"llm": self.llm.model_dump(),
4545
**self.parallelization.model_dump(),
46-
"extraction_prompt": (Path(root_dir) / self.prompt).read_text()
46+
"extraction_prompt": (Path(root_dir) / self.prompt)
47+
.read_bytes()
48+
.decode(encoding="utf-8")
4749
if self.prompt
4850
else None,
4951
"claim_description": self.description,

graphrag/config/models/community_reports_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@ def resolved_strategy(self, root_dir) -> dict:
3838
"type": CreateCommunityReportsStrategyType.graph_intelligence,
3939
"llm": self.llm.model_dump(),
4040
**self.parallelization.model_dump(),
41-
"extraction_prompt": (Path(root_dir) / self.prompt).read_text()
41+
"extraction_prompt": (Path(root_dir) / self.prompt)
42+
.read_bytes()
43+
.decode(encoding="utf-8")
4244
if self.prompt
4345
else None,
4446
"max_report_length": self.max_length,

graphrag/config/models/entity_extraction_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@ def resolved_strategy(self, root_dir: str, encoding_model: str) -> dict:
3838
"type": ExtractEntityStrategyType.graph_intelligence,
3939
"llm": self.llm.model_dump(),
4040
**self.parallelization.model_dump(),
41-
"extraction_prompt": (Path(root_dir) / self.prompt).read_text()
41+
"extraction_prompt": (Path(root_dir) / self.prompt)
42+
.read_bytes()
43+
.decode(encoding="utf-8")
4244
if self.prompt
4345
else None,
4446
"max_gleanings": self.max_gleanings,

graphrag/config/models/summarize_descriptions_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ def resolved_strategy(self, root_dir: str) -> dict:
3434
"type": SummarizeStrategyType.graph_intelligence,
3535
"llm": self.llm.model_dump(),
3636
**self.parallelization.model_dump(),
37-
"summarize_prompt": (Path(root_dir) / self.prompt).read_text()
37+
"summarize_prompt": (Path(root_dir) / self.prompt)
38+
.read_bytes()
39+
.decode(encoding="utf-8")
3840
if self.prompt
3941
else None,
4042
"max_summary_length": self.max_length,

graphrag/index/cli.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -185,35 +185,41 @@ def _initialize_project_at(path: str, reporter: ProgressReporter) -> None:
185185

186186
dotenv = root / ".env"
187187
if not dotenv.exists():
188-
with settings_yaml.open("w") as file:
189-
file.write(INIT_YAML)
188+
with settings_yaml.open("wb") as file:
189+
file.write(INIT_YAML.encode(encoding="utf-8", errors="strict"))
190190

191-
with dotenv.open("w") as file:
192-
file.write(INIT_DOTENV)
191+
with dotenv.open("wb") as file:
192+
file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict"))
193193

194194
prompts_dir = root / "prompts"
195195
if not prompts_dir.exists():
196196
prompts_dir.mkdir(parents=True, exist_ok=True)
197197

198198
entity_extraction = prompts_dir / "entity_extraction.txt"
199199
if not entity_extraction.exists():
200-
with entity_extraction.open("w") as file:
201-
file.write(GRAPH_EXTRACTION_PROMPT)
200+
with entity_extraction.open("wb") as file:
201+
file.write(
202+
GRAPH_EXTRACTION_PROMPT.encode(encoding="utf-8", errors="strict")
203+
)
202204

203205
summarize_descriptions = prompts_dir / "summarize_descriptions.txt"
204206
if not summarize_descriptions.exists():
205-
with summarize_descriptions.open("w") as file:
206-
file.write(SUMMARIZE_PROMPT)
207+
with summarize_descriptions.open("wb") as file:
208+
file.write(SUMMARIZE_PROMPT.encode(encoding="utf-8", errors="strict"))
207209

208210
claim_extraction = prompts_dir / "claim_extraction.txt"
209211
if not claim_extraction.exists():
210-
with claim_extraction.open("w") as file:
211-
file.write(CLAIM_EXTRACTION_PROMPT)
212+
with claim_extraction.open("wb") as file:
213+
file.write(
214+
CLAIM_EXTRACTION_PROMPT.encode(encoding="utf-8", errors="strict")
215+
)
212216

213217
community_report = prompts_dir / "community_report.txt"
214218
if not community_report.exists():
215-
with community_report.open("w") as file:
216-
file.write(COMMUNITY_REPORT_PROMPT)
219+
with community_report.open("wb") as file:
220+
file.write(
221+
COMMUNITY_REPORT_PROMPT.encode(encoding="utf-8", errors="strict")
222+
)
217223

218224

219225
def _create_default_config(
@@ -267,18 +273,18 @@ def _read_config_parameters(root: str, config: str | None, reporter: ProgressRep
267273

268274
if settings_yaml.exists():
269275
reporter.success(f"Reading settings from {settings_yaml}")
270-
with settings_yaml.open("r") as file:
276+
with settings_yaml.open("rb") as file:
271277
import yaml
272278

273-
data = yaml.safe_load(file)
279+
data = yaml.safe_load(file.read().decode(encoding="utf-8", errors="strict"))
274280
return create_graphrag_config(data, root)
275281

276282
if settings_json.exists():
277283
reporter.success(f"Reading settings from {settings_json}")
278-
with settings_json.open("r") as file:
284+
with settings_json.open("rb") as file:
279285
import json
280286

281-
data = json.loads(file.read())
287+
data = json.loads(file.read().decode(encoding="utf-8", errors="strict"))
282288
return create_graphrag_config(data, root)
283289

284290
reporter.success("Reading settings from environment variables")

graphrag/index/load_pipeline_config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def load_pipeline_config(config_or_path: str | PipelineConfig) -> PipelineConfig
2626
read_dotenv(str(Path(config_or_path).parent))
2727

2828
if config_or_path.endswith(".json"):
29-
with Path(config_or_path).open(encoding="utf-8") as f:
30-
config = json.load(f)
29+
with Path(config_or_path).open("rb") as f:
30+
config = json.loads(f.read().decode(encoding="utf-8", errors="strict"))
3131
elif config_or_path.endswith((".yml", ".yaml")):
3232
config = _parse_yaml(config_or_path)
3333
else:
@@ -73,7 +73,7 @@ def handle_include(loader: yaml.Loader, node: yaml.Node):
7373
if filename.endswith((".yml", ".yaml")):
7474
return _parse_yaml(filename)
7575

76-
with Path(filename).open(encoding="utf-8") as f:
77-
return f.read()
76+
with Path(filename).open("rb") as f:
77+
return f.read().decode(encoding="utf-8", errors="strict")
7878

7979
return handle_include

graphrag/index/reporting/file_workflow_callbacks.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ class FileWorkflowCallbacks(NoopWorkflowCallbacks):
2121
def __init__(self, directory: str):
2222
"""Create a new file-based workflow reporter."""
2323
Path(directory).mkdir(parents=True, exist_ok=True)
24-
self._out_stream = open( # noqa SIM115
25-
Path(directory) / "logs.json", "a", encoding="utf-8"
24+
self._out_stream = open( # noqa: PTH123, SIM115
25+
Path(directory) / "logs.json", "a", encoding="utf-8", errors="strict"
2626
)
2727

2828
def on_error(

graphrag/index/storage/file_pipeline_storage.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,9 @@ async def set(self, key: str, value: Any, encoding: str | None = None) -> None:
114114
write_type = "wb" if is_bytes else "w"
115115
encoding = None if is_bytes else encoding or self._encoding
116116
async with aiofiles.open(
117-
join_path(self._root_dir, key), cast(Any, write_type), encoding=encoding
117+
join_path(self._root_dir, key),
118+
cast(Any, write_type),
119+
encoding=encoding,
118120
) as f:
119121
await f.write(value)
120122

0 commit comments

Comments
 (0)