Read/write files as binary utf-8 (#639)

jgbradley1 · web-flow · commit 2ddee65c29b2 · 2024-07-24T13:28:22.000-04:00
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
@@ -21,7 +21,7 @@ jobs:
   python-ci:
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11"] # add 3.12 once gensim supports it. TODO: watch this issue - https://github.com/piskvorky/gensim/issues/3510
         os: [ubuntu-latest, windows-latest]
     env:
       DEBUG: 1
@@ -79,7 +79,10 @@ jobs:
 
       - name: Install dependencies
         shell: bash
-        run: poetry self add setuptools && poetry run python -m pip install gensim && poetry install
+        run: |
+          poetry self add setuptools wheel
+          poetry run python -m pip install gensim
+          poetry install
 
       - name: Check Semversioner
         run: |
diff --git a/.semversioner/next-release/patch-20240721063703879643.json b/.semversioner/next-release/patch-20240721063703879643.json
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "use binary io processing for all file io operations"
+}
diff --git a/graphrag/config/models/claim_extraction_config.py b/graphrag/config/models/claim_extraction_config.py
@@ -43,7 +43,9 @@ def resolved_strategy(self, root_dir: str) -> dict:
             "type": ExtractClaimsStrategyType.graph_intelligence,
             "llm": self.llm.model_dump(),
             **self.parallelization.model_dump(),
-            "extraction_prompt": (Path(root_dir) / self.prompt).read_text()
+            "extraction_prompt": (Path(root_dir) / self.prompt)
+            .read_bytes()
+            .decode(encoding="utf-8")
             if self.prompt
             else None,
             "claim_description": self.description,
diff --git a/graphrag/config/models/community_reports_config.py b/graphrag/config/models/community_reports_config.py
@@ -38,7 +38,9 @@ def resolved_strategy(self, root_dir) -> dict:
             "type": CreateCommunityReportsStrategyType.graph_intelligence,
             "llm": self.llm.model_dump(),
             **self.parallelization.model_dump(),
-            "extraction_prompt": (Path(root_dir) / self.prompt).read_text()
+            "extraction_prompt": (Path(root_dir) / self.prompt)
+            .read_bytes()
+            .decode(encoding="utf-8")
             if self.prompt
             else None,
             "max_report_length": self.max_length,
diff --git a/graphrag/config/models/entity_extraction_config.py b/graphrag/config/models/entity_extraction_config.py
@@ -38,7 +38,9 @@ def resolved_strategy(self, root_dir: str, encoding_model: str) -> dict:
             "type": ExtractEntityStrategyType.graph_intelligence,
             "llm": self.llm.model_dump(),
             **self.parallelization.model_dump(),
-            "extraction_prompt": (Path(root_dir) / self.prompt).read_text()
+            "extraction_prompt": (Path(root_dir) / self.prompt)
+            .read_bytes()
+            .decode(encoding="utf-8")
             if self.prompt
             else None,
             "max_gleanings": self.max_gleanings,
diff --git a/graphrag/config/models/summarize_descriptions_config.py b/graphrag/config/models/summarize_descriptions_config.py
@@ -34,7 +34,9 @@ def resolved_strategy(self, root_dir: str) -> dict:
             "type": SummarizeStrategyType.graph_intelligence,
             "llm": self.llm.model_dump(),
             **self.parallelization.model_dump(),
-            "summarize_prompt": (Path(root_dir) / self.prompt).read_text()
+            "summarize_prompt": (Path(root_dir) / self.prompt)
+            .read_bytes()
+            .decode(encoding="utf-8")
             if self.prompt
             else None,
             "max_summary_length": self.max_length,
diff --git a/graphrag/index/cli.py b/graphrag/index/cli.py
@@ -185,35 +185,41 @@ def _initialize_project_at(path: str, reporter: ProgressReporter) -> None:
 
     dotenv = root / ".env"
     if not dotenv.exists():
-        with settings_yaml.open("w") as file:
-            file.write(INIT_YAML)
+        with settings_yaml.open("wb") as file:
+            file.write(INIT_YAML.encode(encoding="utf-8", errors="strict"))
 
-    with dotenv.open("w") as file:
-        file.write(INIT_DOTENV)
+    with dotenv.open("wb") as file:
+        file.write(INIT_DOTENV.encode(encoding="utf-8", errors="strict"))
 
     prompts_dir = root / "prompts"
     if not prompts_dir.exists():
         prompts_dir.mkdir(parents=True, exist_ok=True)
 
     entity_extraction = prompts_dir / "entity_extraction.txt"
     if not entity_extraction.exists():
-        with entity_extraction.open("w") as file:
-            file.write(GRAPH_EXTRACTION_PROMPT)
+        with entity_extraction.open("wb") as file:
+            file.write(
+                GRAPH_EXTRACTION_PROMPT.encode(encoding="utf-8", errors="strict")
+            )
 
     summarize_descriptions = prompts_dir / "summarize_descriptions.txt"
     if not summarize_descriptions.exists():
-        with summarize_descriptions.open("w") as file:
-            file.write(SUMMARIZE_PROMPT)
+        with summarize_descriptions.open("wb") as file:
+            file.write(SUMMARIZE_PROMPT.encode(encoding="utf-8", errors="strict"))
 
     claim_extraction = prompts_dir / "claim_extraction.txt"
     if not claim_extraction.exists():
-        with claim_extraction.open("w") as file:
-            file.write(CLAIM_EXTRACTION_PROMPT)
+        with claim_extraction.open("wb") as file:
+            file.write(
+                CLAIM_EXTRACTION_PROMPT.encode(encoding="utf-8", errors="strict")
+            )
 
     community_report = prompts_dir / "community_report.txt"
     if not community_report.exists():
-        with community_report.open("w") as file:
-            file.write(COMMUNITY_REPORT_PROMPT)
+        with community_report.open("wb") as file:
+            file.write(
+                COMMUNITY_REPORT_PROMPT.encode(encoding="utf-8", errors="strict")
+            )
 
 
 def _create_default_config(
@@ -267,18 +273,18 @@ def _read_config_parameters(root: str, config: str | None, reporter: ProgressRep
 
     if settings_yaml.exists():
         reporter.success(f"Reading settings from {settings_yaml}")
-        with settings_yaml.open("r") as file:
+        with settings_yaml.open("rb") as file:
             import yaml
 
-            data = yaml.safe_load(file)
+            data = yaml.safe_load(file.read().decode(encoding="utf-8", errors="strict"))
             return create_graphrag_config(data, root)
 
     if settings_json.exists():
         reporter.success(f"Reading settings from {settings_json}")
-        with settings_json.open("r") as file:
+        with settings_json.open("rb") as file:
             import json
 
-            data = json.loads(file.read())
+            data = json.loads(file.read().decode(encoding="utf-8", errors="strict"))
             return create_graphrag_config(data, root)
 
     reporter.success("Reading settings from environment variables")
diff --git a/graphrag/index/load_pipeline_config.py b/graphrag/index/load_pipeline_config.py
@@ -26,8 +26,8 @@ def load_pipeline_config(config_or_path: str | PipelineConfig) -> PipelineConfig
         read_dotenv(str(Path(config_or_path).parent))
 
         if config_or_path.endswith(".json"):
-            with Path(config_or_path).open(encoding="utf-8") as f:
-                config = json.load(f)
+            with Path(config_or_path).open("rb") as f:
+                config = json.loads(f.read().decode(encoding="utf-8", errors="strict"))
         elif config_or_path.endswith((".yml", ".yaml")):
             config = _parse_yaml(config_or_path)
         else:
@@ -73,7 +73,7 @@ def handle_include(loader: yaml.Loader, node: yaml.Node):
         if filename.endswith((".yml", ".yaml")):
             return _parse_yaml(filename)
 
-        with Path(filename).open(encoding="utf-8") as f:
-            return f.read()
+        with Path(filename).open("rb") as f:
+            return f.read().decode(encoding="utf-8", errors="strict")
 
     return handle_include
diff --git a/graphrag/index/reporting/file_workflow_callbacks.py b/graphrag/index/reporting/file_workflow_callbacks.py
@@ -21,8 +21,8 @@ class FileWorkflowCallbacks(NoopWorkflowCallbacks):
     def __init__(self, directory: str):
         """Create a new file-based workflow reporter."""
         Path(directory).mkdir(parents=True, exist_ok=True)
-        self._out_stream = open(  # noqa SIM115
-            Path(directory) / "logs.json", "a", encoding="utf-8"
+        self._out_stream = open(  # noqa: PTH123, SIM115
+            Path(directory) / "logs.json", "a", encoding="utf-8", errors="strict"
         )
 
     def on_error(
diff --git a/graphrag/index/storage/file_pipeline_storage.py b/graphrag/index/storage/file_pipeline_storage.py
@@ -114,7 +114,9 @@ async def set(self, key: str, value: Any, encoding: str | None = None) -> None:
         write_type = "wb" if is_bytes else "w"
         encoding = None if is_bytes else encoding or self._encoding
         async with aiofiles.open(
-            join_path(self._root_dir, key), cast(Any, write_type), encoding=encoding
+            join_path(self._root_dir, key),
+            cast(Any, write_type),
+            encoding=encoding,
         ) as f:
             await f.write(value)
 
diff --git a/graphrag/prompt_tune/generator/community_report_summarization.py b/graphrag/prompt_tune/generator/community_report_summarization.py
@@ -42,7 +42,7 @@ def create_community_summarization_prompt(
 
         output_path = output_path / COMMUNITY_SUMMARIZATION_FILENAME
         # Write file to output path
-        with output_path.open("w") as file:
-            file.write(prompt)
+        with output_path.open("wb") as file:
+            file.write(prompt.encode(encoding="utf-8", errors="strict"))
 
     return prompt
diff --git a/graphrag/prompt_tune/generator/entity_extraction_prompt.py b/graphrag/prompt_tune/generator/entity_extraction_prompt.py
@@ -99,7 +99,7 @@ def create_entity_extraction_prompt(
 
         output_path = output_path / ENTITY_EXTRACTION_FILENAME
         # Write file to output path
-        with output_path.open("w") as file:
-            file.write(prompt)
+        with output_path.open("wb") as file:
+            file.write(prompt.encode(encoding="utf-8", errors="strict"))
 
     return prompt
diff --git a/graphrag/prompt_tune/generator/entity_summarization_prompt.py b/graphrag/prompt_tune/generator/entity_summarization_prompt.py
@@ -30,7 +30,7 @@ def create_entity_summarization_prompt(
 
         output_path = output_path / ENTITY_SUMMARIZATION_FILENAME
         # Write file to output path
-        with output_path.open("w") as file:
-            file.write(prompt)
+        with output_path.open("wb") as file:
+            file.write(prompt.encode(encoding="utf-8", errors="strict"))
 
     return prompt
diff --git a/graphrag/prompt_tune/loader/config.py b/graphrag/prompt_tune/loader/config.py
@@ -25,18 +25,18 @@ def read_config_parameters(root: str, reporter: ProgressReporter):
 
     if settings_yaml.exists():
         reporter.info(f"Reading settings from {settings_yaml}")
-        with settings_yaml.open("r") as file:
+        with settings_yaml.open("rb") as file:
             import yaml
 
-            data = yaml.safe_load(file)
+            data = yaml.safe_load(file.read().decode(encoding="utf-8", errors="strict"))
             return create_graphrag_config(data, root)
 
     if settings_json.exists():
         reporter.info(f"Reading settings from {settings_json}")
-        with settings_json.open("r") as file:
+        with settings_json.open("rb") as file:
             import json
 
-            data = json.loads(file.read())
+            data = json.loads(file.read().decode(encoding="utf-8", errors="strict"))
             return create_graphrag_config(data, root)
 
     reporter.info("Reading settings from environment variables")
diff --git a/graphrag/query/cli.py b/graphrag/query/cli.py
@@ -194,18 +194,20 @@ def _read_config_parameters(root: str):
 
     if settings_yaml.exists():
         reporter.info(f"Reading settings from {settings_yaml}")
-        with settings_yaml.open("r") as file:
+        with settings_yaml.open(
+            "rb",
+        ) as file:
             import yaml
 
-            data = yaml.safe_load(file)
+            data = yaml.safe_load(file.read().decode(encoding="utf-8", errors="strict"))
             return create_graphrag_config(data, root)
 
     if settings_json.exists():
         reporter.info(f"Reading settings from {settings_json}")
-        with settings_json.open("r") as file:
+        with settings_json.open("rb") as file:
             import json
 
-            data = json.loads(file.read())
+            data = json.loads(file.read().decode(encoding="utf-8", errors="strict"))
             return create_graphrag_config(data, root)
 
     reporter.info("Reading settings from environment variables")
diff --git a/poetry.lock b/poetry.lock
diff --git a/tests/smoke/test_fixtures.py b/tests/smoke/test_fixtures.py

-Original file line number
+Diff line change
@@ -0,0 +1,4 @@
 +{
 +  "type": "patch",
 +  "description": "use binary io processing for all file io operations"
 +}