Skip to content

Commit 2a1096b

Browse files
[Worktree] Persist optimization patches metadata (#690)
* save optimization patches metadata * typo * lsp: get previous optimizations * fix patch name in non-lsp mode * ⚡️ Speed up function `get_patches_metadata` by 45% in PR #690 (`worktree/persist-optimization-patches`) The optimized code achieves a **44% speedup** through two key optimizations: **1. Added `@lru_cache(maxsize=1)` to `get_patches_dir_for_project()`** - This caches the Path object construction, avoiding repeated calls to `get_git_project_id()` and `Path()` creation - The line profiler shows this function's total time dropped from 5.32ms to being completely eliminated from the hot path in `get_patches_metadata()` - Since `get_git_project_id()` was already cached but still being called repeatedly, this second-level caching eliminates that redundancy **2. Replaced `read_text()` + `json.loads()` with `open()` + `json.load()`** - Using `json.load()` with a file handle is more efficient than reading the entire file into memory first with `read_text()` then parsing it - This avoids the intermediate string creation and is particularly beneficial for larger JSON files - Added explicit UTF-8 encoding for consistency **Performance Impact by Test Type:** - **Basic cases** (small/missing files): 45-65% faster - benefits primarily from the caching optimization - **Edge cases** (malformed JSON): 38-47% faster - still benefits from both optimizations - **Large scale cases** (1000+ patches, large files): 39-52% faster - the file I/O optimization becomes more significant with larger JSON files The caching optimization provides the most consistent gains across all scenarios since it eliminates repeated expensive operations, while the file I/O optimization scales with file size. * fix: patch path * codeflash suggestions * split the worktree utils in a separate file --------- Co-authored-by: codeflash-ai[bot] <148906541+codeflash-ai[bot]@users.noreply.github.com>
1 parent a59b9ed commit 2a1096b

File tree

4 files changed

+233
-97
lines changed

4 files changed

+233
-97
lines changed

codeflash/code_utils/git_utils.py

Lines changed: 1 addition & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,13 @@
99
from functools import cache
1010
from io import StringIO
1111
from pathlib import Path
12-
from typing import TYPE_CHECKING, Optional
12+
from typing import TYPE_CHECKING
1313

1414
import git
1515
from rich.prompt import Confirm
1616
from unidiff import PatchSet
1717

1818
from codeflash.cli_cmds.console import logger
19-
from codeflash.code_utils.compat import codeflash_cache_dir
2019
from codeflash.code_utils.config_consts import N_CANDIDATES
2120

2221
if TYPE_CHECKING:
@@ -199,84 +198,3 @@ def get_last_commit_author_if_pr_exists(repo: Repo | None = None) -> str | None:
199198
return None
200199
else:
201200
return last_commit.author.name
202-
203-
204-
worktree_dirs = codeflash_cache_dir / "worktrees"
205-
patches_dir = codeflash_cache_dir / "patches"
206-
207-
208-
def create_worktree_snapshot_commit(worktree_dir: Path, commit_message: str) -> None:
209-
repository = git.Repo(worktree_dir, search_parent_directories=True)
210-
repository.git.add(".")
211-
repository.git.commit("-m", commit_message, "--no-verify")
212-
213-
214-
def create_detached_worktree(module_root: Path) -> Optional[Path]:
215-
if not check_running_in_git_repo(module_root):
216-
logger.warning("Module is not in a git repository. Skipping worktree creation.")
217-
return None
218-
git_root = git_root_dir()
219-
current_time_str = time.strftime("%Y%m%d-%H%M%S")
220-
worktree_dir = worktree_dirs / f"{git_root.name}-{current_time_str}"
221-
222-
repository = git.Repo(git_root, search_parent_directories=True)
223-
224-
repository.git.worktree("add", "-d", str(worktree_dir))
225-
226-
# Get uncommitted diff from the original repo
227-
repository.git.add("-N", ".") # add the index for untracked files to be included in the diff
228-
exclude_binary_files = [":!*.pyc", ":!*.pyo", ":!*.pyd", ":!*.so", ":!*.dll", ":!*.whl", ":!*.egg", ":!*.egg-info", ":!*.pyz", ":!*.pkl", ":!*.pickle", ":!*.joblib", ":!*.npy", ":!*.npz", ":!*.h5", ":!*.hdf5", ":!*.pth", ":!*.pt", ":!*.pb", ":!*.onnx", ":!*.db", ":!*.sqlite", ":!*.sqlite3", ":!*.feather", ":!*.parquet", ":!*.jpg", ":!*.jpeg", ":!*.png", ":!*.gif", ":!*.bmp", ":!*.tiff", ":!*.webp", ":!*.wav", ":!*.mp3", ":!*.ogg", ":!*.flac", ":!*.mp4", ":!*.avi", ":!*.mov", ":!*.mkv", ":!*.pdf", ":!*.doc", ":!*.docx", ":!*.xls", ":!*.xlsx", ":!*.ppt", ":!*.pptx", ":!*.zip", ":!*.rar", ":!*.tar", ":!*.tar.gz", ":!*.tgz", ":!*.bz2", ":!*.xz"] # fmt: off
229-
uni_diff_text = repository.git.diff(
230-
None, "HEAD", "--", *exclude_binary_files, ignore_blank_lines=True, ignore_space_at_eol=True
231-
)
232-
233-
if not uni_diff_text.strip():
234-
logger.info("No uncommitted changes to copy to worktree.")
235-
return worktree_dir
236-
237-
# Write the diff to a temporary file
238-
with tempfile.NamedTemporaryFile(mode="w", suffix=".codeflash.patch", delete=False) as tmp_patch_file:
239-
tmp_patch_file.write(uni_diff_text + "\n") # the new line here is a must otherwise the last hunk won't be valid
240-
tmp_patch_file.flush()
241-
242-
patch_path = Path(tmp_patch_file.name).resolve()
243-
244-
# Apply the patch inside the worktree
245-
try:
246-
subprocess.run(
247-
["git", "apply", "--ignore-space-change", "--ignore-whitespace", "--whitespace=nowarn", patch_path],
248-
cwd=worktree_dir,
249-
check=True,
250-
)
251-
create_worktree_snapshot_commit(worktree_dir, "Initial Snapshot")
252-
except subprocess.CalledProcessError as e:
253-
logger.error(f"Failed to apply patch to worktree: {e}")
254-
255-
return worktree_dir
256-
257-
258-
def remove_worktree(worktree_dir: Path) -> None:
259-
try:
260-
repository = git.Repo(worktree_dir, search_parent_directories=True)
261-
repository.git.worktree("remove", "--force", worktree_dir)
262-
except Exception:
263-
logger.exception(f"Failed to remove worktree: {worktree_dir}")
264-
265-
266-
def create_diff_patch_from_worktree(worktree_dir: Path, files: list[str], fto_name: str) -> Path:
267-
repository = git.Repo(worktree_dir, search_parent_directories=True)
268-
uni_diff_text = repository.git.diff(None, "HEAD", *files, ignore_blank_lines=True, ignore_space_at_eol=True)
269-
270-
if not uni_diff_text:
271-
logger.warning("No changes found in worktree.")
272-
return None
273-
274-
if not uni_diff_text.endswith("\n"):
275-
uni_diff_text += "\n"
276-
277-
# write to patches_dir
278-
patches_dir.mkdir(parents=True, exist_ok=True)
279-
patch_path = patches_dir / f"{worktree_dir.name}.{fto_name}.patch"
280-
with patch_path.open("w", encoding="utf8") as f:
281-
f.write(uni_diff_text)
282-
return patch_path
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
from __future__ import annotations
2+
3+
import json
4+
import subprocess
5+
import tempfile
6+
import time
7+
from functools import lru_cache
8+
from pathlib import Path
9+
from typing import TYPE_CHECKING, Optional
10+
11+
import git
12+
from filelock import FileLock
13+
14+
from codeflash.cli_cmds.console import logger
15+
from codeflash.code_utils.compat import codeflash_cache_dir
16+
from codeflash.code_utils.git_utils import check_running_in_git_repo, git_root_dir
17+
18+
if TYPE_CHECKING:
19+
from typing import Any
20+
21+
from git import Repo
22+
23+
24+
worktree_dirs = codeflash_cache_dir / "worktrees"
25+
patches_dir = codeflash_cache_dir / "patches"
26+
27+
if TYPE_CHECKING:
28+
from git import Repo
29+
30+
31+
@lru_cache(maxsize=1)
32+
def get_git_project_id() -> str:
33+
"""Return the first commit sha of the repo."""
34+
repo: Repo = git.Repo(search_parent_directories=True)
35+
root_commits = list(repo.iter_commits(rev="HEAD", max_parents=0))
36+
return root_commits[0].hexsha
37+
38+
39+
def create_worktree_snapshot_commit(worktree_dir: Path, commit_message: str) -> None:
40+
repository = git.Repo(worktree_dir, search_parent_directories=True)
41+
repository.git.add(".")
42+
repository.git.commit("-m", commit_message, "--no-verify")
43+
44+
45+
def create_detached_worktree(module_root: Path) -> Optional[Path]:
46+
if not check_running_in_git_repo(module_root):
47+
logger.warning("Module is not in a git repository. Skipping worktree creation.")
48+
return None
49+
git_root = git_root_dir()
50+
current_time_str = time.strftime("%Y%m%d-%H%M%S")
51+
worktree_dir = worktree_dirs / f"{git_root.name}-{current_time_str}"
52+
53+
repository = git.Repo(git_root, search_parent_directories=True)
54+
55+
repository.git.worktree("add", "-d", str(worktree_dir))
56+
57+
# Get uncommitted diff from the original repo
58+
repository.git.add("-N", ".") # add the index for untracked files to be included in the diff
59+
exclude_binary_files = [":!*.pyc", ":!*.pyo", ":!*.pyd", ":!*.so", ":!*.dll", ":!*.whl", ":!*.egg", ":!*.egg-info", ":!*.pyz", ":!*.pkl", ":!*.pickle", ":!*.joblib", ":!*.npy", ":!*.npz", ":!*.h5", ":!*.hdf5", ":!*.pth", ":!*.pt", ":!*.pb", ":!*.onnx", ":!*.db", ":!*.sqlite", ":!*.sqlite3", ":!*.feather", ":!*.parquet", ":!*.jpg", ":!*.jpeg", ":!*.png", ":!*.gif", ":!*.bmp", ":!*.tiff", ":!*.webp", ":!*.wav", ":!*.mp3", ":!*.ogg", ":!*.flac", ":!*.mp4", ":!*.avi", ":!*.mov", ":!*.mkv", ":!*.pdf", ":!*.doc", ":!*.docx", ":!*.xls", ":!*.xlsx", ":!*.ppt", ":!*.pptx", ":!*.zip", ":!*.rar", ":!*.tar", ":!*.tar.gz", ":!*.tgz", ":!*.bz2", ":!*.xz"] # fmt: off
60+
uni_diff_text = repository.git.diff(
61+
None, "HEAD", "--", *exclude_binary_files, ignore_blank_lines=True, ignore_space_at_eol=True
62+
)
63+
64+
if not uni_diff_text.strip():
65+
logger.info("No uncommitted changes to copy to worktree.")
66+
return worktree_dir
67+
68+
# Write the diff to a temporary file
69+
with tempfile.NamedTemporaryFile(mode="w", suffix=".codeflash.patch", delete=False) as tmp_patch_file:
70+
tmp_patch_file.write(uni_diff_text + "\n") # the new line here is a must otherwise the last hunk won't be valid
71+
tmp_patch_file.flush()
72+
73+
patch_path = Path(tmp_patch_file.name).resolve()
74+
75+
# Apply the patch inside the worktree
76+
try:
77+
subprocess.run(
78+
["git", "apply", "--ignore-space-change", "--ignore-whitespace", "--whitespace=nowarn", patch_path],
79+
cwd=worktree_dir,
80+
check=True,
81+
)
82+
create_worktree_snapshot_commit(worktree_dir, "Initial Snapshot")
83+
except subprocess.CalledProcessError as e:
84+
logger.error(f"Failed to apply patch to worktree: {e}")
85+
86+
return worktree_dir
87+
88+
89+
def remove_worktree(worktree_dir: Path) -> None:
90+
try:
91+
repository = git.Repo(worktree_dir, search_parent_directories=True)
92+
repository.git.worktree("remove", "--force", worktree_dir)
93+
except Exception:
94+
logger.exception(f"Failed to remove worktree: {worktree_dir}")
95+
96+
97+
@lru_cache(maxsize=1)
98+
def get_patches_dir_for_project() -> Path:
99+
project_id = get_git_project_id() or ""
100+
return Path(patches_dir / project_id)
101+
102+
103+
def get_patches_metadata() -> dict[str, Any]:
104+
project_patches_dir = get_patches_dir_for_project()
105+
meta_file = project_patches_dir / "metadata.json"
106+
if meta_file.exists():
107+
with meta_file.open("r", encoding="utf-8") as f:
108+
return json.load(f)
109+
return {"id": get_git_project_id() or "", "patches": []}
110+
111+
112+
def save_patches_metadata(patch_metadata: dict) -> dict:
113+
project_patches_dir = get_patches_dir_for_project()
114+
meta_file = project_patches_dir / "metadata.json"
115+
lock_file = project_patches_dir / "metadata.json.lock"
116+
117+
# we are not supporting multiple concurrent optimizations within the same process, but keep that in case we decide to do so in the future.
118+
with FileLock(lock_file, timeout=10):
119+
metadata = get_patches_metadata()
120+
121+
patch_metadata["id"] = time.strftime("%Y%m%d-%H%M%S")
122+
metadata["patches"].append(patch_metadata)
123+
124+
meta_file.write_text(json.dumps(metadata, indent=2))
125+
126+
return patch_metadata
127+
128+
129+
def overwrite_patch_metadata(patches: list[dict]) -> bool:
130+
project_patches_dir = get_patches_dir_for_project()
131+
meta_file = project_patches_dir / "metadata.json"
132+
lock_file = project_patches_dir / "metadata.json.lock"
133+
134+
with FileLock(lock_file, timeout=10):
135+
metadata = get_patches_metadata()
136+
metadata["patches"] = patches
137+
meta_file.write_text(json.dumps(metadata, indent=2))
138+
return True
139+
140+
141+
def create_diff_patch_from_worktree(
142+
worktree_dir: Path,
143+
files: list[str],
144+
fto_name: Optional[str] = None,
145+
metadata_input: Optional[dict[str, Any]] = None,
146+
) -> dict[str, Any]:
147+
repository = git.Repo(worktree_dir, search_parent_directories=True)
148+
uni_diff_text = repository.git.diff(None, "HEAD", *files, ignore_blank_lines=True, ignore_space_at_eol=True)
149+
150+
if not uni_diff_text:
151+
logger.warning("No changes found in worktree.")
152+
return {}
153+
154+
if not uni_diff_text.endswith("\n"):
155+
uni_diff_text += "\n"
156+
157+
project_patches_dir = get_patches_dir_for_project()
158+
project_patches_dir.mkdir(parents=True, exist_ok=True)
159+
160+
final_function_name = fto_name or metadata_input.get("fto_name", "unknown")
161+
patch_path = project_patches_dir / f"{worktree_dir.name}.{final_function_name}.patch"
162+
with patch_path.open("w", encoding="utf8") as f:
163+
f.write(uni_diff_text)
164+
165+
final_metadata = {"patch_path": str(patch_path)}
166+
if metadata_input:
167+
final_metadata.update(metadata_input)
168+
final_metadata = save_patches_metadata(final_metadata)
169+
170+
return final_metadata

codeflash/lsp/beta.py

Lines changed: 54 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@
1111

1212
from codeflash.api.cfapi import get_codeflash_api_key, get_user_id
1313
from codeflash.cli_cmds.cli import process_pyproject_config
14-
from codeflash.code_utils.git_utils import create_diff_patch_from_worktree
14+
from codeflash.code_utils.git_worktree_utils import (
15+
create_diff_patch_from_worktree,
16+
get_patches_metadata,
17+
overwrite_patch_metadata,
18+
)
1519
from codeflash.code_utils.shell_utils import save_api_key_to_rc
1620
from codeflash.discovery.functions_to_optimize import (
1721
filter_functions,
@@ -45,6 +49,10 @@ class ProvideApiKeyParams:
4549
api_key: str
4650

4751

52+
@dataclass
53+
class OnPatchAppliedParams:
54+
patch_id: str
55+
4856
@dataclass
4957
class OptimizableFunctionsInCommitParams:
5058
commit_hash: str
@@ -245,6 +253,34 @@ def provide_api_key(server: CodeflashLanguageServer, params: ProvideApiKeyParams
245253
return {"status": "error", "message": "something went wrong while saving the api key"}
246254

247255

256+
@server.feature("retrieveSuccessfulOptimizations")
257+
def retrieve_successful_optimizations(_server: CodeflashLanguageServer, _params: any) -> dict[str, str]:
258+
metadata = get_patches_metadata()
259+
return {"status": "success", "patches": metadata["patches"]}
260+
261+
262+
@server.feature("onPatchApplied")
263+
def on_patch_applied(_server: CodeflashLanguageServer, params: OnPatchAppliedParams) -> dict[str, str]:
264+
# first remove the patch from the metadata
265+
metadata = get_patches_metadata()
266+
267+
deleted_patch_file = None
268+
new_patches = []
269+
for patch in metadata["patches"]:
270+
if patch["id"] == params.patch_id:
271+
deleted_patch_file = patch["patch_path"]
272+
continue
273+
new_patches.append(patch)
274+
275+
# then remove the patch file
276+
if deleted_patch_file:
277+
overwrite_patch_metadata(new_patches)
278+
patch_path = Path(deleted_patch_file)
279+
patch_path.unlink(missing_ok=True)
280+
return {"status": "success"}
281+
return {"status": "error", "message": "Patch not found"}
282+
283+
248284
@server.feature("performFunctionOptimization")
249285
@server.thread()
250286
def perform_function_optimization( # noqa: PLR0911
@@ -346,24 +382,34 @@ def perform_function_optimization( # noqa: PLR0911
346382

347383
# generate a patch for the optimization
348384
relative_file_paths = [code_string.file_path for code_string in code_context.read_writable_code.code_strings]
349-
patch_file = create_diff_patch_from_worktree(
385+
386+
speedup = original_code_baseline.runtime / best_optimization.runtime
387+
388+
# get the original file path in the actual project (not in the worktree)
389+
original_args, _ = server.optimizer.original_args_and_test_cfg
390+
relative_file_path = current_function.file_path.relative_to(server.optimizer.current_worktree)
391+
original_file_path = Path(original_args.project_root / relative_file_path).resolve()
392+
393+
metadata = create_diff_patch_from_worktree(
350394
server.optimizer.current_worktree,
351395
relative_file_paths,
352-
server.optimizer.current_function_optimizer.function_to_optimize.qualified_name,
396+
metadata_input={
397+
"fto_name": function_to_optimize_qualified_name,
398+
"explanation": best_optimization.explanation_v2,
399+
"file_path": str(original_file_path),
400+
"speedup": speedup,
401+
},
353402
)
354403

355-
optimized_source = best_optimization.candidate.source_code.markdown
356-
speedup = original_code_baseline.runtime / best_optimization.runtime
357-
358404
server.show_message_log(f"Optimization completed for {params.functionName} with {speedup:.2f}x speedup", "Info")
359405

360406
return {
361407
"functionName": params.functionName,
362408
"status": "success",
363409
"message": "Optimization completed successfully",
364410
"extra": f"Speedup: {speedup:.2f}x faster",
365-
"optimization": optimized_source,
366-
"patch_file": str(patch_file),
411+
"patch_file": metadata["patch_path"],
412+
"patch_id": metadata["id"],
367413
"explanation": best_optimization.explanation_v2,
368414
}
369415
finally:

0 commit comments

Comments
 (0)