Skip to content

Commit a3faabe

Browse files
committed
fix: handle non-UTF-8 file encodings in git operations
- Remove the text=True parameter from subprocess.run() calls to get raw bytes
- Add explicit UTF-8 decoding with error replacement for better encoding handling
- Fix UnicodeDecodeError when processing files with non-standard encodings
- Update all git subprocess operations to handle encoding errors gracefully

This fixes the issue where CommitLoom would crash with "'utf-8' codec can't decode" errors when encountering files with different character encodings (e.g., Lua files with special characters, legacy files encoded as ISO-8859-1, etc.).
1 parent c7b0967 commit a3faabe

File tree

1 file changed

+57
-38
lines changed

1 file changed

+57
-38
lines changed

commitloom/core/git.py

Lines changed: 57 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,12 @@ def should_ignore_file(path: str) -> bool:
5353
def _handle_git_output(result: subprocess.CompletedProcess, context: str = "") -> None:
5454
"""Handle git command output and log messages."""
5555
if result.stderr:
56-
if result.stderr.startswith("warning:"):
57-
logger.warning("Git warning%s: %s", f" {context}" if context else "", result.stderr)
56+
# Handle both bytes and string
57+
stderr = result.stderr if isinstance(result.stderr, str) else result.stderr.decode('utf-8', errors='replace')
58+
if stderr.startswith("warning:"):
59+
logger.warning("Git warning%s: %s", f" {context}" if context else "", stderr)
5860
else:
59-
logger.info("Git message%s: %s", f" {context}" if context else "", result.stderr)
61+
logger.info("Git message%s: %s", f" {context}" if context else "", stderr)
6062

6163
@staticmethod
6264
def _is_binary_file(path: str) -> tuple[bool, int | None, str | None]:
@@ -70,18 +72,18 @@ def _is_binary_file(path: str) -> tuple[bool, int | None, str | None]:
7072
size = os.path.getsize(path)
7173

7274
# Get file hash
73-
result = subprocess.run(["git", "hash-object", path], capture_output=True, text=True, check=True)
74-
file_hash = result.stdout.strip()
75+
result = subprocess.run(["git", "hash-object", path], capture_output=True, check=True)
76+
file_hash = result.stdout.decode('utf-8', errors='replace').strip()
7577

7678
# Check if file is binary using git's internal mechanism
7779
result = subprocess.run(
7880
["git", "diff", "--numstat", "--cached", path],
7981
capture_output=True,
80-
text=True,
8182
check=True,
8283
)
8384
# Binary files show up as "-" in numstat output
84-
is_binary = "-\t-\t" in result.stdout
85+
stdout = result.stdout.decode('utf-8', errors='replace')
86+
is_binary = "-\t-\t" in stdout
8587

8688
return is_binary, size if is_binary else None, file_hash if is_binary else None
8789
except (subprocess.CalledProcessError, OSError):
@@ -91,10 +93,10 @@ def _is_binary_file(path: str) -> tuple[bool, int | None, str | None]:
9193
def reset_staged_changes() -> None:
9294
"""Reset all staged changes."""
9395
try:
94-
result = subprocess.run(["git", "reset"], capture_output=True, text=True, check=True)
96+
result = subprocess.run(["git", "reset"], capture_output=True, check=True)
9597
GitOperations._handle_git_output(result)
9698
except subprocess.CalledProcessError as e:
97-
error_msg = e.stderr if e.stderr else str(e)
99+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
98100
raise GitError(f"Failed to reset staged changes: {error_msg}")
99101

100102
@staticmethod
@@ -109,20 +111,20 @@ def stage_files(files: list[str]) -> None:
109111
result = subprocess.run(
110112
["git", "add", "--", file],
111113
capture_output=True,
112-
text=True,
113114
check=True,
114115
)
115116
if result.stderr:
116-
if result.stderr.startswith("warning:"):
117-
logger.warning("Git warning while staging %s: %s", file, result.stderr)
117+
stderr = result.stderr.decode('utf-8', errors='replace')
118+
if stderr.startswith("warning:"):
119+
logger.warning("Git warning while staging %s: %s", file, stderr)
118120
else:
119-
logger.info("Git message while staging %s: %s", file, result.stderr)
121+
logger.info("Git message while staging %s: %s", file, stderr)
120122
except subprocess.CalledProcessError as file_error:
121123
# Log the error but continue with other files
122-
error_msg = file_error.stderr or str(file_error)
124+
error_msg = file_error.stderr.decode('utf-8', errors='replace') if file_error.stderr else str(file_error)
123125
logger.warning("Failed to stage file %s: %s", file, error_msg)
124126
except subprocess.CalledProcessError as e:
125-
error_msg = e.stderr if e.stderr else str(e)
127+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
126128
raise GitError(f"Failed to stage files: {error_msg}")
127129

128130
@staticmethod
@@ -131,11 +133,17 @@ def get_staged_files() -> list[GitFile]:
131133
try:
132134
# Get status in porcelain format for both staged and unstaged changes
133135
result = subprocess.run(
134-
["git", "status", "--porcelain"], capture_output=True, text=True, check=True
136+
["git", "status", "--porcelain"], capture_output=True, check=True
135137
)
136138

139+
# Decode with error handling for non-UTF-8 filenames
140+
try:
141+
stdout = result.stdout.decode('utf-8')
142+
except UnicodeDecodeError:
143+
stdout = result.stdout.decode('utf-8', errors='replace')
144+
137145
files = []
138-
for line in result.stdout.splitlines():
146+
for line in stdout.splitlines():
139147
if not line.strip():
140148
continue
141149

@@ -176,19 +184,24 @@ def get_staged_files() -> list[GitFile]:
176184
return files
177185

178186
except subprocess.CalledProcessError as e:
179-
error_msg = e.stderr if e.stderr else str(e)
187+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
180188
raise GitError(f"Failed to get staged files: {error_msg}")
181189

182190
@staticmethod
183191
def get_file_status(file: str) -> str:
184192
"""Get git status for a specific file."""
185193
try:
186194
result = subprocess.run(
187-
["git", "status", "--porcelain", file], capture_output=True, text=True, check=True
195+
["git", "status", "--porcelain", file], capture_output=True, check=True
188196
)
189-
return result.stdout[:2] if result.stdout else " "
197+
# Decode with error handling
198+
try:
199+
stdout = result.stdout.decode('utf-8')
200+
except UnicodeDecodeError:
201+
stdout = result.stdout.decode('utf-8', errors='replace')
202+
return stdout[:2] if stdout else " "
190203
except subprocess.CalledProcessError as e:
191-
error_msg = e.stderr if e.stderr else str(e)
204+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
192205
raise GitError(f"Failed to get file status: {error_msg}")
193206

194207
@staticmethod
@@ -199,7 +212,6 @@ def create_commit(title: str, message: str | None = None) -> bool:
199212
status = subprocess.run(
200213
["git", "diff", "--cached", "--quiet"],
201214
capture_output=True,
202-
text=True,
203215
)
204216

205217
if status.returncode == 0:
@@ -212,16 +224,17 @@ def create_commit(title: str, message: str | None = None) -> bool:
212224
if message:
213225
cmd.extend(["-m", message])
214226

215-
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
227+
result = subprocess.run(cmd, capture_output=True, check=True)
216228
if result.stderr:
217-
if result.stderr.startswith("warning:"):
218-
logger.warning("Git warning during commit: %s", result.stderr)
229+
stderr = result.stderr.decode('utf-8', errors='replace')
230+
if stderr.startswith("warning:"):
231+
logger.warning("Git warning during commit: %s", stderr)
219232
else:
220-
logger.info("Git message during commit: %s", result.stderr)
233+
logger.info("Git message during commit: %s", stderr)
221234
return True
222235

223236
except subprocess.CalledProcessError as e:
224-
error_msg = e.stderr if e.stderr else str(e)
237+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
225238
raise GitError(f"Failed to create commit: {error_msg}")
226239

227240
@staticmethod
@@ -242,11 +255,19 @@ def get_diff(files: list[GitFile] | None = None) -> str:
242255
if valid_paths:
243256
cmd.extend(["--"] + valid_paths)
244257

245-
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
246-
return result.stdout
258+
# Get raw bytes to handle different encodings
259+
result = subprocess.run(cmd, capture_output=True, check=True)
260+
261+
# Try strict UTF-8 first; on failure, fall back to UTF-8 with replacement characters
262+
try:
263+
return result.stdout.decode('utf-8')
264+
except UnicodeDecodeError:
265+
logger.warning("Failed to decode diff as UTF-8, using fallback encoding")
266+
# Decode as UTF-8 again, substituting U+FFFD for any undecodable bytes
267+
return result.stdout.decode('utf-8', errors='replace')
247268

248269
except subprocess.CalledProcessError as e:
249-
error_msg = e.stderr if e.stderr else str(e)
270+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
250271
raise GitError(f"Failed to get diff: {error_msg}")
251272

252273
@staticmethod
@@ -257,20 +278,20 @@ def stash_save(message: str = "") -> None:
257278
if message:
258279
cmd.extend(["-m", message])
259280

260-
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
281+
result = subprocess.run(cmd, capture_output=True, check=True)
261282
GitOperations._handle_git_output(result, "during stash save")
262283
except subprocess.CalledProcessError as e:
263-
error_msg = e.stderr if e.stderr else str(e)
284+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
264285
raise GitError(f"Failed to save stash: {error_msg}")
265286

266287
@staticmethod
267288
def stash_pop() -> None:
268289
"""Pop most recent stash."""
269290
try:
270-
result = subprocess.run(["git", "stash", "pop"], capture_output=True, text=True, check=True)
291+
result = subprocess.run(["git", "stash", "pop"], capture_output=True, check=True)
271292
GitOperations._handle_git_output(result, "during stash pop")
272293
except subprocess.CalledProcessError as e:
273-
error_msg = e.stderr if e.stderr else str(e)
294+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
274295
raise GitError(f"Failed to pop stash: {error_msg}")
275296

276297
@staticmethod
@@ -280,12 +301,11 @@ def unstage_file(file: str) -> None:
280301
result = subprocess.run(
281302
["git", "reset", "--", file],
282303
capture_output=True,
283-
text=True,
284304
check=True,
285305
)
286306
GitOperations._handle_git_output(result, f"while unstaging {file}")
287307
except subprocess.CalledProcessError as e:
288-
error_msg = e.stderr if e.stderr else str(e)
308+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
289309
raise GitError(f"Failed to unstage file: {error_msg}")
290310

291311
@staticmethod
@@ -295,10 +315,9 @@ def create_and_checkout_branch(branch: str) -> None:
295315
result = subprocess.run(
296316
["git", "checkout", "-b", branch],
297317
capture_output=True,
298-
text=True,
299318
check=True,
300319
)
301320
GitOperations._handle_git_output(result, f"while creating branch {branch}")
302321
except subprocess.CalledProcessError as e:
303-
error_msg = e.stderr if e.stderr else str(e)
322+
error_msg = e.stderr.decode('utf-8', errors='replace') if e.stderr else str(e)
304323
raise GitError(f"Failed to create branch '{branch}': {error_msg}")

0 commit comments

Comments
 (0)