Skip to content

Commit 51ab821

Browse files
Copilot and oleander committed
Extract diff parsing into dedicated src/diff module
Co-authored-by: oleander <220827+oleander@users.noreply.github.com>
1 parent e272d7f commit 51ab821

File tree

8 files changed

+449
-439
lines changed

8 files changed

+449
-439
lines changed

examples/multi_step_commit.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ use std::env;
22

33
use anyhow::Result;
44
use async_openai::Client;
5-
use ai::multi_step_integration::{generate_commit_message_local, generate_commit_message_multi_step, parse_diff};
5+
use ai::multi_step_integration::{generate_commit_message_local, generate_commit_message_multi_step};
6+
use ai::diff::parser::parse_diff;
67

78
#[tokio::main]
89
async fn main() -> Result<()> {

src/diff/mod.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
//! Diff processing and parsing utilities.
2+
//!
3+
//! This module handles parsing git diffs into structured data
4+
//! and provides utilities for working with diff content.
5+
6+
pub mod parser;
7+
pub mod traits;
8+
9+
pub use parser::{ParsedFile, parse_diff};
10+
pub use traits::{FilePath, Utf8String, DiffDeltaPath};

src/diff/parser.rs

Lines changed: 359 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,359 @@
1+
//! Git diff parsing utilities.
2+
3+
use anyhow::Result;
4+
5+
/// A single file's change as extracted from a git diff.
///
/// Values are produced by [`parse_diff`]; each one carries the file's path,
/// a coarse classification of the change, and the raw diff text that was
/// collected for that file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedFile {
    /// Path of the file taken from the `diff --git` header, with the git
    /// side prefix removed. `parse_diff` uses `"unknown"` when the input
    /// contains no recognisable header.
    pub path: String,
    /// Kind of change. `parse_diff` sets this to one of `"modified"`
    /// (the default), `"added"`, `"deleted"`, `"renamed"` or `"binary"`.
    pub operation: String,
    /// Diff text collected for this file, including its header lines.
    pub diff_content: String,
}
12+
13+
/// Extracts the file path from the tokens of a `diff --git` header line.
///
/// Exactly one leading side prefix is removed from each path. The recognised
/// prefixes are git's defaults (`a/`, `b/`) and the mnemonic ones (`c/`,
/// `i/`, `w/`, `o/`). Stripping only once matters: a file stored under a
/// top-level directory named `a`, `b`, `c`, ... (e.g. `a/c/foo.c`) must keep
/// that directory in the returned path.
///
/// Because the caller splits the header on whitespace, a path containing
/// spaces arrives as several tokens. When the tokens after `diff --git`
/// split into two equal halves that name the same path once their prefixes
/// are removed, that re-joined path is returned. Runs of whitespace inside
/// such a path are collapsed to a single space by the caller's split.
///
/// # Arguments
/// * `parts` - The whitespace-split parts from a "diff --git" line
///
/// # Returns
/// * `Option<String>` - The extracted path without its side prefix, or `None`
///   when fewer than four tokens are supplied
fn extract_file_path_from_diff_parts(parts: &[&str]) -> Option<String> {
    // Side prefixes git may put in front of a path in the header.
    const SIDE_PREFIXES: [&str; 6] = ["a/", "b/", "c/", "i/", "w/", "o/"];

    if parts.len() < 4 {
        return None;
    }

    // Remove at most ONE prefix; anything after it is part of the real path.
    let strip_side_prefix = |s: &str| -> String {
        SIDE_PREFIXES
            .iter()
            .find_map(|prefix| s.strip_prefix(*prefix))
            .unwrap_or(s)
            .to_string()
    };

    // Paths with spaces: `a/my file.txt b/my file.txt` shows up as four (or
    // more) tokens. Only trust the re-joined halves when both sides agree,
    // which is the unambiguous (non-rename) case.
    let tokens = &parts[2..];
    if tokens.len() > 2 && tokens.len() % 2 == 0 {
        let (old_half, new_half) = tokens.split_at(tokens.len() / 2);
        let old_joined = strip_side_prefix(&old_half.join(" "));
        let new_joined = strip_side_prefix(&new_half.join(" "));
        if old_joined == new_joined {
            return Some(new_joined);
        }
    }

    let new_path = strip_side_prefix(parts[3]);
    let old_path = strip_side_prefix(parts[2]);

    // Prefer new path unless it's /dev/null (deleted file)
    Some(if new_path == "/dev/null" || new_path == "dev/null" {
        old_path
    } else {
        new_path
    })
}
46+
47+
/// Parse git diff into individual file changes.
48+
///
49+
/// Handles various diff formats including:
50+
/// - Standard git diff output
51+
/// - Diffs with commit hashes
52+
/// - Diffs with various path prefixes (a/, b/, c/, i/)
53+
/// - Deleted files (/dev/null paths)
54+
///
55+
/// # Arguments
56+
/// * `diff_content` - Raw git diff text
57+
///
58+
/// # Returns
59+
/// * `Result<Vec<ParsedFile>>` - Parsed files or error
60+
pub fn parse_diff(diff_content: &str) -> Result<Vec<ParsedFile>> {
61+
let mut files = Vec::new();
62+
let mut current_file: Option<ParsedFile> = None;
63+
let mut current_diff = String::new();
64+
65+
// Debug output
66+
log::debug!("Parsing diff with {} lines", diff_content.lines().count());
67+
68+
// Add more detailed logging for debugging
69+
if log::log_enabled!(log::Level::Debug) && !diff_content.is_empty() {
70+
// Make sure we truncate at a valid UTF-8 character boundary
71+
let preview = if diff_content.len() > 500 {
72+
let truncated_index = diff_content
73+
.char_indices()
74+
.take_while(|(i, _)| *i < 500)
75+
.last()
76+
.map(|(i, c)| i + c.len_utf8())
77+
.unwrap_or(0);
78+
79+
format!("{}... (truncated)", &diff_content[..truncated_index])
80+
} else {
81+
diff_content.to_string()
82+
};
83+
log::debug!("Diff content preview: \n{preview}");
84+
}
85+
86+
// Handle different diff formats
87+
let mut in_diff_section = false;
88+
let mut _commit_hash_line: Option<&str> = None;
89+
90+
// First scan to detect if this is a commit message with hash
91+
for line in diff_content.lines().take(3) {
92+
if line.len() >= 40 && line.chars().take(40).all(|c| c.is_ascii_hexdigit()) {
93+
_commit_hash_line = Some(line);
94+
break;
95+
}
96+
}
97+
98+
// Process line by line
99+
for line in diff_content.lines() {
100+
// Skip commit hash lines and other metadata
101+
if line.starts_with("commit ") || (line.len() >= 40 && line.chars().take(40).all(|c| c.is_ascii_hexdigit())) || line.is_empty() {
102+
continue;
103+
}
104+
105+
// Check if we're starting a new file diff
106+
if line.starts_with("diff --git") {
107+
in_diff_section = true;
108+
// Save previous file if exists
109+
if let Some(mut file) = current_file.take() {
110+
file.diff_content = current_diff.clone();
111+
log::debug!("Adding file to results: {} ({})", file.path, file.operation);
112+
files.push(file);
113+
current_diff.clear();
114+
}
115+
116+
// Extract file path more carefully
117+
let parts: Vec<&str> = line.split_whitespace().collect();
118+
if let Some(path) = extract_file_path_from_diff_parts(&parts) {
119+
log::debug!("Found new file in diff: {path}");
120+
current_file = Some(ParsedFile {
121+
path,
122+
operation: "modified".to_string(), // Default, will be updated
123+
diff_content: String::new()
124+
});
125+
}
126+
127+
// Add the header line to the diff content
128+
current_diff.push_str(line);
129+
current_diff.push('\n');
130+
} else if line.starts_with("new file mode") {
131+
if let Some(ref mut file) = current_file {
132+
log::debug!("File {} is newly added", file.path);
133+
file.operation = "added".to_string();
134+
}
135+
current_diff.push_str(line);
136+
current_diff.push('\n');
137+
} else if line.starts_with("deleted file mode") {
138+
if let Some(ref mut file) = current_file {
139+
log::debug!("File {} is deleted", file.path);
140+
file.operation = "deleted".to_string();
141+
}
142+
current_diff.push_str(line);
143+
current_diff.push('\n');
144+
} else if line.starts_with("rename from") || line.starts_with("rename to") {
145+
if let Some(ref mut file) = current_file {
146+
log::debug!("File {} is renamed", file.path);
147+
file.operation = "renamed".to_string();
148+
}
149+
current_diff.push_str(line);
150+
current_diff.push('\n');
151+
} else if line.starts_with("Binary files") {
152+
if let Some(ref mut file) = current_file {
153+
log::debug!("File {} is binary", file.path);
154+
file.operation = "binary".to_string();
155+
}
156+
current_diff.push_str(line);
157+
current_diff.push('\n');
158+
} else if line.starts_with("index ") || line.starts_with("--- ") || line.starts_with("+++ ") || line.starts_with("@@ ") {
159+
// These are important diff headers that should be included
160+
current_diff.push_str(line);
161+
current_diff.push('\n');
162+
} else if in_diff_section {
163+
current_diff.push_str(line);
164+
current_diff.push('\n');
165+
}
166+
}
167+
168+
// Don't forget the last file
169+
if let Some(mut file) = current_file {
170+
file.diff_content = current_diff;
171+
log::debug!("Adding final file to results: {} ({})", file.path, file.operation);
172+
files.push(file);
173+
}
174+
175+
// If we didn't parse any files, check if this looks like a raw git diff output
176+
// from commands like `git show` that include commit info at the top
177+
if files.is_empty() && !diff_content.trim().is_empty() {
178+
log::debug!("Trying to parse as raw git diff output with commit info");
179+
180+
// Extract sections that start with "diff --git"
181+
let sections: Vec<&str> = diff_content.split("diff --git").skip(1).collect();
182+
183+
if !sections.is_empty() {
184+
for (i, section) in sections.iter().enumerate() {
185+
// Add the "diff --git" prefix back
186+
let full_section = format!("diff --git{section}");
187+
188+
// Extract file path from the section more carefully
189+
let mut found_path = false;
190+
191+
// Safer approach: iterate through lines and find the path
192+
let mut extracted_path = String::new();
193+
for section_line in full_section.lines().take(3) {
194+
if section_line.starts_with("diff --git") {
195+
let parts: Vec<&str> = section_line.split_whitespace().collect();
196+
if let Some(p) = extract_file_path_from_diff_parts(&parts) {
197+
extracted_path = p;
198+
found_path = true;
199+
break;
200+
}
201+
}
202+
}
203+
204+
if found_path {
205+
log::debug!("Found file in section {i}: {extracted_path}");
206+
files.push(ParsedFile {
207+
path: extracted_path,
208+
operation: "modified".to_string(), // Default
209+
diff_content: full_section
210+
});
211+
}
212+
}
213+
}
214+
}
215+
216+
// If still no files were parsed, treat the entire diff as a single change
217+
if files.is_empty() && !diff_content.trim().is_empty() {
218+
log::debug!("No standard diff format found, treating as single file change");
219+
files.push(ParsedFile {
220+
path: "unknown".to_string(),
221+
operation: "modified".to_string(),
222+
diff_content: diff_content.to_string()
223+
});
224+
}
225+
226+
log::debug!("Parsed {} files from diff", files.len());
227+
228+
// Add detailed debug output for each parsed file
229+
if log::log_enabled!(log::Level::Debug) {
230+
for (i, file) in files.iter().enumerate() {
231+
let content_preview = if file.diff_content.len() > 200 {
232+
// Make sure we truncate at a valid UTF-8 character boundary
233+
let truncated_index = file
234+
.diff_content
235+
.char_indices()
236+
.take_while(|(i, _)| *i < 200)
237+
.last()
238+
.map(|(i, c)| i + c.len_utf8())
239+
.unwrap_or(0);
240+
241+
format!("{}... (truncated)", &file.diff_content[..truncated_index])
242+
} else {
243+
file.diff_content.clone()
244+
};
245+
log::debug!("File {}: {} ({})\nContent preview:\n{}", i, file.path, file.operation, content_preview);
246+
}
247+
}
248+
249+
Ok(files)
250+
}
251+
252+
#[cfg(test)]
mod tests {
    use super::*;

    /// Two files in one diff: a modification followed by a newly added file.
    #[test]
    fn test_parse_diff() {
        let input = r#"diff --git a/src/main.rs b/src/main.rs
index 1234567..abcdefg 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,5 +1,6 @@
fn main() {
- println!("Hello");
+ println!("Hello, world!");
+ println!("New line");
}
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..1111111
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "test"
+version = "0.1.0"
"#;

        let parsed = parse_diff(input).unwrap();
        let expected = [("src/main.rs", "modified"), ("Cargo.toml", "added")];

        assert_eq!(parsed.len(), 2);
        for (file, &(path, operation)) in parsed.iter().zip(expected.iter()) {
            assert_eq!(file.path, path);
            assert_eq!(file.operation, operation);
            // Every parsed file must carry its diff text.
            assert!(!file.diff_content.is_empty());
        }
    }

    /// A leading `<hash> <subject>` line must be ignored rather than parsed.
    #[test]
    fn test_parse_diff_with_commit_hash() {
        let input = r#"0472ffa1665c4c5573fb8f7698c9965122eda675 Update files

diff --git a/test.js b/test.js
new file mode 100644
index 0000000..a730e61
--- /dev/null
+++ b/test.js
@@ -0,0 +1 @@
+console.log('Hello');
"#;

        let parsed = parse_diff(input).unwrap();

        assert_eq!(parsed.len(), 1);
        let only = &parsed[0];
        assert_eq!(only.path, "test.js");
        assert_eq!(only.operation, "added");
    }

    /// `c/` and `i/` prefixes, as seen in git hook diffs, are stripped from paths.
    #[test]
    fn test_parse_diff_with_c_i_prefixes() {
        let input = r#"diff --git c/test.md i/test.md
new file mode 100644
index 0000000..6c61a60
--- /dev/null
+++ i/test.md
@@ -0,0 +1 @@
+# Test File

diff --git c/test.js i/test.js
new file mode 100644
index 0000000..a730e61
--- /dev/null
+++ i/test.js
@@ -0,0 +1 @@
+console.log('Hello');
"#;

        let parsed = parse_diff(input).unwrap();
        assert_eq!(parsed.len(), 2);

        let (markdown, script) = (&parsed[0], &parsed[1]);

        assert_eq!(markdown.path, "test.md", "Should extract clean path without c/ prefix");
        assert_eq!(markdown.operation, "added");
        assert_eq!(script.path, "test.js", "Should extract clean path without i/ prefix");
        assert_eq!(script.operation, "added");

        // Each file keeps its own hunk content.
        assert!(markdown.diff_content.contains("# Test File"));
        assert!(script.diff_content.contains("console.log"));
    }

    /// A `deleted file mode` header marks the file as deleted.
    #[test]
    fn test_parse_diff_with_deleted_file() {
        let input = r#"diff --git a/test.txt b/test.txt
deleted file mode 100644
index 9daeafb..0000000
--- a/test.txt
+++ /dev/null
@@ -1 +0,0 @@
-test
"#;

        let parsed = parse_diff(input).unwrap();

        assert_eq!(parsed.len(), 1);
        let only = &parsed[0];
        assert_eq!(only.path, "test.txt");
        assert_eq!(only.operation, "deleted");
    }
}

0 commit comments

Comments
 (0)