Skip to content

Commit 9423311

Browse files
committed
Add CHANGED_TOGETHER_WITH edge for git file nodes
1 parent 765945d commit 9423311

5 files changed

+90
-24
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
// Take the already existing "CHANGED_TOGETHER_WITH" relationship between git files and apply it to resolved file nodes. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
2+
3+
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
WHERE elementId(firstGitFile) < elementId(secondGitFile)
5+
MATCH (firstGitFile)-[:RESOLVES_TO]->(firstCodeFile:File&!Git&!Repository)
6+
MATCH (secondGitFile)-[:RESOLVES_TO]->(secondCodeFile:File&!Git&!Repository)
7+
CALL (firstCodeFile, secondCodeFile, gitChange) {
8+
MERGE (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
9+
ON CREATE SET pairwiseChange.commitCount = gitChange.commitCount
10+
} IN TRANSACTIONS
11+
RETURN count(*) AS pairCount
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH"
2+
3+
MATCH (global_git_commit:Git:Commit)
4+
WITH count(global_git_commit) AS globalCommitCount
5+
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
6+
WHERE git_file.deletedAt IS NULL
7+
// Order files to ensure that each pair of distinct files is always grouped in the same order: (fileA, fileB), never (fileB, fileA)
8+
ORDER BY git_commit.sha, git_file.relativePath
9+
WITH globalCommitCount
10+
,git_commit.sha AS commitHash
11+
,collect(DISTINCT git_file) AS filesInCommit
12+
// Limit the file count to min. 2 (changed together) and
13+
// max. 50 (reduce permutations, improve performance, filter out large refactorings that usually affect many files)
14+
WHERE size(filesInCommit) >= 2
15+
AND size(filesInCommit) <= 50
16+
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
17+
WITH globalCommitCount
18+
,commitHash
19+
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
20+
UNWIND fileCombinations AS fileCombination
21+
WITH globalCommitCount
22+
,fileCombination
23+
,count(DISTINCT commitHash) AS commitCount
24+
// Filter out file pairs that were not changed together very often
25+
// In detail: More than 1 per mille (0.1 percent) of the overall commit count
26+
WHERE commitCount > globalCommitCount * 0.001
27+
WITH fileCombination[0] AS firstFile
28+
,fileCombination[1] AS secondFile
29+
,commitCount
30+
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the property "commitCount" on it
31+
CALL (firstFile, secondFile, commitCount) {
32+
MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
33+
ON CREATE SET pairwiseChange.commitCount = commitCount
34+
} IN TRANSACTIONS
35+
// Return one row with some statistics about the found pairs and their commit counts
36+
RETURN max(commitCount) AS maxCommitCount
37+
,avg(commitCount) AS avgCommitCount
38+
,percentileDisc(commitCount, 0.5) AS percentile50CommitCount
39+
,percentileDisc(commitCount, 0.9) AS percentile90CommitCount
40+
,percentileDisc(commitCount, 0.95) AS percentile95CommitCount
41+
,count(*) AS pairCount
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// List git files that were changed together frequently
2+
3+
MATCH (global_git_commit:Git:Commit)
4+
WITH count(global_git_commit) AS globalCommitCount
5+
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)
6+
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
7+
WHERE git_file.deletedAt IS NULL
8+
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
9+
WITH globalCommitCount
10+
,git_commit.sha AS commitHash
11+
,collect(DISTINCT filePath) AS filesInCommit
12+
WHERE size(filesInCommit) >= 2
13+
AND size(filesInCommit) <= 50
14+
WITH globalCommitCount
15+
,commitHash
16+
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
17+
UNWIND fileCombinations AS fileCombination
18+
WITH globalCommitCount
19+
,apoc.coll.sort(fileCombination) AS fileCombination
20+
,count(DISTINCT commitHash) AS commitCount
21+
WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare
22+
RETURN fileCombination[0] AS firstFile
23+
,fileCombination[1] AS secondFile
24+
,commitCount
25+
ORDER BY commitCount DESC
Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,10 @@
1-
// List git files that where changed together frequently
1+
// List git files that were changed together frequently. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
22

3-
MATCH (global_git_commit:Git:Commit)
4-
WITH count(global_git_commit) AS globalCommitCount
5-
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
6-
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
7-
WHERE git_file.deletedAt IS NULL
8-
WITH *, git_repository.name + '/' + git_file.relativePath AS filePath
9-
WITH globalCommitCount
10-
,git_commit.sha AS commitHash
11-
,collect(DISTINCT filePath) AS filesInCommit
12-
WHERE size(filesInCommit) >= 2
13-
AND size(filesInCommit) <= 50
14-
WITH globalCommitCount
15-
,commitHash
16-
,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
17-
UNWIND fileCombinations AS fileCombination
18-
WITH globalCommitCount
19-
,apoc.coll.sort(fileCombination) AS fileCombination
20-
,count(DISTINCT commitHash) AS commitCount
21-
WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare
22-
RETURN fileCombination[0] AS firstFile
23-
,fileCombination[1] AS secondFile
24-
,commitCount
25-
ORDER BY commitCount DESC
3+
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4+
WHERE elementId(firstGitFile) < elementId(secondGitFile)
5+
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
6+
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(secondGitFile)
7+
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS firstFile
8+
,gitRepository.name + '/' + secondGitFile.relativePath AS secondFile
9+
,gitChange.commitCount AS commitCount
10+
ORDER BY commitCount DESC

scripts/importGit.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,10 @@ commonPostGitImport() {
125125
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher"
126126
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher"
127127

128+
echo "importGit: Creating relationships to file nodes that where changed together..."
129+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher"
130+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher"
131+
128132
# Since it's currently not possible to rule out ambiguity in git<->code file matching,
129133
# the following verifications are only an additional info in the log rather than an error.
130134
echo "importGit: Running verification queries for troubleshooting (non failing)..."

0 commit comments

Comments
 (0)