// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH".
//
// For every pair of non-deleted files that were updated in the same commit, an undirected
// CHANGED_TOGETHER_WITH relationship is merged and annotated with:
//   - updateCommitCount:         number of distinct commits that updated both files
//   - updateCommitHashes:        the hashes (sha) of those commits
//   - updateCommitMinConfidence: updateCommitCount / min(updateCommitCount of either file)
//   - updateCommitSupport:       updateCommitCount / number of all update commits
//   - updateCommitLift:          updateCommitSupport / (support of first file * support of second file)
//
// NOTE(review): the per-file property "updateCommitCount" is only read here, never written.
// It is presumably set by an earlier query; if it is missing, the derived confidence and
// lift values become null and the corresponding relationship properties are not set.
//
// Requires APOC (apoc.coll.combinations) and has to run in an implicit (auto-commit)
// transaction because of "CALL { ... } IN TRANSACTIONS".

// Global statistics over all commits that update at least one non-deleted file:
//   - globalFileCountThreshold: 95th percentile of the number of distinct updated files per commit
//     (used below to filter out large refactoring commits)
//   - globalUpdateCommitCount:  number of those commits
MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
WHERE git_file_global.deletedAt IS NULL
 WITH git_commit_global
     ,count(DISTINCT git_file_global) AS commitFileCount
 WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold
     ,count(git_commit_global)              AS globalUpdateCommitCount
// Main section
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file:Git:File)
// Only consider files that belong to a git repository
MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file)
WHERE git_file.deletedAt IS NULL
// Order files to ensure that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA)
ORDER BY git_commit.sha, git_file.relativePath
 WITH globalFileCountThreshold
     ,globalUpdateCommitCount
     ,git_commit.sha            AS commitHash
     ,collect(DISTINCT git_file) AS filesInCommit
// Limit the file count per commit to min. 2 (changed together) and
// max. the global threshold (reduce permutations, improve performance,
// filter out large refactorings that usually affect many files)
WHERE size(filesInCommit) >= 2
  AND size(filesInCommit) <= globalFileCountThreshold
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list.
// The threshold is not needed anymore from here on.
 WITH globalUpdateCommitCount
     ,commitHash
     ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
UNWIND fileCombinations AS fileCombination
// Aggregate per file pair: in how many (and which) commits were both files updated?
 WITH globalUpdateCommitCount
     ,fileCombination
     ,count(DISTINCT commitHash)   AS updateCommitCount
     ,collect(DISTINCT commitHash) AS updateCommitHashes
// Deactivated:
// Filter out file pairs that were not changed together very often
// In detail: More than 0.1 per mille compared to overall commit count
// WHERE updateCommitCount > globalUpdateCommitCount * 0.001
 WITH *
     ,fileCombination[0] AS firstFile
     ,fileCombination[1] AS secondFile
 WITH *
      // Get the lowest number of git update commits of both files (file pair)
     ,CASE WHEN firstFile.updateCommitCount < secondFile.updateCommitCount
           THEN firstFile.updateCommitCount
           ELSE secondFile.updateCommitCount
       END AS minUpdateCommitCount
      // Calculate update commit support by dividing the update commit count by the overall commit count for both files
     ,toFloat(firstFile.updateCommitCount)  / globalUpdateCommitCount AS firstFileUpdateSupport
     ,toFloat(secondFile.updateCommitCount) / globalUpdateCommitCount AS secondFileUpdateSupport
 WITH *
      // Expected likelihood that the first and the second file change together given complete randomness
     ,firstFileUpdateSupport * secondFileUpdateSupport AS expectedCoUpdateSupport
 WITH firstFile
     ,secondFile
     ,updateCommitHashes
     ,updateCommitCount
      // Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file?
     ,toFloat(updateCommitCount) / minUpdateCommitCount    AS updateCommitMinConfidence
      // Compared to all commits in general, how high is the percentage of the commits where both files changed together?
     ,toFloat(updateCommitCount) / globalUpdateCommitCount AS updateCommitSupport
     ,expectedCoUpdateSupport
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the co-change metrics on it.
// Writes are committed in batches of 500 file pairs.
CALL (firstFile, secondFile, updateCommitCount, updateCommitHashes, updateCommitMinConfidence, updateCommitSupport, expectedCoUpdateSupport) {
    MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
      SET pairwiseChange.updateCommitCount         = updateCommitCount
         ,pairwiseChange.updateCommitHashes        = updateCommitHashes
         ,pairwiseChange.updateCommitMinConfidence = updateCommitMinConfidence
         ,pairwiseChange.updateCommitSupport       = updateCommitSupport
          // Lift: observed co-change support relative to the support expected by chance
         ,pairwiseChange.updateCommitLift          = updateCommitSupport / expectedCoUpdateSupport
} IN TRANSACTIONS OF 500 ROWS
// Return one row with some statistics about the found pairs and their commit counts
RETURN min(updateCommitCount)                          AS minCommitCount
      ,max(updateCommitCount)                          AS maxCommitCount
      ,avg(updateCommitCount)                          AS avgCommitCount
      ,percentileDisc(updateCommitCount, 0.5)          AS percentile50CommitCount
      ,percentileDisc(updateCommitCount, 0.9)          AS percentile90CommitCount
      ,percentileDisc(updateCommitCount, 0.95)         AS percentile95CommitCount
      ,min(updateCommitMinConfidence)                  AS minMinConfidence
      ,max(updateCommitMinConfidence)                  AS maxMinConfidence
      ,avg(updateCommitMinConfidence)                  AS avgMinConfidence
      ,percentileDisc(updateCommitMinConfidence, 0.5)  AS percentile50MinConfidence
      ,percentileDisc(updateCommitMinConfidence, 0.9)  AS percentile90MinConfidence
      ,percentileDisc(updateCommitMinConfidence, 0.95) AS percentile95MinConfidence
      ,count(*)                                        AS pairCount