// Connect git files that were changed together frequently with "CHANGED_TOGETHER_WITH".
//
// For every pair of non-deleted files that were updated in the same commit, an undirected
// relationship is merged and annotated with:
//   - commitCount:   number of distinct commits that updated both files
//   - minConfidence: commitCount divided by the lower "numberOfGitUpdateCommits" of both files
//   - commitHashes:  the distinct hashes (sha) of those commits
//
// Requires the APOC function "apoc.coll.combinations".
// Note: "CALL { ... } IN TRANSACTIONS" is only permitted in implicit (auto-commit) transactions.

// Determine the 95th percentile of the number of distinct updated (non-deleted) files per commit.
// It is used below as the upper limit for the size of the commits that are taken into account.
MATCH (git_commit_global:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file_global:Git:File)
WHERE git_file_global.deletedAt IS NULL
 WITH git_commit_global
     ,count(DISTINCT git_file_global) AS commitFileCount
 WITH percentileDisc(commitFileCount, 0.95) AS globalFileCountThreshold

// Collect the updated, non-deleted files of every commit. Only files that belong to a repository are considered.
MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(:Git:Change)-[:UPDATES]->(git_file:Git:File)
MATCH (:Git:Repository)-[:HAS_FILE]->(git_file)
WHERE git_file.deletedAt IS NULL
// Order files to ensure that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA)
ORDER BY git_commit.sha, git_file.relativePath
 WITH globalFileCountThreshold
     ,git_commit.sha             AS commitHash
     ,collect(DISTINCT git_file) AS filesInCommit
// Limit the file count per commit to min. 2 (changed together) and
// max. the 95th percentile of all commits (reduce permutations, improve performance,
// filter out large refactorings that usually affect many files)
WHERE size(filesInCommit) >= 2
  AND size(filesInCommit) <= globalFileCountThreshold
// Collect distinct pairwise (..., 2, 2) combinations of all files in the list
 WITH commitHash
     ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations
UNWIND fileCombinations AS fileCombination
// Aggregate per file pair: in how many (and in which) commits both files were changed together.
// No minimum commit count is applied here; every pair that co-occurred at least once is kept.
 WITH fileCombination
     ,count(DISTINCT commitHash)   AS commitCount
     ,collect(DISTINCT commitHash) AS commitHashes
 WITH fileCombination[0] AS firstFile
     ,fileCombination[1] AS secondFile
     ,commitCount
     ,commitHashes
// Get the lowest number of git update commits of both files (file pair)
 WITH firstFile
     ,secondFile
     ,commitCount
     ,commitHashes
     ,CASE WHEN firstFile.numberOfGitUpdateCommits < secondFile.numberOfGitUpdateCommits
           THEN firstFile.numberOfGitUpdateCommits
           ELSE secondFile.numberOfGitUpdateCommits
       END AS minNumberOfGitUpdateCommits
// Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file?
// Since the divisor is the lower of both counts, this is the higher of the two per-file ratios.
// The guard leaves the value null (instead of a non-finite number) when the count is missing or zero,
// so that the aggregated statistics below are not distorted.
 WITH firstFile
     ,secondFile
     ,commitCount
     ,commitHashes
     ,CASE WHEN minNumberOfGitUpdateCommits > 0
           THEN toFloat(commitCount) / minNumberOfGitUpdateCommits
       END AS minConfidence
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the properties
// "commitCount", "minConfidence" and "commitHashes" on it. Written in batches of 500 pairs per transaction.
 CALL (firstFile, secondFile, commitCount, minConfidence, commitHashes) {
    MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
      SET pairwiseChange.commitCount   = commitCount
         ,pairwiseChange.minConfidence = minConfidence
         ,pairwiseChange.commitHashes  = commitHashes
 } IN TRANSACTIONS OF 500 ROWS
// Return one row with some statistics about the found pairs, their commit counts and their confidence
RETURN min(commitCount)                    AS minCommitCount
      ,max(commitCount)                    AS maxCommitCount
      ,avg(commitCount)                    AS avgCommitCount
      ,percentileDisc(commitCount, 0.5)    AS percentile50CommitCount
      ,percentileDisc(commitCount, 0.9)    AS percentile90CommitCount
      ,percentileDisc(commitCount, 0.95)   AS percentile95CommitCount
      ,min(minConfidence)                  AS minMinConfidence
      ,max(minConfidence)                  AS maxMinConfidence
      ,avg(minConfidence)                  AS avgMinConfidence
      ,percentileDisc(minConfidence, 0.5)  AS percentile50MinConfidence
      ,percentileDisc(minConfidence, 0.9)  AS percentile90MinConfidence
      ,percentileDisc(minConfidence, 0.95) AS percentile95MinConfidence
      ,count(*)                            AS pairCount