Skip to content

Commit 0bf9a9d

Browse files
committed
Calculate min confidence of two files changing together (normalized co-change count)
1 parent 679e21e commit 0bf9a9d

7 files changed

+88
-38
lines changed

cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
// Take the already existing "CHANGED_TOGETHER_WITH" relationship between git files and apply it to resolved file nodes. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files".
22

33
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
4-
WHERE elementId(firstGitFile) < elementId(secondGitFile)
4+
//De-duplicating the pairs of files isn't necessary, because the dependency relation is directed.
5+
//WHERE elementId(firstGitFile) < elementId(secondGitFile)
56
MATCH (firstGitFile)-[:RESOLVES_TO]->(firstCodeFile:File&!Git&!Repository)
67
MATCH (secondGitFile)-[:RESOLVES_TO]->(secondCodeFile:File&!Git&!Repository)
78
CALL (firstCodeFile, secondCodeFile, gitChange) {

cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,20 +26,36 @@ UNWIND fileCombinations AS fileCombination
2626
// Filter out file pairs that were not changed together very often
2727
// In detail: More than 0.1 percent (1 per mille) compared to overall commit count
2828
WHERE commitCount > globalCommitCount * 0.001
29+
WITH *
30+
// Get the lowest number of git update commits of both files (file pair)
31+
,CASE WHEN fileCombination[0].numberOfGitUpdateCommits < fileCombination[1].numberOfGitUpdateCommits
32+
THEN fileCombination[0].numberOfGitUpdateCommits
33+
ELSE fileCombination[1].numberOfGitUpdateCommits
34+
END AS minNumberOfGitUpdateCommits
2935
WITH fileCombination[0] AS firstFile
3036
,fileCombination[1] AS secondFile
3137
,commitCount
38+
// Out of all the times the less frequently changed file was touched, how often did it co-occur with the other file?
39+
,toFloat(commitCount) / minNumberOfGitUpdateCommits AS minConfidence
3240
,commitHashes
3341
// Create the new relationship "CHANGED_TOGETHER_WITH" and set the properties "commitCount", "minConfidence" and "commitHashes" on it
34-
CALL (firstFile, secondFile, commitCount, commitHashes) {
42+
CALL (firstFile, secondFile, commitCount, minConfidence, commitHashes) {
3543
MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile)
36-
SET pairwiseChange.commitCount = commitCount
37-
,pairwiseChange.commitHashes = commitHashes
44+
SET pairwiseChange.commitCount = commitCount
45+
,pairwiseChange.minConfidence = minConfidence
46+
,pairwiseChange.commitHashes = commitHashes
3847
} IN TRANSACTIONS
3948
// Return one row with some statistics about the found pairs, their commit counts and their min confidence
40-
RETURN max(commitCount) AS maxCommitCount
41-
,avg(commitCount) AS avgCommitCount
42-
,percentileDisc(commitCount, 0.5) AS percentile50CommitCount
43-
,percentileDisc(commitCount, 0.9) AS percentile90CommitCount
44-
,percentileDisc(commitCount, 0.95) AS percentile95CommitCount
45-
,count(*) AS pairCount
49+
RETURN min(commitCount) AS minCommitCount
50+
,max(commitCount) AS maxCommitCount
51+
,avg(commitCount) AS avgCommitCount
52+
,percentileDisc(commitCount, 0.5) AS percentile50CommitCount
53+
,percentileDisc(commitCount, 0.9) AS percentile90CommitCount
54+
,percentileDisc(commitCount, 0.95) AS percentile95CommitCount
55+
,min(minConfidence) AS minMinConfidence
56+
,max(minConfidence) AS maxMinConfidence
57+
,avg(minConfidence) AS avgMinConfidence
58+
,percentileDisc(minConfidence, 0.5) AS percentile50MinConfidence
59+
,percentileDisc(minConfidence, 0.9) AS percentile90MinConfidence
60+
,percentileDisc(minConfidence, 0.95) AS percentile95MinConfidence
61+
,count(*) AS pairCount

cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository)
44
MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile)
55
UNWIND gitChange.commitHashes AS commitHash
6-
RETURN gitRepository.name + '/' + firstGitFile.relativePath AS filePath
6+
WITH gitRepository.name + '/' + firstGitFile.relativePath AS filePath
77
,count(DISTINCT commitHash) AS commitCount
8+
// Every unwound row of the same file repeats its update commit count, so take it once (max) instead of summing it up per row
,max(firstGitFile.numberOfGitUpdateCommits) AS fileUpdateCount
9+
WITH *
10+
// Out of all the times the file was touched, how often did it co-occur with other files?
11+
,CASE WHEN fileUpdateCount > 0 THEN toFloat(commitCount) / fileUpdateCount ELSE 0.0 END AS coChangeRate
12+
RETURN filePath, commitCount, coChangeRate
813
ORDER BY commitCount DESC

cypher/GitLog/List_pairwise_changed_files_with_dependencies.cypher

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,36 +4,16 @@ MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
44
MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
55
//De-duplicating the pairs of files isn't necessary, because the dependency relation is directed.
66
//WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
7-
WITH firstCodeFile.fileName AS firstFileName
8-
,secondCodeFile.fileName AS secondFileName
7+
WITH firstCodeFile.fileName AS firstFileName
8+
,secondCodeFile.fileName AS secondFileName
99
,coalesce(dependency.weight, dependency.cardinality) AS dependencyWeight
10-
,pairwiseChange.commitCount AS commitCount
10+
,pairwiseChange.commitCount AS commitCount
11+
,pairwiseChange.minConfidence AS minConfidence
1112
,dependency.fileDistanceAsFewestChangeDirectoryCommands AS fileDistanceAsFewestChangeDirectoryCommands
1213
RETURN dependencyWeight
1314
,commitCount
15+
,minConfidence
1416
,fileDistanceAsFewestChangeDirectoryCommands
1517
// ,count(*) AS occurrences
1618
// ,collect(firstFileName + ' -> ' + secondFileName)[0..3] AS examples
17-
ORDER BY dependencyWeight, commitCount
18-
19-
// MATCH (firstCodeFile:File)-[dependency:DEPENDS_ON]->(secondCodeFile:File)
20-
// MATCH (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile)
21-
// WHERE elementId(firstCodeFile) < elementId(secondCodeFile)
22-
// RETURN firstCodeFile.fileName AS firstFileName
23-
// ,secondCodeFile.fileName AS secondFileName
24-
// ,dependency.weight AS dependencyWeight
25-
// ,pairwiseChange.commitCount AS commitCount
26-
// ORDER BY dependencyWeight, commitCount
27-
28-
// MATCH (g1:!Git&File)-[relation:CHANGED_TOGETHER_WITH|DEPENDS_ON]-(g2:!Git&File)
29-
// WITH count(DISTINCT relation) AS relatedFilesCount
30-
// ,collect(DISTINCT relation) AS relations
31-
// UNWIND relations AS relation
32-
// WITH relatedFilesCount
33-
// ,coalesce(relation.commitCount, 0) AS commitCount
34-
// ,coalesce(relation.weight, 0) AS dependencyWeight
35-
// ,coalesce(relation.fileDistanceAsFewestChangeDirectoryCommands, 0) AS fileDistanceAsFewestChangeDirectoryCommands
36-
// RETURN dependencyWeight
37-
// ,commitCount
38-
// ,fileDistanceAsFewestChangeDirectoryCommands
39-
// ORDER BY dependencyWeight, commitCount
19+
ORDER BY dependencyWeight, commitCount
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Set numberOfGitUpdateCommits property on Git File nodes and on the code File nodes they resolve to when git commits with Update modifier (detected by the plugin) are present
2+
3+
MATCH (git_file:File&Git)<-[:UPDATES]-(:Git&Change)<-[:CONTAINS_CHANGE]-(git_commit:Git&Commit)
4+
WITH git_file, count(DISTINCT git_commit.sha) AS numberOfGitUpdateCommits
5+
SET git_file.numberOfGitUpdateCommits = numberOfGitUpdateCommits
6+
WITH git_file, numberOfGitUpdateCommits
7+
MATCH (code_file:File&!Git)<-[:RESOLVES_TO]-(git_file)
8+
SET code_file.numberOfGitUpdateCommits = numberOfGitUpdateCommits
9+
RETURN count(DISTINCT code_file) AS codeFileUpdates
10+
,collect(DISTINCT code_file.name)[0..4] AS codeFileExample

jupyter/GitHistoryGeneral.ipynb

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1198,6 +1198,7 @@
11981198
"pairwise_changed_git_files = pairwise_changed_git_files.groupby(['directoryPath']).aggregate(\n",
11991199
" pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
12001200
" pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n",
1201+
" pairwiseChangeAverageRate=pd.NamedAgg(column=\"coChangeRate\", aggfunc=\"mean\"),\n",
12011202
")\n",
12021203
"pairwise_changed_git_files.reset_index(inplace=True)\n",
12031204
"\n",
@@ -1220,6 +1221,7 @@
12201221
"\n",
12211222
"pairwise_changed_git_files['pairwiseChangeCommitCount'] = pairwise_changed_git_files['pairwiseChangeCommitCount'].fillna(0).astype(int)\n",
12221223
"pairwise_changed_git_files['pairwiseChangeFileCount'] = pairwise_changed_git_files['pairwiseChangeFileCount'].fillna(0).astype(int)\n",
1224+
"pairwise_changed_git_files['pairwiseChangeAverageRate'] = pairwise_changed_git_files['pairwiseChangeAverageRate'].fillna(0).astype(float)\n",
12231225
"pairwise_changed_git_files.reset_index(inplace=True)\n",
12241226
"\n",
12251227
"# Debug\n",
@@ -1399,7 +1401,13 @@
13991401
" display(pearsonr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
14001402
"\n",
14011403
" display(\"Spearman Correlation with p-value for commitCount and dependencyWeight\")\n",
1402-
" display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))"
1404+
" display(spearmanr(pairwise_changed_git_files_with_dependencies['commitCount'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
1405+
"\n",
1406+
" display(\"Pearson Correlation with p-value for minConfidence and dependencyWeight\")\n",
1407+
" display(pearsonr(pairwise_changed_git_files_with_dependencies['minConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))\n",
1408+
"\n",
1409+
" display(\"Spearman Correlation with p-value for minConfidence and dependencyWeight\")\n",
1410+
" display(spearmanr(pairwise_changed_git_files_with_dependencies['minConfidence'], pairwise_changed_git_files_with_dependencies['dependencyWeight']))"
14031411
]
14041412
},
14051413
{
@@ -1431,6 +1439,34 @@
14311439
" figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseChangedFilesVsDependencyWeight\"))"
14321440
]
14331441
},
1442+
{
1443+
"cell_type": "code",
1444+
"execution_count": null,
1445+
"id": "75264b82",
1446+
"metadata": {},
1447+
"outputs": [],
1448+
"source": [
1449+
"# Scatter plot of all pairs of files with their min confidence (normalized update commit count) on the x axis and dependency weight on the y axis\n",
1450+
"\n",
1451+
"if pairwise_changed_git_files_with_dependencies.empty:\n",
1452+
" print(\"No data to plot\")\n",
1453+
"else:\n",
1454+
" figure = plotly_graph_objects.Figure(plotly_graph_objects.Scatter(\n",
1455+
" x=pairwise_changed_git_files_with_dependencies['minConfidence'], \n",
1456+
" y=pairwise_changed_git_files_with_dependencies['dependencyWeight'],\n",
1457+
" mode='markers',\n",
1458+
" ))\n",
1459+
" figure.update_layout(\n",
1460+
" **plotly_bar_layout_base_settings,\n",
1461+
" title='Pairwise changed files: Min confidence co-change rate vs. dependency weight',\n",
1462+
" xaxis_title='co-change rate (min confidence, normalized update commit count)',\n",
1463+
" yaxis_title='dependency weight',\n",
1464+
" )\n",
1465+
" figure.show(**plotly_treemap_figure_show_settings)\n",
1466+
" if is_command_line_execution():\n",
1467+
" figure.write_image(**get_plotly_figure_write_image_settings(\"PairwiseChangedFilesVsDependencyWeight\"))"
1468+
]
1469+
},
14341470
{
14351471
"cell_type": "markdown",
14361472
"id": "14e87aff",

scripts/importGit.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ postGitPluginImport() {
160160

161161
echo "importGit: Add numberOfGitCommits property to nodes with matching file names..."
162162
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_commits.cypher"
163+
echo "importGit: Add numberOfGitUpdateCommits property to file nodes and code nodes with matching file names..."
164+
execute_cypher "${GIT_LOG_CYPHER_DIR}/Set_number_of_git_plugin_update_commits.cypher"
163165
}
164166

165167
postAggregatedGitLogImport() {

0 commit comments

Comments
 (0)