Skip to content

Commit d40b4b0

Browse files
committed
Add reports for frequently changed files
1 parent 32f6f7b commit d40b4b0

File tree

6 files changed

+244
-15
lines changed

6 files changed

+244
-15
lines changed

cypher/GitLog/List_pairwise_changed_files.cypher

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ RETURN firstFileName
1414
,firstFile.extension AS firstFileExtension
1515
,secondFile.extension AS secondFileExtension
1616
,firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair
17-
,toInteger(pairwiseChange.updateCommitCount) AS updateCommitCount
17+
,pairwiseChange.updateCommitCount AS updateCommitCount
1818
,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
1919
,pairwiseChange.updateCommitSupport AS updateCommitSupport
2020
,pairwiseChange.updateCommitLift AS updateCommitLift
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// List the top 50 pairs of files that were changed together, ordered by their commit count (descending).
// Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.

// The relationship is matched without direction, so every pair is found twice.
// Comparing the element ids keeps exactly one of the two mirrored results.
 MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
 WHERE elementId(firstFile) < elementId(secondFile)
  WITH *
      ,coalesce(firstFile.relativePath, firstFile.fileName)   AS firstFileName
      ,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
RETURN firstFile.name                                AS firstFileNameShort
      ,secondFile.name                               AS secondFileNameShort
      ,firstFileName
      ,secondFileName
      ,pairwiseChange.updateCommitCount              AS updateCommitCount
      ,pairwiseChange.updateCommitMinConfidence      AS updateCommitMinConfidence
      ,pairwiseChange.updateCommitSupport            AS updateCommitSupport
      ,pairwiseChange.updateCommitLift               AS updateCommitLift
      ,pairwiseChange.updateCommitJaccardSimilarity  AS updateCommitJaccardSimilarity
// The file names act as tie-breakers so that the rows selected by LIMIT are reproducible
// when several pairs share the same commit count.
 ORDER BY updateCommitCount DESC, firstFileName ASC, secondFileName ASC
 LIMIT 50
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// List the top 50 pairs of files that were changed together, ordered by their commit jaccard similarity (descending).
// Jaccard similarity: 0 = never change together, 1 = always change together.
// Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.

// The relationship is matched without direction, so every pair is found twice.
// Comparing the element ids keeps exactly one of the two mirrored results.
 MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
 WHERE elementId(firstFile) < elementId(secondFile)
  WITH *
      ,coalesce(firstFile.relativePath, firstFile.fileName)   AS firstFileName
      ,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
RETURN firstFile.name                                AS firstFileNameShort
      ,secondFile.name                               AS secondFileNameShort
      ,firstFileName
      ,secondFileName
      ,pairwiseChange.updateCommitJaccardSimilarity  AS updateCommitJaccardSimilarity
      ,pairwiseChange.updateCommitCount              AS updateCommitCount
      ,pairwiseChange.updateCommitLift               AS updateCommitLift
      ,pairwiseChange.updateCommitMinConfidence      AS updateCommitMinConfidence
      ,pairwiseChange.updateCommitSupport            AS updateCommitSupport
// The file names act as tie-breakers so that the rows selected by LIMIT are reproducible
// when several pairs share the same jaccard similarity.
 ORDER BY updateCommitJaccardSimilarity DESC, firstFileName ASC, secondFileName ASC
 LIMIT 50
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// List the top 50 pairs of files that were changed together, ordered by their commit lift (descending).
// Lift > 1: the files change together more often than by random chance.
// Requires Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher to run first.

// The relationship is matched without direction, so every pair is found twice.
// Comparing the element ids keeps exactly one of the two mirrored results.
 MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
 WHERE elementId(firstFile) < elementId(secondFile)
  WITH *
      ,coalesce(firstFile.relativePath, firstFile.fileName)   AS firstFileName
      ,coalesce(secondFile.relativePath, secondFile.fileName) AS secondFileName
RETURN firstFile.name                                AS firstFileNameShort
      ,secondFile.name                               AS secondFileNameShort
      ,firstFileName
      ,secondFileName
      ,pairwiseChange.updateCommitLift               AS updateCommitLift
      ,pairwiseChange.updateCommitCount              AS updateCommitCount
      ,pairwiseChange.updateCommitMinConfidence      AS updateCommitMinConfidence
      ,pairwiseChange.updateCommitSupport            AS updateCommitSupport
      ,pairwiseChange.updateCommitJaccardSimilarity  AS updateCommitJaccardSimilarity
// The file names act as tie-breakers so that the rows selected by LIMIT are reproducible
// when several pairs share the same lift.
 ORDER BY updateCommitLift DESC, firstFileName ASC, secondFileName ASC
 LIMIT 50

jupyter/GitHistoryGeneral.ipynb

Lines changed: 186 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,6 +1343,16 @@
13431343
"The following charts show how these metrics are distributed across pairs of files that were changed together."
13441344
]
13451345
},
1346+
{
1347+
"cell_type": "code",
1348+
"execution_count": null,
1349+
"id": "e7721dfd",
1350+
"metadata": {},
1351+
"outputs": [],
1352+
"source": [
1353+
"# Initial steps: Function Declaration and Data Preparation"
1354+
]
1355+
},
13461356
{
13471357
"cell_type": "code",
13481358
"execution_count": null,
@@ -1368,20 +1378,99 @@
13681378
" return : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n",
13691379
" \"\"\"\n",
13701380
" top_extensions = input_data.groupby('fileExtensionPair').aggregate({'filePairWithRelativePath': 'count'}).reset_index()\n",
1371-
" return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']\n",
1372-
"\n",
1373-
"top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)"
1381+
" return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']"
13741382
]
13751383
},
13761384
{
13771385
"cell_type": "code",
13781386
"execution_count": null,
1379-
"id": "3c34ceea",
1387+
"id": "7e228e63",
1388+
"metadata": {},
1389+
"outputs": [],
1390+
"source": [
# Determine the most frequent file extension pairs among the pairwise changed files.
top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)

# Only keep the pairwise changed files with the top file extension pairs.
# The explicit copy detaches the filtered result from the original DataFrame, so that
# columns added to it later on don't trigger pandas' SettingWithCopyWarning.
pairwise_changed_git_files = pairwise_changed_git_files[
    pairwise_changed_git_files['fileExtensionPair'].isin(top_pairwise_changed_file_extensions)
].copy()
1394+
]
1395+
},
1396+
{
1397+
"cell_type": "code",
1398+
"execution_count": null,
1399+
"id": "8c07abbf",
1400+
"metadata": {},
1401+
"outputs": [],
1402+
"source": [
def add_file_extension_rank_column(data_frame: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Adds a 'fileExtensionPair' based rank column to the DataFrame for the value of the specified column.
    The rank column is named '<column_name>ExtensionRank'. Rank 1 is assigned to the highest value
    within each 'fileExtensionPair' group and equal values share the same rank (dense ranking).
    The DataFrame is modified in place and returned for convenience.
    data_frame : pd.DataFrame : The input DataFrame (needs the columns 'fileExtensionPair' and column_name)
    column_name : str : The name of the column to rank
    return : pd.DataFrame : The DataFrame with added rank column
    """
    rank_column_name = f"{column_name}ExtensionRank"

    # The guard needs to check the same column name that is created below.
    # Otherwise it never matches and the rank is recomputed on every call.
    if rank_column_name in data_frame.columns:
        return data_frame # Column already exists

    # Rank the values of the specified column within each file extension pair (highest value = rank 1).
    data_frame[rank_column_name] = data_frame.groupby('fileExtensionPair')[column_name].rank(ascending=False, method='dense').astype(int)
    return data_frame
1416+
]
1417+
},
1418+
{
1419+
"cell_type": "code",
1420+
"execution_count": null,
1421+
"id": "84b01643",
1422+
"metadata": {},
1423+
"outputs": [],
1424+
"source": [
# Add a file extension pair based rank column for each of the co-change metrics.
for metric_column_name in (
    "updateCommitCount",
    "updateCommitMinConfidence",
    "updateCommitJaccardSimilarity",
    "updateCommitLift",
):
    pairwise_changed_git_files = add_file_extension_rank_column(pairwise_changed_git_files, metric_column_name)
1429+
]
1430+
},
1431+
{
1432+
"cell_type": "code",
1433+
"execution_count": null,
1434+
"id": "ad158020",
13801435
"metadata": {},
13811436
"outputs": [],
13821437
"source": [
def display_table_for_top_pairwise_changed_file_extensions(
    data_to_display: pd.DataFrame, 
    metric_column: str,
    top_n: int = 10
    ):
    """
    Returns a table containing the top N ranked pairwise changed files per file extension pair for the specified metric column.
    The rank is read from the column '<metric_column>ExtensionRank', which needs to exist in the given DataFrame.
    data_to_display : pd.DataFrame : DataFrame containing pairwise changed files with their metrics and rank columns
    metric_column : str : The metric column whose rank is used to select and sort the entries
    top_n : int : The highest rank to include for each file extension pair (default is 10)
    return : pd.DataFrame : One row per file extension pair, metric value and rank. The rank column is renamed to 'GroupRank'.
    """
    rank_column = metric_column + "ExtensionRank"
    group_keys = ["fileExtensionPair", metric_column, rank_column]

    # Keep only the entries that are ranked within the top n of their file extension pair.
    is_within_top_n = data_to_display[rank_column] <= top_n
    top_ranked_data = data_to_display[is_within_top_n]

    # Entries with the same metric value share the same rank.
    # To get one row per rank, only the first file pair (short and with relative path) of each group is kept.
    first_file_pair_per_rank = (
        top_ranked_data
        .groupby(group_keys)
        .aggregate(
            filePair=pd.NamedAgg(column="filePair", aggfunc="first"),
            filePairWithRelativePath=pd.NamedAgg(column="filePairWithRelativePath", aggfunc="first"),
        )
        .reset_index()
    )

    # Sort by file extension pair (ascending), then metric value (descending), then rank (ascending).
    sorted_data = first_file_pair_per_rank.sort_values(by=group_keys, ascending=[True, False, True])
    sorted_data = sorted_data.reset_index(drop=True)
    return sorted_data.rename(columns={rank_column: "GroupRank"})
1461+
]
1462+
},
1463+
{
1464+
"cell_type": "code",
1465+
"execution_count": null,
1466+
"id": "3c34ceea",
1467+
"metadata": {},
1468+
"outputs": [],
1469+
"source": [
1470+
"# TODO delete if not needed anymore\n",
1471+
"\n",
1472+
"def display_table_for_top_pairwise_changed_file_extensions_deprecated(\n",
1473+
" data_to_display: pd.DataFrame, \n",
13851474
" top_pairwise_changed_file_extensions: pd.Series,\n",
13861475
" sort_column: str,\n",
13871476
" top_n: int = 10\n",
@@ -1408,10 +1497,10 @@
14081497
" \n",
14091498
" combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n",
14101499
" \n",
1411-
" for index, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n",
1500+
" for _, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n",
14121501
" filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n",
14131502
" sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n",
1414-
" combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data[selected_columns]], ignore_index=True)\n",
1503+
" combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n",
14151504
" \n",
14161505
" display(combined_data_for_top_extensions)"
14171506
]
@@ -1507,14 +1596,13 @@
15071596
{
15081597
"cell_type": "code",
15091598
"execution_count": null,
1510-
"id": "67e2a3c4",
1599+
"id": "1cd03b3f",
15111600
"metadata": {},
15121601
"outputs": [],
15131602
"source": [
15141603
"display_table_for_top_pairwise_changed_file_extensions(\n",
15151604
" pairwise_changed_git_files,\n",
1516-
" top_pairwise_changed_file_extensions,\n",
1517-
" sort_column=\"updateCommitCount\"\n",
1605+
" \"updateCommitCount\",\n",
15181606
")"
15191607
]
15201608
},
@@ -1534,6 +1622,46 @@
15341622
")"
15351623
]
15361624
},
1625+
{
1626+
"cell_type": "markdown",
1627+
"id": "55be3351",
1628+
"metadata": {},
1629+
"source": [
1630+
"### Files changed together by commit min confidence\n",
1631+
"\n",
1632+
"The commit min confidence is the number of commits in which both files were changed divided by the number of commits of the file with the fewest commits.\n",
1633+
"This metric is useful to identify pairs of files that are frequently changed together and is not biased by single files that are changed very often."
1634+
]
1635+
},
1636+
{
1637+
"cell_type": "code",
1638+
"execution_count": null,
1639+
"id": "a1c9df18",
1640+
"metadata": {},
1641+
"outputs": [],
1642+
"source": [
1643+
"display_table_for_top_pairwise_changed_file_extensions(\n",
1644+
" pairwise_changed_git_files,\n",
1645+
" \"updateCommitMinConfidence\"\n",
1646+
")"
1647+
]
1648+
},
1649+
{
1650+
"cell_type": "code",
1651+
"execution_count": null,
1652+
"id": "7a54edcd",
1653+
"metadata": {},
1654+
"outputs": [],
1655+
"source": [
1656+
"plot_histogram_of_pairwise_changed_files(\n",
1657+
" data_to_plot = pairwise_changed_git_files,\n",
1658+
" top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n",
1659+
" x_axis_column = \"updateCommitMinConfidence\",\n",
1660+
" x_axis_label = \"Commit Min Confidence\",\n",
1661+
" output_file_name = \"CoChangedFilesByCommitMinConfidence\"\n",
1662+
")"
1663+
]
1664+
},
15371665
{
15381666
"cell_type": "markdown",
15391667
"id": "132fd688",
@@ -1551,8 +1679,7 @@
15511679
"source": [
15521680
"display_table_for_top_pairwise_changed_file_extensions(\n",
15531681
" pairwise_changed_git_files,\n",
1554-
" top_pairwise_changed_file_extensions,\n",
1555-
" sort_column=\"updateCommitLift\"\n",
1682+
" \"updateCommitLift\"\n",
15561683
")"
15571684
]
15581685
},
@@ -1589,8 +1716,7 @@
15891716
"source": [
15901717
"display_table_for_top_pairwise_changed_file_extensions(\n",
15911718
" pairwise_changed_git_files,\n",
1592-
" top_pairwise_changed_file_extensions,\n",
1593-
" sort_column=\"updateCommitJaccardSimilarity\"\n",
1719+
" \"updateCommitJaccardSimilarity\"\n",
15941720
")"
15951721
]
15961722
},
@@ -1610,6 +1736,53 @@
16101736
")"
16111737
]
16121738
},
1739+
{
1740+
"cell_type": "markdown",
1741+
"id": "727772c7",
1742+
"metadata": {},
1743+
"source": [
1744+
"### Find pairwise changed files with highly ranked metrics\n",
1745+
"\n",
1746+
"Find those pairwise changed files that rank high across all metrics by summing up their ranks within their file extension pair into a combined score (lower is better).\n",
1747+
"This is useful to identify pairs of files that score high in most metrics, which indicates a strong co-change relationship."
1748+
]
1749+
},
1750+
{
1751+
"cell_type": "code",
1752+
"execution_count": null,
1753+
"id": "9de55c0b",
1754+
"metadata": {},
1755+
"outputs": [],
1756+
"source": [
# Combine the ranks of all metrics into one score per file pair by summing them up.
# Since rank 1 is the best rank, a lower combined score means a higher overall ranking.
commit_count_rank = pairwise_changed_git_files['updateCommitCountExtensionRank']
min_confidence_rank = pairwise_changed_git_files['updateCommitMinConfidenceExtensionRank']
jaccard_similarity_rank = pairwise_changed_git_files['updateCommitJaccardSimilarityExtensionRank']
lift_rank = pairwise_changed_git_files['updateCommitLiftExtensionRank']
pairwise_changed_git_files['combinedMetricsScore'] = commit_count_rank + min_confidence_rank + jaccard_similarity_rank + lift_rank

columns_to_show = [
    "fileExtensionPair", 
    "filePair", 
    "combinedMetricsScore",
    "updateCommitCountExtensionRank",
    "updateCommitMinConfidenceExtensionRank",
    "updateCommitJaccardSimilarityExtensionRank",
    "updateCommitLiftExtensionRank",
    "updateCommitCount",
    "updateCommitMinConfidence",
    "updateCommitJaccardSimilarity",
    "updateCommitLift",
    "filePairWithRelativePath",
]

# Select the 10 file pairs with the lowest combined score for each file extension pair.
pairwise_changed_git_files_top_10_ranks = (
    pairwise_changed_git_files
    .sort_values(by=["fileExtensionPair", "combinedMetricsScore"], ascending=[True, True])
    .groupby("fileExtensionPair")
    .head(10)
    .reset_index(drop=True)
    [columns_to_show]
)
display(pairwise_changed_git_files_top_10_ranks)
1784+
]
1785+
},
16131786
{
16141787
"cell_type": "markdown",
16151788
"id": "14e87aff",

scripts/reports/GitHistoryCsv.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_per_commit_distribution.cyp
5151
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_with_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_with_dependencies.csv"
5252

5353
# List pairwise changed files with various metrics
54-
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files.csv"
54+
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_count.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_count.csv"
55+
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_lift.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_lift.csv"
56+
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_jaccard.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_jaccard.csv"
5557

5658
# Clean-up after report generation. Empty reports will be deleted.
5759
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"

0 commit comments

Comments
 (0)