Skip to content

Commit fe1f398

Browse files
committed
Add reports for frequently changed files
1 parent 32f6f7b commit fe1f398

File tree

6 files changed

+173
-4
lines changed

6 files changed

+173
-4
lines changed

cypher/GitLog/List_pairwise_changed_files.cypher

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ RETURN firstFileName
1414
,firstFile.extension AS firstFileExtension
1515
,secondFile.extension AS secondFileExtension
1616
,firstFile.extension + '↔' + secondFile.extension AS fileExtensionPair
17-
,toInteger(pairwiseChange.updateCommitCount) AS updateCommitCount
17+
,pairwiseChange.updateCommitCount AS updateCommitCount
1818
,pairwiseChange.updateCommitMinConfidence AS updateCommitMinConfidence
1919
,pairwiseChange.updateCommitSupport AS updateCommitSupport
2020
,pairwiseChange.updateCommitLift AS updateCommitLift
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
// Top 50 pairs of files that were changed together, ordered by their commit count (updateCommitCount), highest first.
// Prerequisite: Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher needs to be executed first.

 MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
 // The undirected pattern matches every pair in both directions. Keep only one of them.
 WHERE elementId(firstFile) < elementId(secondFile)
RETURN firstFile.name                                                 AS firstFileNameShort
      ,secondFile.name                                                AS secondFileNameShort
       // Prefer the relative path and fall back to the file name if there is none.
      ,coalesce(firstFile.relativePath,  firstFile.fileName)          AS firstFileName
      ,coalesce(secondFile.relativePath, secondFile.fileName)         AS secondFileName
      ,pairwiseChange.updateCommitCount                               AS updateCommitCount
      ,pairwiseChange.updateCommitMinConfidence                       AS updateCommitMinConfidence
      ,pairwiseChange.updateCommitSupport                             AS updateCommitSupport
      ,pairwiseChange.updateCommitLift                                AS updateCommitLift
      ,pairwiseChange.updateCommitJaccardSimilarity                   AS updateCommitJaccardSimilarity
 ORDER BY updateCommitCount DESC
 LIMIT 50
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
// Top 50 pairs of files that were changed together, ordered by their commit Jaccard similarity, highest first.
// Jaccard similarity: 0 = never change together, 1 = always change together.
// Prerequisite: Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher needs to be executed first.

 MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
 // The undirected pattern matches every pair in both directions. Keep only one of them.
 WHERE elementId(firstFile) < elementId(secondFile)
RETURN firstFile.name                                                 AS firstFileNameShort
      ,secondFile.name                                                AS secondFileNameShort
       // Prefer the relative path and fall back to the file name if there is none.
      ,coalesce(firstFile.relativePath,  firstFile.fileName)          AS firstFileName
      ,coalesce(secondFile.relativePath, secondFile.fileName)         AS secondFileName
      ,pairwiseChange.updateCommitJaccardSimilarity                   AS updateCommitJaccardSimilarity
      ,pairwiseChange.updateCommitCount                               AS updateCommitCount
      ,pairwiseChange.updateCommitLift                                AS updateCommitLift
      ,pairwiseChange.updateCommitMinConfidence                       AS updateCommitMinConfidence
      ,pairwiseChange.updateCommitSupport                             AS updateCommitSupport
 ORDER BY updateCommitJaccardSimilarity DESC
 LIMIT 50
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
// Top 50 pairs of files that were changed together, ordered by their commit lift, highest first.
// Lift > 1: the files change together more often than expected by random chance.
// Prerequisite: Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher needs to be executed first.

 MATCH (firstFile:Git:File)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile:Git:File)
 // The undirected pattern matches every pair in both directions. Keep only one of them.
 WHERE elementId(firstFile) < elementId(secondFile)
RETURN firstFile.name                                                 AS firstFileNameShort
      ,secondFile.name                                                AS secondFileNameShort
       // Prefer the relative path and fall back to the file name if there is none.
      ,coalesce(firstFile.relativePath,  firstFile.fileName)          AS firstFileName
      ,coalesce(secondFile.relativePath, secondFile.fileName)         AS secondFileName
      ,pairwiseChange.updateCommitLift                                AS updateCommitLift
      ,pairwiseChange.updateCommitCount                               AS updateCommitCount
      ,pairwiseChange.updateCommitMinConfidence                       AS updateCommitMinConfidence
      ,pairwiseChange.updateCommitSupport                             AS updateCommitSupport
      ,pairwiseChange.updateCommitJaccardSimilarity                   AS updateCommitJaccardSimilarity
 ORDER BY updateCommitLift DESC
 LIMIT 50

jupyter/GitHistoryGeneral.ipynb

Lines changed: 115 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1408,10 +1408,10 @@
14081408
" \n",
14091409
" combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n",
14101410
" \n",
1411-
" for index, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n",
1411+
" for _, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n",
14121412
" filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n",
14131413
" sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n",
1414-
" combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data[selected_columns]], ignore_index=True)\n",
1414+
" combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n",
14151415
" \n",
14161416
" display(combined_data_for_top_extensions)"
14171417
]
@@ -1534,6 +1534,47 @@
15341534
")"
15351535
]
15361536
},
1537+
{
1538+
"cell_type": "markdown",
1539+
"id": "55be3351",
1540+
"metadata": {},
1541+
"source": [
1542+
"### Files changed together by commit min confidence\n",
1543+
"\n",
1544+
"The commit min confidence is the number of commits in which both files were changed, divided by the number of commits of the file with the fewest commits.\n",
1545+
"This metric is useful to identify pairs of files that are frequently changed together and is not biased by single files that are changed very often."
1546+
]
1547+
},
1548+
{
1549+
"cell_type": "code",
1550+
"execution_count": null,
1551+
"id": "a1c9df18",
1552+
"metadata": {},
1553+
"outputs": [],
1554+
"source": [
1555+
"display_table_for_top_pairwise_changed_file_extensions(\n",
1556+
" pairwise_changed_git_files,\n",
1557+
" top_pairwise_changed_file_extensions,\n",
1558+
" sort_column=\"updateCommitMinConfidence\"\n",
1559+
")"
1560+
]
1561+
},
1562+
{
1563+
"cell_type": "code",
1564+
"execution_count": null,
1565+
"id": "7a54edcd",
1566+
"metadata": {},
1567+
"outputs": [],
1568+
"source": [
1569+
"plot_histogram_of_pairwise_changed_files(\n",
1570+
" data_to_plot = pairwise_changed_git_files,\n",
1571+
" top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n",
1572+
" x_axis_column = \"updateCommitMinConfidence\",\n",
1573+
" x_axis_label = \"Commit Min Confidence\",\n",
1574+
" output_file_name = \"CoChangedFilesByCommitMinConfidence\"\n",
1575+
")"
1576+
]
1577+
},
15371578
{
15381579
"cell_type": "markdown",
15391580
"id": "132fd688",
@@ -1610,6 +1651,78 @@
16101651
")"
16111652
]
16121653
},
1654+
{
1655+
"cell_type": "markdown",
1656+
"id": "727772c7",
1657+
"metadata": {},
1658+
"source": [
1659+
"### Find pairwise changed files with highly ranked metrics\n",
1660+
"\n",
1661+
"Find those pairwise changed files that are ranked among the top 10 of their file extension pair in every one of the metrics.\n",
1662+
"If a pair of files has a high rank in multiple metrics, this is a strong indicator that these files are frequently changed together."
1663+
]
1664+
},
1665+
{
1666+
"cell_type": "code",
1667+
"execution_count": null,
1668+
"id": "2d9fbad8",
1669+
"metadata": {},
1670+
"outputs": [],
1671+
"source": [
def add_grouped_rank_column(data_frame: pd.DataFrame, column_name: str, group_column_name: str, ascending: bool = False) -> pd.DataFrame:
    """
    Adds a dense rank column for the specified column, ranked separately within each group of the specified group column.

    The new column is named "<column_name>_<group_column_name>Rank" and is added to the given DataFrame in place.
    If that column already exists, the DataFrame is returned unchanged, so the function can safely be called again
    (e.g. when a notebook cell is re-run).

    data_frame : pd.DataFrame : The input DataFrame (gets modified in place)
    column_name : str : The name of the column to rank
    group_column_name : str : The name of the column to group by
    ascending : bool : Whether to rank in ascending order (default: False, the highest value gets rank 1)
    return : pd.DataFrame : The same DataFrame with the added rank column
    raises : ValueError : If the group column does not exist in the DataFrame
    """
    # The existence check needs to use exactly the name of the column that gets created below.
    rank_column_name = f"{column_name}_{group_column_name}Rank"
    if rank_column_name in data_frame.columns:
        return data_frame  # Column already exists
    if group_column_name not in data_frame.columns:
        raise ValueError(f"Group column '{group_column_name}' does not exist in the DataFrame.")

    # method='dense': equal values share the same rank and the next distinct value gets the next rank without gaps.
    data_frame[rank_column_name] = (
        data_frame
        .groupby(group_column_name)[column_name]
        .rank(ascending=ascending, method='dense')
        .astype(int)
    )
    return data_frame
1689+
]
1690+
},
1691+
{
1692+
"cell_type": "code",
1693+
"execution_count": null,
1694+
"id": "9de55c0b",
1695+
"metadata": {},
1696+
"outputs": [],
1697+
"source": [
1698+
"pairwise_changed_git_files = add_grouped_rank_column(pairwise_changed_git_files, \"updateCommitCount\", \"fileExtensionPair\")\n",
1699+
"pairwise_changed_git_files = add_grouped_rank_column(pairwise_changed_git_files, \"updateCommitMinConfidence\", \"fileExtensionPair\")\n",
1700+
"pairwise_changed_git_files = add_grouped_rank_column(pairwise_changed_git_files, \"updateCommitJaccardSimilarity\", \"fileExtensionPair\")\n",
1701+
"pairwise_changed_git_files = add_grouped_rank_column(pairwise_changed_git_files, \"updateCommitLift\", \"fileExtensionPair\")\n",
1702+
"\n",
1703+
"# Display all entries with updateCommitCount_fileExtensionPairRank <= 10 and updateCommitMinConfidence_fileExtensionPairRank <= 10\n",
1704+
"# and updateCommitJaccardSimilarity_fileExtensionPairRank <= 10 and updateCommitLift_fileExtensionPairRank <= 10\n",
1705+
"pairwise_changed_git_files_top_10_ranks = pairwise_changed_git_files[(\n",
1706+
" (pairwise_changed_git_files['updateCommitCount_fileExtensionPairRank'] <= 10) &\n",
1707+
" (pairwise_changed_git_files['updateCommitMinConfidence_fileExtensionPairRank'] <= 10) &\n",
1708+
" (pairwise_changed_git_files['updateCommitJaccardSimilarity_fileExtensionPairRank'] <= 10) &\n",
1709+
" (pairwise_changed_git_files['updateCommitLift_fileExtensionPairRank'] <= 10)\n",
1710+
")][[\n",
1711+
" \"fileExtensionPair\", \n",
1712+
" \"filePair\", \n",
1713+
" \"updateCommitCount_fileExtensionPairRank\",\n",
1714+
" \"updateCommitMinConfidence_fileExtensionPairRank\",\n",
1715+
" \"updateCommitJaccardSimilarity_fileExtensionPairRank\",\n",
1716+
" \"updateCommitLift_fileExtensionPairRank\",\n",
1717+
" \"updateCommitCount\",\n",
1718+
" \"updateCommitMinConfidence\",\n",
1719+
" \"updateCommitJaccardSimilarity\",\n",
1720+
" \"updateCommitLift\",\n",
1721+
" \"filePairWithRelativePath\",\n",
1722+
"]].sort_values(by=\"fileExtensionPair\", ascending=True).reset_index(drop=True)\n",
1723+
"display(pairwise_changed_git_files_top_10_ranks.head(50))\n"
1724+
]
1725+
},
16131726
{
16141727
"cell_type": "markdown",
16151728
"id": "14e87aff",

scripts/reports/GitHistoryCsv.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ execute_cypher "${GIT_LOG_CYPHER_DIR}/List_git_files_per_commit_distribution.cyp
5151
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_with_dependencies.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_with_dependencies.csv"
5252

5353
# List pairwise changed files with various metrics
54-
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files.csv"
54+
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_count.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_count.csv"
55+
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_lift.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_lift.csv"
56+
execute_cypher "${GIT_LOG_CYPHER_DIR}/List_pairwise_changed_files_top_jaccard.cypher" > "${FULL_REPORT_DIRECTORY}/List_pairwise_changed_files_top_jaccard.csv"
5557

5658
# Clean-up after report generation. Empty reports will be deleted.
5759
source "${SCRIPTS_DIR}/cleanupAfterReportGeneration.sh" "${FULL_REPORT_DIRECTORY}"

0 commit comments

Comments
 (0)