|
1408 | 1408 | " \n", |
1409 | 1409 | " combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n", |
1410 | 1410 | " \n", |
1411 | | - " for index, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n", |
| 1411 | + " for _, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n", |
1412 | 1412 | " filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n", |
1413 | 1413 | " sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n", |
1414 | | - " combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data[selected_columns]], ignore_index=True)\n", |
| 1414 | + " combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n", |
1415 | 1415 | " \n", |
1416 | 1416 | " display(combined_data_for_top_extensions)" |
1417 | 1417 | ] |
|
1534 | 1534 | ")" |
1535 | 1535 | ] |
1536 | 1536 | }, |
| 1537 | + { |
| 1538 | + "cell_type": "markdown", |
| 1539 | + "id": "55be3351", |
| 1540 | + "metadata": {}, |
| 1541 | + "source": [ |
| 1542 | + "### Files changed together by commit min confidence\n", |
| 1543 | + "\n", |
| 1544 | + "The commit min confidence is the commit count where both files were changed divided by the commit count of the file with the least commits.\n", |
| 1545 | + "This metric is useful to identify pairs of files that are frequently changed together and is not biased by single files that are changed very often." |
| 1546 | + ] |
| 1547 | + }, |
| 1548 | + { |
| 1549 | + "cell_type": "code", |
| 1550 | + "execution_count": null, |
| 1551 | + "id": "a1c9df18", |
| 1552 | + "metadata": {}, |
| 1553 | + "outputs": [], |
| 1554 | + "source": [ |
| 1555 | + "display_table_for_top_pairwise_changed_file_extensions(  # per file extension pair: the top rows sorted by commit min confidence, highest first\n", |
| 1556 | + " pairwise_changed_git_files,\n", |
| 1557 | + " top_pairwise_changed_file_extensions,\n", |
| 1558 | + " sort_column=\"updateCommitMinConfidence\"\n", |
| 1559 | + ")" |
| 1560 | + ] |
| 1561 | + }, |
| 1562 | + { |
| 1563 | + "cell_type": "code", |
| 1564 | + "execution_count": null, |
| 1565 | + "id": "7a54edcd", |
| 1566 | + "metadata": {}, |
| 1567 | + "outputs": [], |
| 1568 | + "source": [ |
| 1569 | + "plot_histogram_of_pairwise_changed_files(  # NOTE(review): helper is defined outside this excerpt; presumably one histogram per top file extension pair - confirm\n", |
| 1570 | + " data_to_plot = pairwise_changed_git_files,\n", |
| 1571 | + " top_pairwise_changed_file_extensions = top_pairwise_changed_file_extensions,\n", |
| 1572 | + " x_axis_column = \"updateCommitMinConfidence\",\n", |
| 1573 | + " x_axis_label = \"Commit Min Confidence\",\n", |
| 1574 | + " output_file_name = \"CoChangedFilesByCommitMinConfidence\"\n", |
| 1575 | + ")" |
| 1576 | + ] |
| 1577 | + }, |
1537 | 1578 | { |
1538 | 1579 | "cell_type": "markdown", |
1539 | 1580 | "id": "132fd688", |
|
1610 | 1651 | ")" |
1611 | 1652 | ] |
1612 | 1653 | }, |
| 1654 | + { |
| 1655 | + "cell_type": "markdown", |
| 1656 | + "id": "727772c7", |
| 1657 | + "metadata": {}, |
| 1658 | + "source": [ |
| 1659 | + "### Find pairwise changed files with highly ranked metrics\n", |
| 1660 | + "\n", |
| 1661 | + "Find those pairwise changed files that rank among the top 10 of their file extension pair in all of the metrics.\n", |
| 1662 | + "A pair of files that has a high rank in every metric is a strong indicator that these files are frequently changed together." |
| 1663 | + ] |
| 1664 | + }, |
| 1665 | + { |
| 1666 | + "cell_type": "code", |
| 1667 | + "execution_count": null, |
| 1668 | + "id": "2d9fbad8", |
| 1669 | + "metadata": {}, |
| 1670 | + "outputs": [], |
| 1671 | + "source": [ |
| 1672 | + "def add_grouped_rank_column(data_frame: pd.DataFrame, column_name: str, group_column_name: str):\n", |
| 1673 | + " \"\"\"\n", |
| 1674 | + " Adds a rank column to the DataFrame based on the specified column and groups by the specified group column.\n", |
| 1675 | + " data_frame : pd.DataFrame : The input DataFrame\n", |
| 1676 | + " column_name : str : The name of the column to rank\n", |
| 1677 | + " group_column_name : str : The name of the column to group by\n", |
| 1678 | + " ascending : bool : Whether to rank in ascending order (default: True)\n", |
| 1679 | + " return : pd.DataFrame : The DataFrame with added rank column\n", |
| 1680 | + " \"\"\"\n", |
| 1681 | + " if column_name + '_rank' in data_frame.columns:\n", |
| 1682 | + " return data_frame # Column already exists\n", |
| 1683 | + " if group_column_name not in data_frame.columns:\n", |
| 1684 | + " raise ValueError(f\"Group column '{group_column_name}' does not exist in the DataFrame.\")\n", |
| 1685 | + " \n", |
| 1686 | + " # Create a new rank column based on the specified column and group by the group column\n", |
| 1687 | + " data_frame[f\"{column_name}_{group_column_name}Rank\"] = data_frame.groupby(group_column_name)[column_name].rank(ascending=False, method='dense').astype(int)\n", |
| 1688 | + " return data_frame" |
| 1689 | + ] |
| 1690 | + }, |
| 1691 | + { |
| 1692 | + "cell_type": "code", |
| 1693 | + "execution_count": null, |
| 1694 | + "id": "9de55c0b", |
| 1695 | + "metadata": {}, |
| 1696 | + "outputs": [], |
| 1697 | + "source": [ |
| 1698 | + "pairwise_changed_git_files = add_grouped_rank_column(pairwise_changed_git_files, \"updateCommitCount\", \"fileExtensionPair\")\n", |
| 1699 | + "pairwise_changed_git_files = add_grouped_rank_column(pairwise_changed_git_files, \"updateCommitMinConfidence\", \"fileExtensionPair\")\n", |
| 1700 | + "pairwise_changed_git_files = add_grouped_rank_column(pairwise_changed_git_files, \"updateCommitJaccardSimilarity\", \"fileExtensionPair\")\n", |
| 1701 | + "pairwise_changed_git_files = add_grouped_rank_column(pairwise_changed_git_files, \"updateCommitLift\", \"fileExtensionPair\")\n", |
| 1702 | + "\n", |
| 1703 | + "# Display all entries with updateCommitCount_fileExtensionPairRank <= 10 and updateCommitMinConfidence_fileExtensionPairRank <= 10\n", |
| 1704 | + "# and updateCommitJaccardSimilarity_fileExtensionPairRank <= 10 and updateCommitLift_fileExtensionPairRank <= 10\n", |
| 1705 | + "pairwise_changed_git_files_top_10_ranks = pairwise_changed_git_files[(\n", |
| 1706 | + " (pairwise_changed_git_files['updateCommitCount_fileExtensionPairRank'] <= 10) &\n", |
| 1707 | + " (pairwise_changed_git_files['updateCommitMinConfidence_fileExtensionPairRank'] <= 10) &\n", |
| 1708 | + " (pairwise_changed_git_files['updateCommitJaccardSimilarity_fileExtensionPairRank'] <= 10) &\n", |
| 1709 | + " (pairwise_changed_git_files['updateCommitLift_fileExtensionPairRank'] <= 10)\n", |
| 1710 | + ")][[\n", |
| 1711 | + " \"fileExtensionPair\", \n", |
| 1712 | + " \"filePair\", \n", |
| 1713 | + " \"updateCommitCount_fileExtensionPairRank\",\n", |
| 1714 | + " \"updateCommitMinConfidence_fileExtensionPairRank\",\n", |
| 1715 | + " \"updateCommitJaccardSimilarity_fileExtensionPairRank\",\n", |
| 1716 | + " \"updateCommitLift_fileExtensionPairRank\",\n", |
| 1717 | + " \"updateCommitCount\",\n", |
| 1718 | + " \"updateCommitMinConfidence\",\n", |
| 1719 | + " \"updateCommitJaccardSimilarity\",\n", |
| 1720 | + " \"updateCommitLift\",\n", |
| 1721 | + " \"filePairWithRelativePath\",\n", |
| 1722 | + "]].sort_values(by=\"fileExtensionPair\", ascending=True).reset_index(drop=True)\n", |
| 1723 | + "display(pairwise_changed_git_files_top_10_ranks.head(50))\n" |
| 1724 | + ] |
| 1725 | + }, |
1613 | 1726 | { |
1614 | 1727 | "cell_type": "markdown", |
1615 | 1728 | "id": "14e87aff", |
|
0 commit comments