|
1343 | 1343 | "The following charts show how these metrics are distributed across pairs of files that were changed together." |
1344 | 1344 | ] |
1345 | 1345 | }, |
| 1346 | + { |
| 1347 | + "cell_type": "code", |
| 1348 | + "execution_count": null, |
| 1349 | + "id": "e7721dfd", |
| 1350 | + "metadata": {}, |
| 1351 | + "outputs": [], |
| 1352 | + "source": [ |
| 1353 | + "# Initial steps: Function Declaration and Data Preparation" |
| 1354 | + ] |
| 1355 | + }, |
1346 | 1356 | { |
1347 | 1357 | "cell_type": "code", |
1348 | 1358 | "execution_count": null, |
|
1368 | 1378 | " return : pd.Series : Series with top N file extension pairs sorted by their pair count descending\n", |
1369 | 1379 | " \"\"\"\n", |
1370 | 1380 | " top_extensions = input_data.groupby('fileExtensionPair').aggregate({'filePairWithRelativePath': 'count'}).reset_index()\n", |
1371 | | - " return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']\n", |
1372 | | - "\n", |
1373 | | - "top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)" |
| 1381 | + " return top_extensions.sort_values(by='filePairWithRelativePath', ascending=False).reset_index(drop=True).head(top_n)['fileExtensionPair']" |
1374 | 1382 | ] |
1375 | 1383 | }, |
1376 | 1384 | { |
1377 | 1385 | "cell_type": "code", |
1378 | 1386 | "execution_count": null, |
1379 | | - "id": "3c34ceea", |
| 1387 | + "id": "7e228e63", |
| 1388 | + "metadata": {}, |
| 1389 | + "outputs": [], |
| 1390 | + "source": [ |
# Determine the file extension pairs with the highest pair count.
top_pairwise_changed_file_extensions = find_top_pairwise_changed_file_extensions(pairwise_changed_git_files, top_n=4)
# Only keep the pairwise change files with the top file extensions.
# The explicit copy turns the filtered result into an independent DataFrame, so that columns
# assigned to it in later cells do not trigger pandas' SettingWithCopyWarning.
pairwise_changed_git_files = pairwise_changed_git_files[
    pairwise_changed_git_files['fileExtensionPair'].isin(top_pairwise_changed_file_extensions)
].copy()
| 1394 | + ] |
| 1395 | + }, |
| 1396 | + { |
| 1397 | + "cell_type": "code", |
| 1398 | + "execution_count": null, |
| 1399 | + "id": "8c07abbf", |
| 1400 | + "metadata": {}, |
| 1401 | + "outputs": [], |
| 1402 | + "source": [ |
def add_file_extension_rank_column(data_frame: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Adds a 'fileExtensionPair' based rank column to the DataFrame for the value of the specified column.

    The new column is named '<column_name>ExtensionRank'. Within each 'fileExtensionPair' group,
    rank 1 is assigned to the highest value and equal values share the same rank (dense ranking).
    The column is added to the given DataFrame itself. If the rank column already exists,
    the DataFrame is returned untouched, so calling this function repeatedly is a no-op.

    data_frame : pd.DataFrame : The input DataFrame containing 'fileExtensionPair' and the column to rank
    column_name : str : The name of the column to rank
    return : pd.DataFrame : The DataFrame with added rank column
    """
    rank_column_name = f"{column_name}ExtensionRank"

    # The guard has to use exactly the name of the column created below, otherwise it never matches.
    if rank_column_name in data_frame.columns:
        return data_frame  # Column already exists

    # Rank the values within each file extension pair. Highest value gets rank 1, ties share a rank.
    data_frame[rank_column_name] = (
        data_frame.groupby('fileExtensionPair')[column_name]
        .rank(ascending=False, method='dense')
        .astype(int)
    )
    return data_frame
| 1416 | + ] |
| 1417 | + }, |
| 1418 | + { |
| 1419 | + "cell_type": "code", |
| 1420 | + "execution_count": null, |
| 1421 | + "id": "84b01643", |
| 1422 | + "metadata": {}, |
| 1423 | + "outputs": [], |
| 1424 | + "source": [ |
# Add one '<metric>ExtensionRank' column for every co-change metric.
for metric_column_name in [
    "updateCommitCount",
    "updateCommitMinConfidence",
    "updateCommitJaccardSimilarity",
    "updateCommitLift",
]:
    pairwise_changed_git_files = add_file_extension_rank_column(pairwise_changed_git_files, metric_column_name)
| 1429 | + ] |
| 1430 | + }, |
| 1431 | + { |
| 1432 | + "cell_type": "code", |
| 1433 | + "execution_count": null, |
| 1434 | + "id": "ad158020", |
1380 | 1435 | "metadata": {}, |
1381 | 1436 | "outputs": [], |
1382 | 1437 | "source": [ |
def display_table_for_top_pairwise_changed_file_extensions(
        data_to_display: pd.DataFrame,
        metric_column: str,
        top_n: int = 10
    ) -> pd.DataFrame:
    """
    Builds a table containing the top N ranked pairwise changed files for each file extension pair based on the specified metric column.

    data_to_display : pd.DataFrame : DataFrame containing pairwise changed files including the columns 'fileExtensionPair', 'filePair', 'filePairWithRelativePath', the metric column and its '<metric_column>ExtensionRank' column
    metric_column : str : The column to sort the data by
    top_n : int : The highest rank to include for each file extension pair (default is 10)
    return : pd.DataFrame : One row per file extension pair and metric value, sorted by file extension pair (ascending) and metric (descending), with the rank column renamed to 'GroupRank'
    raises : KeyError : If the '<metric_column>ExtensionRank' column is missing
    """
    rank_column = metric_column + "ExtensionRank"
    if rank_column not in data_to_display.columns:
        # Fail with an actionable message instead of an opaque pandas lookup error.
        raise KeyError(f"Missing rank column '{rank_column}'. Add it with add_file_extension_rank_column first.")

    filtered_data = data_to_display[data_to_display[rank_column] <= top_n]

    # Group by the file extensions and the metric and its rank.
    # Since some entries might have the same metric value, we aggregate by the first file pair with relative path and the first file pair.
    # This way we can pick the top n entries for each file extension pair.
    grouping_columns = ["fileExtensionPair", metric_column, rank_column]
    grouped_data = filtered_data.groupby(grouping_columns).aggregate(
        filePair=pd.NamedAgg(column="filePair", aggfunc="first"),
        filePairWithRelativePath=pd.NamedAgg(column="filePairWithRelativePath", aggfunc="first"),
    ).reset_index()

    return (
        grouped_data
        .sort_values(by=grouping_columns, ascending=[True, False, True])
        .reset_index(drop=True)
        .rename(columns={rank_column: "GroupRank"})
    )
| 1461 | + ] |
| 1462 | + }, |
| 1463 | + { |
| 1464 | + "cell_type": "code", |
| 1465 | + "execution_count": null, |
| 1466 | + "id": "3c34ceea", |
| 1467 | + "metadata": {}, |
| 1468 | + "outputs": [], |
| 1469 | + "source": [ |
| 1470 | + "# TODO delete if not needed anymore\n", |
| 1471 | + "\n", |
| 1472 | + "def display_table_for_top_pairwise_changed_file_extensions_deprecated(\n", |
| 1473 | + " data_to_display: pd.DataFrame, \n", |
1385 | 1474 | " top_pairwise_changed_file_extensions: pd.Series,\n", |
1386 | 1475 | " sort_column: str,\n", |
1387 | 1476 | " top_n: int = 10\n", |
|
1408 | 1497 | " \n", |
1409 | 1498 | " combined_data_for_top_extensions = pd.DataFrame().reindex_like(data_to_display.head(0)) # Create an empty DataFrame with the same columns as data_to_display\n", |
1410 | 1499 | " \n", |
1411 | | - " for index, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n", |
| 1500 | + " for _, extension in enumerate(top_pairwise_changed_file_extensions, start=1):\n", |
1412 | 1501 | " filtered_data = data_to_display[data_to_display[\"fileExtensionPair\"] == extension]\n", |
1413 | 1502 | " sorted_data = filtered_data.sort_values(by=sort_column, ascending=False).head(top_n).reset_index()\n", |
1414 | | - " combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data[selected_columns]], ignore_index=True)\n", |
| 1503 | + " combined_data_for_top_extensions = pd.concat([combined_data_for_top_extensions, sorted_data], ignore_index=True)\n", |
1415 | 1504 | " \n", |
1416 | 1505 | " display(combined_data_for_top_extensions)" |
1417 | 1506 | ] |
|
1507 | 1596 | { |
1508 | 1597 | "cell_type": "code", |
1509 | 1598 | "execution_count": null, |
1510 | | - "id": "67e2a3c4", |
| 1599 | + "id": "1cd03b3f", |
1511 | 1600 | "metadata": {}, |
1512 | 1601 | "outputs": [], |
1513 | 1602 | "source": [ |
# Top pairwise changed files per file extension pair, ranked by 'updateCommitCount'.
display_table_for_top_pairwise_changed_file_extensions(
    data_to_display=pairwise_changed_git_files,
    metric_column="updateCommitCount",
)
1519 | 1607 | ] |
1520 | 1608 | }, |
|
1534 | 1622 | ")" |
1535 | 1623 | ] |
1536 | 1624 | }, |
| 1625 | + { |
| 1626 | + "cell_type": "markdown", |
| 1627 | + "id": "55be3351", |
| 1628 | + "metadata": {}, |
| 1629 | + "source": [ |
| 1630 | + "### Files changed together by commit min confidence\n", |
| 1631 | + "\n", |
| 1632 | + "The commit min confidence is the number of commits in which both files were changed divided by the commit count of the file with the fewest commits.\n", |
| 1633 | + "This metric is useful to identify pairs of files that are frequently changed together and is not biased by single files that are changed very often." |
| 1634 | + ] |
| 1635 | + }, |
| 1636 | + { |
| 1637 | + "cell_type": "code", |
| 1638 | + "execution_count": null, |
| 1639 | + "id": "a1c9df18", |
| 1640 | + "metadata": {}, |
| 1641 | + "outputs": [], |
| 1642 | + "source": [ |
# Top pairwise changed files per file extension pair, ranked by 'updateCommitMinConfidence'.
display_table_for_top_pairwise_changed_file_extensions(
    data_to_display=pairwise_changed_git_files,
    metric_column="updateCommitMinConfidence",
)
| 1647 | + ] |
| 1648 | + }, |
| 1649 | + { |
| 1650 | + "cell_type": "code", |
| 1651 | + "execution_count": null, |
| 1652 | + "id": "7a54edcd", |
| 1653 | + "metadata": {}, |
| 1654 | + "outputs": [], |
| 1655 | + "source": [ |
# Histogram of 'updateCommitMinConfidence' for the top file extension pairs.
plot_histogram_of_pairwise_changed_files(
    data_to_plot=pairwise_changed_git_files,
    top_pairwise_changed_file_extensions=top_pairwise_changed_file_extensions,
    x_axis_column="updateCommitMinConfidence",
    x_axis_label="Commit Min Confidence",
    output_file_name="CoChangedFilesByCommitMinConfidence",
)
| 1663 | + ] |
| 1664 | + }, |
1537 | 1665 | { |
1538 | 1666 | "cell_type": "markdown", |
1539 | 1667 | "id": "132fd688", |
|
1551 | 1679 | "source": [ |
# Top pairwise changed files per file extension pair, ranked by 'updateCommitLift'.
display_table_for_top_pairwise_changed_file_extensions(
    data_to_display=pairwise_changed_git_files,
    metric_column="updateCommitLift",
)
1557 | 1684 | ] |
1558 | 1685 | }, |
|
1589 | 1716 | "source": [ |
# Top pairwise changed files per file extension pair, ranked by 'updateCommitJaccardSimilarity'.
display_table_for_top_pairwise_changed_file_extensions(
    data_to_display=pairwise_changed_git_files,
    metric_column="updateCommitJaccardSimilarity",
)
1595 | 1721 | ] |
1596 | 1722 | }, |
|
1610 | 1736 | ")" |
1611 | 1737 | ] |
1612 | 1738 | }, |
| 1739 | + { |
| 1740 | + "cell_type": "markdown", |
| 1741 | + "id": "727772c7", |
| 1742 | + "metadata": {}, |
| 1743 | + "source": [ |
| 1744 | + "### Find pairwise changed files with highly ranked metrics\n", |
| 1745 | + "\n", |
| 1746 | + "Find those pairwise changed files that have a high rank in most metrics.\n", |
| 1747 | + "This is useful to identify pairs of files that score high in most metrics, which indicates a strong co-change relationship." |
| 1748 | + ] |
| 1749 | + }, |
| 1750 | + { |
| 1751 | + "cell_type": "code", |
| 1752 | + "execution_count": null, |
| 1753 | + "id": "9de55c0b", |
| 1754 | + "metadata": {}, |
| 1755 | + "outputs": [], |
| 1756 | + "source": [ |
# The combined score is the sum of the per-extension-pair ranks of all four metrics.
# Rank 1 is the highest metric value, so a lower combined score means a better overall rank.
# `assign` creates a new DataFrame rather than writing a column into the filtered frame in place.
pairwise_changed_git_files = pairwise_changed_git_files.assign(
    combinedMetricsScore=(
        pairwise_changed_git_files['updateCommitCountExtensionRank']
        + pairwise_changed_git_files['updateCommitMinConfidenceExtensionRank']
        + pairwise_changed_git_files['updateCommitJaccardSimilarityExtensionRank']
        + pairwise_changed_git_files['updateCommitLiftExtensionRank']
    )
)
columns_to_show = [
    "fileExtensionPair",
    "filePair",
    "combinedMetricsScore",
    "updateCommitCountExtensionRank",
    "updateCommitMinConfidenceExtensionRank",
    "updateCommitJaccardSimilarityExtensionRank",
    "updateCommitLiftExtensionRank",
    "updateCommitCount",
    "updateCommitMinConfidence",
    "updateCommitJaccardSimilarity",
    "updateCommitLift",
    "filePairWithRelativePath",
]
# Keep the 10 entries with the lowest combined score for every file extension pair.
pairwise_changed_git_files_top_10_ranks = (
    pairwise_changed_git_files
    .sort_values(by=["fileExtensionPair", "combinedMetricsScore"], ascending=[True, True])
    .groupby("fileExtensionPair")
    .head(10)
    .reset_index(drop=True)[columns_to_show]
)
display(pairwise_changed_git_files_top_10_ranks)
| 1784 | + ] |
| 1785 | + }, |
1613 | 1786 | { |
1614 | 1787 | "cell_type": "markdown", |
1615 | 1788 | "id": "14e87aff", |
|
0 commit comments