Fix grouping issue by adding the distinct column firstFile

JohT · JohT · commit e1cd9c5865f3 · 2025-03-17T15:35:54.000+01:00
diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb
@@ -50,7 +50,11 @@
     "# To be able to distinguish between command line execution and Jupyter notebook execution\n",
     "# we need to check if the environment variable NBCONVERT is set.\n",
     "# The command line execution is required to take care of setting NBCONVERT.\n",
-    "# TODO Make an exception for command line execution with nbconvert if it results in an execute Jupyter Notebook (not pdf, not markdown)\n",
+    "\n",
+    "# Note: Even if it would be great to retain the interactivity of plotly Treemap plots (e.g. clicking into details)\n",
+    "#       for command line executed notebooks (via nbconvert),\n",
+    "#       it would require executing the notebook twice: once including interactivity and once for static Markdown and PDF.\n",
+    "#       Therefore, command line executed notebooks (nbconvert) will contain static graphics (here using svg).\n",
     "def is_command_line_execution():\n",
     "    return 'NBCONVERT' in os.environ\n",
     "\n",
@@ -362,6 +366,9 @@
     "    gitRepositoryName : str : The name of the column that contains the value to be used instead of an empty root\n",
     "    return : pd.DataFrame : The modified data frame\n",
     "    \"\"\"\n",
+    "    if not repository_column_name in data_frame.columns:\n",
+    "        return data_frame # Repository column is missing, so there is nothing to fill in\n",
+    "    \n",
     "    repository_names = data_frame[repository_column_name]\n",
     "    data_frame[column_name] = data_frame[column_name].replace(\"\", np.NaN).fillna(repository_names)\n",
     "\n",
@@ -508,6 +515,7 @@
     "# display(git_files_with_commit_statistics)\n",
     "\n",
     "# Define how common non-grouped columns will be aggregated.\n",
+    "# Hint: maxCommitSha might not seem very useful, but it actually helps to group similar directories in the final step\n",
     "common_named_aggregation = dict(\n",
     "    commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
     "    daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
@@ -522,6 +530,7 @@
     "# Group the git files by their directory and author and count the number of files of each directory (across all levels).\n",
     "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
     "    filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
+    "    firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
     "    **common_named_aggregation\n",
     ")\n",
     "\n",
@@ -535,27 +544,42 @@
     "# display(git_files_with_commit_statistics)\n",
     "\n",
     "# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
+    "# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n",
     "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
     "    fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
+    "    firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
     "    authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
     "    mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
     "    secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
     "    **common_named_aggregation\n",
     ")\n",
     "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
     "\n",
-    "# TODO Group the data a third time by all columns except for the directory to join directories the only contain one subdirectory (e.g. org/myproject/src)\n",
-    "\n",
     "# Debug\n",
     "# display(\"4. grouped by 'directoryPath' ----------------------\")\n",
     "# display(git_files_with_commit_statistics)\n",
     "\n",
     "# Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n",
-    "git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics)\n",
-    "git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics)\n",
+    "git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics, 'directoryPath', 'directoryName')\n",
+    "git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics, 'directoryPath', 'directoryParentPath')\n",
     "\n",
     "# Debug\n",
     "# display(\"5. added parent and name columns ------------\")\n",
+    "# display(git_files_with_commit_statistics)\n",
+    "\n",
+    "# Group finally by all columns except for the directory name, parent and path (first 3 columns) and pick the longest (max) directory path in case there are multiple.\n",
+    "all_column_names_except_for_the_directory_path = git_files_with_commit_statistics.columns.to_list()[3:]\n",
+    "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(all_column_names_except_for_the_directory_path).aggregate(\n",
+    "   directoryName=pd.NamedAgg(column=\"directoryName\", aggfunc=lambda names: '/'.join(names)),\n",
+    "   directoryParentPath=pd.NamedAgg(column=\"directoryParentPath\", aggfunc=\"first\"),\n",
+    "   directoryPath=pd.NamedAgg(column=\"directoryPath\", aggfunc=\"last\"),\n",
+    ")\n",
+    "# Reorder the column positions so that the directory path is again the first column. \n",
+    "all_column_names_with_the_directory_path_first = ['directoryPath', 'directoryParentPath', 'directoryName'] + all_column_names_except_for_the_directory_path\n",
+    "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()[all_column_names_with_the_directory_path_first]\n",
+    "\n",
+    "# Debug\n",
+    "# display(\"6. grouped by all except for directory path, name and parent columns (max) ----------------------\")\n",
     "# display(git_files_with_commit_statistics)"
    ]
   },