Skip to content

Commit e1cd9c5

Browse files
committed
Fix grouping issue by adding the distinct column firstFileName
1 parent 456c1d2 commit e1cd9c5

File tree

1 file changed

+29
-5
lines changed

1 file changed

+29
-5
lines changed

jupyter/GitHistoryGeneral.ipynb

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,11 @@
5050
"# To be able to distinguish between command line execution and Jupyter notebook execution\n",
5151
"# we need to check if the environment variable NBCONVERT is set.\n",
5252
"# The command line execution is required to take care of setting NBCONVERT.\n",
53-
"# TODO Make an exception for command line execution with nbconvert if it results in an execute Jupyter Notebook (not pdf, not markdown)\n",
53+
"\n",
54+
"# Note: Even though it would be great to retain the interactivity of plotly Treemap plots (e.g. clicking into details)\n",
55+
"# for command line executed notebooks (via nbconvert),\n",
56+
"# it would require executing the notebook twice: once including interactivity and once for static Markdown and PDF.\n",
57+
"# Therefore, command line executed notebooks (nbconvert) will contain static graphics (here using svg).\n",
5458
"def is_command_line_execution():\n",
5559
" return 'NBCONVERT' in os.environ\n",
5660
"\n",
@@ -362,6 +366,9 @@
362366
" gitRepositoryName : str : The name of the column that contains the value to be used instead of an empty root\n",
363367
" return : pd.DataFrame : The modified data frame\n",
364368
" \"\"\"\n",
369+
" if not repository_column_name in data_frame.columns:\n",
370+
" return data_frame # Repository column is missing, so there is nothing to fill in\n",
371+
" \n",
365372
" repository_names = data_frame[repository_column_name]\n",
366373
" data_frame[column_name] = data_frame[column_name].replace(\"\", np.NaN).fillna(repository_names)\n",
367374
"\n",
@@ -508,6 +515,7 @@
508515
"# display(git_files_with_commit_statistics)\n",
509516
"\n",
510517
"# Define how common non-grouped columns will be aggregated.\n",
518+
"# Hint: maxCommitSha might not seem very useful, but it actually helps to group similar directories in the final step\n",
511519
"common_named_aggregation = dict(\n",
512520
" commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n",
513521
" daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n",
@@ -522,6 +530,7 @@
522530
"# Group the git files by their directory and author and count the number of files of each directory (across all levels).\n",
523531
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n",
524532
" filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n",
533+
" firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n",
525534
" **common_named_aggregation\n",
526535
")\n",
527536
"\n",
@@ -535,27 +544,42 @@
535544
"# display(git_files_with_commit_statistics)\n",
536545
"\n",
537546
"# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n",
547+
"# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n",
538548
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n",
539549
" fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n",
550+
" firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n",
540551
" authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n",
541552
" mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n",
542553
" secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n",
543554
" **common_named_aggregation\n",
544555
")\n",
545556
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n",
546557
"\n",
547-
"# TODO Group the data a third time by all columns except for the directory to join directories the only contain one subdirectory (e.g. org/myproject/src)\n",
548-
"\n",
549558
"# Debug\n",
550559
"# display(\"4. grouped by 'directoryPath' ----------------------\")\n",
551560
"# display(git_files_with_commit_statistics)\n",
552561
"\n",
553562
"# Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n",
554-
"git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics)\n",
555-
"git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics)\n",
563+
"git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics, 'directoryPath', 'directoryName')\n",
564+
"git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics, 'directoryPath', 'directoryParentPath')\n",
556565
"\n",
557566
"# Debug\n",
558567
"# display(\"5. added parent and name columns ------------\")\n",
568+
"# display(git_files_with_commit_statistics)\n",
569+
"\n",
570+
"# Group finally by all columns except for the directory name, parent and path (first 3 columns) and pick the longest (max) directory path in case there are multiple.\n",
571+
"all_column_names_except_for_the_directory_path = git_files_with_commit_statistics.columns.to_list()[3:]\n",
572+
"git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(all_column_names_except_for_the_directory_path).aggregate(\n",
573+
" directoryName=pd.NamedAgg(column=\"directoryName\", aggfunc=lambda names: '/'.join(names)),\n",
574+
" directoryParentPath=pd.NamedAgg(column=\"directoryParentPath\", aggfunc=\"first\"),\n",
575+
" directoryPath=pd.NamedAgg(column=\"directoryPath\", aggfunc=\"last\"),\n",
576+
")\n",
577+
"# Reorder the column positions so that the directory path is again the first column. \n",
578+
"all_column_names_with_the_directory_path_first = ['directoryPath', 'directoryParentPath', 'directoryName'] + all_column_names_except_for_the_directory_path\n",
579+
"git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()[all_column_names_with_the_directory_path_first]\n",
580+
"\n",
581+
"# Debug\n",
582+
"# display(\"6. grouped by all except for directory path, name and parent columns (max) ----------------------\")\n",
559583
"# display(git_files_with_commit_statistics)"
560584
]
561585
},

0 commit comments

Comments
 (0)