|
50 | 50 | "# To be able to distinguish between command line execution and Jupyter notebook execution\n", |
51 | 51 | "# we need to check if the environment variable NBCONVERT is set.\n", |
52 | 52 | "# The command line execution is required to take care of setting NBCONVERT.\n", |
53 | | - "# TODO Make an exception for command line execution with nbconvert if it results in an execute Jupyter Notebook (not pdf, not markdown)\n", |
| 53 | + "\n", |
| 54 | + "# Note: Although it would be great to retain the interactivity of plotly Treemap plots (e.g. clicking into details)\n", |
| 55 | + "# for command line executed notebooks (via nbconvert),\n", |
| 56 | + "# this would require executing the notebook twice: once including interactivity and once for static Markdown and PDF.\n", |
| 57 | + "# Therefore, command line executed notebooks (nbconvert) will contain static graphics (here using svg).\n", |
54 | 58 | "def is_command_line_execution():\n", |
55 | 59 | " return 'NBCONVERT' in os.environ\n", |
56 | 60 | "\n", |
|
362 | 366 | " gitRepositoryName : str : The name of the column that contains the value to be used instead of an empty root\n", |
363 | 367 | " return : pd.DataFrame : The modified data frame\n", |
364 | 368 | " \"\"\"\n", |
| 369 | + " if not repository_column_name in data_frame.columns:\n", |
| 370 | + " return data_frame # Nothing to fill in: the repository column does not exist\n", |
| 371 | + " \n", |
365 | 372 | " repository_names = data_frame[repository_column_name]\n", |
366 | 373 | " data_frame[column_name] = data_frame[column_name].replace(\"\", np.NaN).fillna(repository_names)\n", |
367 | 374 | "\n", |
|
508 | 515 | "# display(git_files_with_commit_statistics)\n", |
509 | 516 | "\n", |
510 | 517 | "# Define how common non-grouped columns will be aggregated.\n", |
| 518 | + "# Hint: maxCommitSha might not seem very useful, but it actually helps to group similar directories in the final step\n", |
511 | 519 | "common_named_aggregation = dict(\n", |
512 | 520 | " commitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", |
513 | 521 | " daysSinceLastCommit=pd.NamedAgg(column=\"daysSinceLastCommit\", aggfunc=\"min\"),\n", |
|
522 | 530 | "# Group the git files by their directory and author and count the number of files of each directory (across all levels).\n", |
523 | 531 | "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(['directoryPath', 'author']).aggregate(\n", |
524 | 532 | " filePaths=pd.NamedAgg(column=\"filePath\", aggfunc=np.unique),\n", |
| 533 | + " firstFile=pd.NamedAgg(column=\"filePath\", aggfunc=\"first\"),\n", |
525 | 534 | " **common_named_aggregation\n", |
526 | 535 | ")\n", |
527 | 536 | "\n", |
|
535 | 544 | "# display(git_files_with_commit_statistics)\n", |
536 | 545 | "\n", |
537 | 546 | "# Group the entries again now only by their directory path to get the aggregated number of authors, the main author and the second author.\n", |
| 547 | + "# Hint: firstFile (similar to maxCommitSha) might not seem very useful, but it also helps to group similar directories in the final step\n", |
538 | 548 | "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby('directoryPath').aggregate(\n", |
539 | 549 | " fileCount=pd.NamedAgg(column=\"filePaths\", aggfunc=get_file_count_from_aggregated_file_paths),\n", |
| 550 | + " firstFile=pd.NamedAgg(column=\"firstFile\", aggfunc=\"first\"),\n", |
540 | 551 | " authorCount=pd.NamedAgg(column=\"author\", aggfunc=\"nunique\"),\n", |
541 | 552 | " mainAuthor=pd.NamedAgg(column=\"author\", aggfunc=\"first\"),\n", |
542 | 553 | " secondAuthor=pd.NamedAgg(column=\"author\", aggfunc=second_entry),\n", |
543 | 554 | " **common_named_aggregation\n", |
544 | 555 | ")\n", |
545 | 556 | "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()\n", |
546 | 557 | "\n", |
547 | | - "# TODO Group the data a third time by all columns except for the directory to join directories the only contain one subdirectory (e.g. org/myproject/src)\n", |
548 | | - "\n", |
549 | 558 | "# Debug\n", |
550 | 559 | "# display(\"4. grouped by 'directoryPath' ----------------------\")\n", |
551 | 560 | "# display(git_files_with_commit_statistics)\n", |
552 | 561 | "\n", |
553 | 562 | "# Add the name of the directory (last '/' separated element) and the parent directory path to the table.\n", |
554 | | - "git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics)\n", |
555 | | - "git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics)\n", |
| 563 | + "git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics, 'directoryPath', 'directoryName')\n", |
| 564 | + "git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics, 'directoryPath', 'directoryParentPath')\n", |
556 | 565 | "\n", |
557 | 566 | "# Debug\n", |
558 | 567 | "# display(\"5. added parent and name columns ------------\")\n", |
| 568 | + "# display(git_files_with_commit_statistics)\n", |
| 569 | + "\n", |
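| 570 | + "# Finally, group by all columns except for the directory name, parent and path (first 3 columns). Directory names are joined with '/' and the last directory path of each group is picked in case there are multiple (presumably the longest one, assuming the rows are still ordered by directory path).\n", |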
| 571 | + "all_column_names_except_for_the_directory_path = git_files_with_commit_statistics.columns.to_list()[3:]\n", |
| 572 | + "git_files_with_commit_statistics = git_files_with_commit_statistics.groupby(all_column_names_except_for_the_directory_path).aggregate(\n", |
| 573 | + " directoryName=pd.NamedAgg(column=\"directoryName\", aggfunc=lambda names: '/'.join(names)),\n", |
| 574 | + " directoryParentPath=pd.NamedAgg(column=\"directoryParentPath\", aggfunc=\"first\"),\n", |
| 575 | + " directoryPath=pd.NamedAgg(column=\"directoryPath\", aggfunc=\"last\"),\n", |
| 576 | + ")\n", |
| 577 | + "# Reorder the column positions so that the directory path is again the first column. \n", |
| 578 | + "all_column_names_with_the_directory_path_first = ['directoryPath', 'directoryParentPath', 'directoryName'] + all_column_names_except_for_the_directory_path\n", |
| 579 | + "git_files_with_commit_statistics = git_files_with_commit_statistics.reset_index()[all_column_names_with_the_directory_path_first]\n", |
| 580 | + "\n", |
| 581 | + "# Debug\n", |
| 582 | + "# display(\"6. grouped by all except for directory path, name and parent columns (max) ----------------------\")\n", |
559 | 583 | "# display(git_files_with_commit_statistics)" |
560 | 584 | ] |
561 | 585 | }, |
|
0 commit comments