From b898b1611717497e2176b368db8ab5bd27a017e8 Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Tue, 25 Mar 2025 07:55:54 +0100 Subject: [PATCH 1/4] Change default svg rendering size to 1080x1080 --- jupyter/GitHistoryGeneral.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index fc4a1665a..c1f59ec5b 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -208,8 +208,8 @@ ")\n", "plotly_treemap_figure_show_settings = dict(\n", " renderer=\"svg\" if is_command_line_execution() else None,\n", - " width=1000,\n", - " height=800\n", + " width=1080,\n", + " height=1080\n", ")\n", "\n", "plotly_treemap_marker_base_style = dict(\n", From 46290acd5451ce7e4628f118cd67846bc47e535d Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Tue, 25 Mar 2025 08:14:23 +0100 Subject: [PATCH 2/4] Add plot highlighting directories with very few authors --- jupyter/GitHistoryGeneral.ipynb | 38 +++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index c1f59ec5b..1b10ceeab 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -766,6 +766,44 @@ "figure.show(**plotly_treemap_figure_show_settings)" ] }, + { + "cell_type": "markdown", + "id": "485b5194", + "metadata": {}, + "source": [ + "### Directories with very few different authors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3175be23", + "metadata": {}, + "outputs": [], + "source": [ + "git_commit_authors_per_directory_low_focus = add_quantile_limited_column(git_files_with_commit_statistics, \"authorCount\", 0.33)\n", + "\n", + "author_count_top_limit = git_commit_authors_per_directory_low_focus['authorCount_limited'].max().astype(int).astype(str)\n", + "author_count_top_limit_label_alias = 
{author_count_top_limit: author_count_top_limit + ' or more'}\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(git_commit_authors_per_directory_low_focus),\n", + " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = git_commit_authors_per_directory['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=git_commit_authors_per_directory_low_focus['authorCount_limited'], \n", + " colorbar=dict(title=\"Authors\", labelalias=author_count_top_limit_label_alias),\n", + " reversescale=True\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Number of distinct commit authors (red/black = ony one or very few authors)',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, { "cell_type": "markdown", "id": "5dbceaef", From 30349a77160acd1cde612c199c74b3c67f4cafdb Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Fri, 28 Mar 2025 08:00:18 +0100 Subject: [PATCH 3/4] Add treemap plot that shows commit counts of pairwise changed files --- ...it_files_that_were_changed_together.cypher | 25 ++++ ...that_were_changed_together_unwinded.cypher | 25 ++++ jupyter/GitHistoryGeneral.ipynb | 134 +++++++++++++++++- 3 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 cypher/GitLog/List_git_files_that_were_changed_together.cypher create mode 100644 cypher/GitLog/List_git_files_that_were_changed_together_unwinded.cypher diff --git a/cypher/GitLog/List_git_files_that_were_changed_together.cypher b/cypher/GitLog/List_git_files_that_were_changed_together.cypher new file mode 100644 index 000000000..9eb91fa79 --- /dev/null +++ b/cypher/GitLog/List_git_files_that_were_changed_together.cypher @@ -0,0 +1,25 @@ +// List git files that where changed 
together frequently + +MATCH (global_git_commit:Git:Commit) + WITH count(global_git_commit) AS globalCommitCount +MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) +MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) +WHERE git_file.deletedAt IS NULL + WITH *, git_repository.name + '/' + git_file.relativePath AS filePath + WITH globalCommitCount + ,git_commit.sha AS commitHash + ,collect(DISTINCT filePath) AS filesInCommit +WHERE size(filesInCommit) >= 2 + AND size(filesInCommit) <= 50 + WITH globalCommitCount + ,commitHash + ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations +UNWIND fileCombinations AS fileCombination + WITH globalCommitCount + ,apoc.coll.sort(fileCombination) AS fileCombination + ,count(DISTINCT commitHash) AS commitCount +WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare +RETURN fileCombination[0] AS firstFile + ,fileCombination[1] AS secondFile + ,commitCount +ORDER BY commitCount DESC \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_that_were_changed_together_unwinded.cypher b/cypher/GitLog/List_git_files_that_were_changed_together_unwinded.cypher new file mode 100644 index 000000000..236ee8374 --- /dev/null +++ b/cypher/GitLog/List_git_files_that_were_changed_together_unwinded.cypher @@ -0,0 +1,25 @@ +// List git files that where changed together frequently + +MATCH (global_git_commit:Git:Commit) + WITH count(global_git_commit) AS globalCommitCount +MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File) +MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) +WHERE git_file.deletedAt IS NULL + WITH *, git_repository.name + '/' + git_file.relativePath AS filePath + WITH globalCommitCount + ,git_commit.sha AS commitHash + ,collect(DISTINCT filePath) AS filesInCommit +WHERE 
size(filesInCommit) >= 2 + AND size(filesInCommit) <= 50 + WITH globalCommitCount + ,commitHash + ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations +UNWIND fileCombinations AS fileCombination +UNWIND fileCombination AS filePath + WITH globalCommitCount + ,filePath + ,count(DISTINCT commitHash) AS commitCount +WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare +RETURN filePath + ,commitCount +ORDER BY commitCount DESC \ No newline at end of file diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 1b10ceeab..261b2af21 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -793,7 +793,10 @@ " marker=dict(\n", " **plotly_treemap_marker_base_colorscale,\n", " colors=git_commit_authors_per_directory_low_focus['authorCount_limited'], \n", - " colorbar=dict(title=\"Authors\", labelalias=author_count_top_limit_label_alias),\n", + " colorbar=dict(title=\"Authors\",\n", + " tickmode=\"auto\",\n", + " labelalias=author_count_top_limit_label_alias\n", + " ),\n", " reversescale=True\n", " ),\n", "))\n", @@ -801,9 +804,51 @@ " **plotly_treemap_layout_base_settings,\n", " title='Number of distinct commit authors (red/black = ony one or very few authors)',\n", ")\n", + "\n", "figure.show(**plotly_treemap_figure_show_settings)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "e11947c5", + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.graph_objects as go\n", + "\n", + "# Example data\n", + "labels = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n", + "parents = [\"\", \"A\", \"A\", \"B\", \"B\"]\n", + "values = [10, 20, 30, 40, 50] # Color scale values\n", + "max_value = max(values)\n", + "\n", + "# Create treemap\n", + "fig = go.Figure(go.Treemap(\n", + " labels=labels,\n", + " parents=parents,\n", + " values=values,\n", + " marker=dict(\n", + " colors=values,\n", + " colorscale=\"Blues\",\n", + " colorbar=dict(\n", + " 
title=\"Value\",\n", + " tickmode=\"auto\", # Let Plotly auto-select ticks\n", + " ticklabelposition=\"outside top\",\n", + " tickformat=\",\", # Use default formatting\n", + " ticklabeloverflow=\"allow\", # Ensure long labels are displayed\n", + " ticklabelstep=1 # Show all labels\n", + " )\n", + " )\n", + "))\n", + "\n", + "# Add an alias for the highest tick value dynamically\n", + "fig.update_layout(coloraxis_colorbar_tickvals=[max_value])\n", + "fig.update_layout(coloraxis_colorbar_ticktext=[f\"{max_value} or more\"])\n", + "\n", + "fig.show()\n" + ] + }, { "cell_type": "markdown", "id": "5dbceaef", @@ -1083,6 +1128,93 @@ "figure.show(**plotly_treemap_figure_show_settings)" ] }, + { + "cell_type": "markdown", + "id": "80bd7c28", + "metadata": {}, + "source": [ + "### File changed frequently with other files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24055998", + "metadata": {}, + "outputs": [], + "source": [ + "pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_unwinded.cypher\")\n", + "\n", + "# Debug\n", + "# display(\"1. pairwise changed files --------------\")\n", + "# display(pairwise_changed_git_files)\n", + "\n", + "# Add multiple rows for each file path containing all its directories paths in the new column 'directoryPath'\n", + "pairwise_changed_git_files = add_directory_column(pairwise_changed_git_files, 'filePath', 'directoryPath')\n", + "\n", + "# Debug\n", + "# display(\"2. 
added directories --------------\")\n", + "# display(pairwise_changed_git_files)\n", + "\n", + "# Group the git files by their directory, sum up the commit counts and count the number of files of each directory (across all levels).\n", + "pairwise_changed_git_files = pairwise_changed_git_files.groupby(['directoryPath']).aggregate(\n", + " pairwiseChangeCommitCount=pd.NamedAgg(column=\"commitCount\", aggfunc=\"sum\"),\n", + " pairwiseChangeFileCount=pd.NamedAgg(column=\"filePath\", aggfunc=\"count\"),\n", + ")\n", + "pairwise_changed_git_files.reset_index(inplace=True)\n", + "\n", + "# Debug\n", + "# display(\"3. after grouping --------------\")\n", + "# display(pairwise_changed_git_files)\n", + "\n", + "pairwise_changed_git_files = pd.merge(\n", + " git_files_with_commit_statistics, \n", + " pairwise_changed_git_files, \n", + " left_on='directoryPath', \n", + " right_on=\"directoryPath\",\n", + " how=\"left\",\n", + " validate=\"m:1\"\n", + ")\n", + "\n", + "# Debug\n", + "# display(\"4. after merging --------------\")\n", + "# display(pairwise_changed_git_files)\n", + "\n", + "pairwise_changed_git_files['pairwiseChangeCommitCount'] = pairwise_changed_git_files['pairwiseChangeCommitCount'].fillna(0).astype(int)\n", + "pairwise_changed_git_files['pairwiseChangeFileCount'] = pairwise_changed_git_files['pairwiseChangeFileCount'].fillna(0).astype(int)\n", + "pairwise_changed_git_files.reset_index(inplace=True)\n", + "\n", + "# Debug\n", + "# display(\"5.
after NaN fill --------------\")\n", + "# display(pairwise_changed_git_files)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19b5a98a", + "metadata": {}, + "outputs": [], + "source": [ + "pairwise_changed_git_files = add_quantile_limited_column(pairwise_changed_git_files, \"pairwiseChangeCommitCount\", 0.98)\n", + "\n", + "figure = plotly_graph_objects.Figure(plotly_graph_objects.Treemap(\n", + " create_treemap_commit_statistics_settings(pairwise_changed_git_files),\n", + " # Without values, much more squares are shown which gives a much better overview. The drawback is that the fileCount isn't visible.\n", + " # values = pairwise_changed_git_files['fileCount'],\n", + " marker=dict(\n", + " **plotly_treemap_marker_base_colorscale,\n", + " colors=pairwise_changed_git_files['pairwiseChangeCommitCount_limited'], \n", + " colorbar=dict(title=\"Changes\"),\n", + " ),\n", + "))\n", + "figure.update_layout(\n", + " **plotly_treemap_layout_base_settings,\n", + " title='Pairwise file changes',\n", + ")\n", + "figure.show(**plotly_treemap_figure_show_settings)" + ] + }, { "cell_type": "markdown", "id": "d8c6ccee", From 10e202e45e5d4ba602b6023277f63ddf60c97f2e Mon Sep 17 00:00:00 2001 From: JohT <7671054+JohT@users.noreply.github.com> Date: Thu, 3 Apr 2025 08:21:28 +0200 Subject: [PATCH 4/4] Add CHANGED_TOGETHER_WITH edge for git file nodes --- ...ER_WITH_relationships_to_code_files.cypher | 11 +++++ ...HER_WITH_relationships_to_git_files.cypher | 44 +++++++++++++++++++ ...it_files_that_were_changed_together.cypher | 33 ++++---------- ...at_were_changed_together_all_in_one.cypher | 25 +++++++++++ ..._changed_together_with_another_file.cypher | 8 ++++ ...ether_with_another_file_all_in_one.cypher} | 0 jupyter/GitHistoryGeneral.ipynb | 4 +- scripts/importGit.sh | 4 ++ 8 files changed, 103 insertions(+), 26 deletions(-) create mode 100644 cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher create mode 100644 
cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher create mode 100644 cypher/GitLog/List_git_files_that_were_changed_together_all_in_one.cypher create mode 100644 cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher rename cypher/GitLog/{List_git_files_that_were_changed_together_unwinded.cypher => List_git_files_that_were_changed_together_with_another_file_all_in_one.cypher} (100%) diff --git a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher new file mode 100644 index 000000000..d9abf9bc6 --- /dev/null +++ b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher @@ -0,0 +1,11 @@ +// Take the already existing "CHANGED_TOGETHER_WITH" relationship between git files and apply it to resolved file nodes. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files". + +MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository) +WHERE elementId(firstGitFile) < elementId(secondGitFile) +MATCH (firstGitFile)-[:RESOLVES_TO]->(firstCodeFile:File&!Git&!Repository) +MATCH (secondGitFile)-[:RESOLVES_TO]->(secondCodeFile:File&!Git&!Repository) + CALL (firstCodeFile, secondCodeFile, gitChange) { + MERGE (firstCodeFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondCodeFile) + SET pairwiseChange = properties(gitChange) + } IN TRANSACTIONS +RETURN count(*) AS pairCount \ No newline at end of file diff --git a/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher new file mode 100644 index 000000000..5836cf46b --- /dev/null +++ b/cypher/GitLog/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher @@ -0,0 +1,44 @@ +// Connect git files that where changed together frequently with "CHANGED_TOGETHER_WITH" + +MATCH (global_git_commit:Git:Commit) + WITH 
count(global_git_commit) AS globalCommitCount +MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) +WHERE git_file.deletedAt IS NULL +// Order files to ensure that pairs of distinct files are grouped together (fileA, fileB) without (fileB, fileA) +ORDER BY git_commit.sha, git_file.relativePath + WITH globalCommitCount + ,git_commit.sha AS commitHash + ,collect(DISTINCT git_file) AS filesInCommit +// Limit the file count to min. 2 (changed together) and +// max. 50 (reduce permutations, improve performance, filter out large refactorings that usually affect many files) +WHERE size(filesInCommit) >= 2 + AND size(filesInCommit) <= 50 +// Collect distinct pairwise (..., 2, 2) combinations of all files in the list + WITH globalCommitCount + ,commitHash + ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations +UNWIND fileCombinations AS fileCombination + WITH globalCommitCount + ,fileCombination + ,count(DISTINCT commitHash) AS commitCount + ,collect(DISTINCT commitHash) AS commitHashes +// Filter out file pairs that were not changed together very often +// In detail: Keep only pairs changed together in more than 0.1 percent (1 per mille) of all commits +WHERE commitCount > globalCommitCount * 0.001 + WITH fileCombination[0] AS firstFile + ,fileCombination[1] AS secondFile + ,commitCount + ,commitHashes +// Create the new relationship "CHANGED_TOGETHER_WITH" and set the properties "commitCount" and "commitHashes" on it + CALL (firstFile, secondFile, commitCount, commitHashes) { + MERGE (firstFile)-[pairwiseChange:CHANGED_TOGETHER_WITH]-(secondFile) + SET pairwiseChange.commitCount = commitCount + ,pairwiseChange.commitHashes = commitHashes + } IN TRANSACTIONS +// Return one row with some statistics about the found pairs and their commit counts +RETURN max(commitCount) AS maxCommitCount + ,avg(commitCount) AS avgCommitCount + ,percentileDisc(commitCount, 0.5) AS percentile50CommitCount +
,percentileDisc(commitCount, 0.9) AS percentile90CommitCount + ,percentileDisc(commitCount, 0.95) AS percentile95CommitCount + ,count(*) AS pairCount \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_that_were_changed_together.cypher b/cypher/GitLog/List_git_files_that_were_changed_together.cypher index 9eb91fa79..f19cf4efb 100644 --- a/cypher/GitLog/List_git_files_that_were_changed_together.cypher +++ b/cypher/GitLog/List_git_files_that_were_changed_together.cypher @@ -1,25 +1,10 @@ -// List git files that where changed together frequently +// List git files that were changed together frequently. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files". -MATCH (global_git_commit:Git:Commit) - WITH count(global_git_commit) AS globalCommitCount -MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File)MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) -MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) -WHERE git_file.deletedAt IS NULL - WITH *, git_repository.name + '/' + git_file.relativePath AS filePath - WITH globalCommitCount - ,git_commit.sha AS commitHash - ,collect(DISTINCT filePath) AS filesInCommit -WHERE size(filesInCommit) >= 2 - AND size(filesInCommit) <= 50 - WITH globalCommitCount - ,commitHash - ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations -UNWIND fileCombinations AS fileCombination - WITH globalCommitCount - ,apoc.coll.sort(fileCombination) AS fileCombination - ,count(DISTINCT commitHash) AS commitCount -WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare -RETURN fileCombination[0] AS firstFile - ,fileCombination[1] AS secondFile - ,commitCount -ORDER BY commitCount DESC \ No newline at end of file +MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository) +WHERE elementId(firstGitFile) < elementId(secondGitFile) +MATCH
(gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile) +MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(secondGitFile) +RETURN gitRepository.name + '/' + firstGitFile.relativePath AS firstFile + ,gitRepository.name + '/' + secondGitFile.relativePath AS secondFile + ,gitChange.commitCount AS commitCount +ORDER BY commitCount DESC diff --git a/cypher/GitLog/List_git_files_that_were_changed_together_all_in_one.cypher b/cypher/GitLog/List_git_files_that_were_changed_together_all_in_one.cypher new file mode 100644 index 000000000..9eb91fa79 --- /dev/null +++ b/cypher/GitLog/List_git_files_that_were_changed_together_all_in_one.cypher @@ -0,0 +1,25 @@ +// List git files that were changed together frequently (computed directly from the commits, independent of the "CHANGED_TOGETHER_WITH" relationship) + +MATCH (global_git_commit:Git:Commit) + WITH count(global_git_commit) AS globalCommitCount +MATCH (git_commit:Git:Commit)-[:CONTAINS_CHANGE]->(git_change:Git:Change:Update)-[:UPDATES]->(git_file:Git:File) +MATCH (git_repository:Git&Repository)-[:HAS_FILE]->(git_file) +WHERE git_file.deletedAt IS NULL + WITH *, git_repository.name + '/' + git_file.relativePath AS filePath + WITH globalCommitCount + ,git_commit.sha AS commitHash + ,collect(DISTINCT filePath) AS filesInCommit +WHERE size(filesInCommit) >= 2 // At least two files are needed to form a pair + AND size(filesInCommit) <= 50 // Skip commits with many files to limit the number of pair combinations + WITH globalCommitCount + ,commitHash + ,apoc.coll.combinations(filesInCommit, 2, 2) AS fileCombinations +UNWIND fileCombinations AS fileCombination + WITH globalCommitCount + ,apoc.coll.sort(fileCombination) AS fileCombination // Sorted so that (fileA, fileB) and (fileB, fileA) count as the same pair + ,count(DISTINCT commitHash) AS commitCount +WHERE commitCount > globalCommitCount * 0.001 // Filter out combinations that are too rare +RETURN fileCombination[0] AS firstFile + ,fileCombination[1] AS secondFile + ,commitCount +ORDER BY commitCount DESC \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher
b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher new file mode 100644 index 000000000..16f8458dd --- /dev/null +++ b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher @@ -0,0 +1,8 @@ +// List git files that were frequently changed with another file. Requires "Add_CHANGED_TOGETHER_WITH_relationships_to_git_files". + +MATCH (firstGitFile:Git&File&!Repository)-[gitChange:CHANGED_TOGETHER_WITH]-(secondGitFile:Git&File&!Repository) +MATCH (gitRepository:Git&Repository)-[:HAS_FILE]->(firstGitFile) +UNWIND gitChange.commitHashes AS commitHash +RETURN gitRepository.name + '/' + firstGitFile.relativePath AS filePath + ,count(DISTINCT commitHash) AS commitCount +ORDER BY commitCount DESC \ No newline at end of file diff --git a/cypher/GitLog/List_git_files_that_were_changed_together_unwinded.cypher b/cypher/GitLog/List_git_files_that_were_changed_together_with_another_file_all_in_one.cypher similarity index 100% rename from cypher/GitLog/List_git_files_that_were_changed_together_unwinded.cypher rename to cypher/GitLog/List_git_files_that_were_changed_together_with_another_file_all_in_one.cypher diff --git a/jupyter/GitHistoryGeneral.ipynb b/jupyter/GitHistoryGeneral.ipynb index 261b2af21..4b34a65d9 100644 --- a/jupyter/GitHistoryGeneral.ipynb +++ b/jupyter/GitHistoryGeneral.ipynb @@ -802,7 +802,7 @@ "))\n", "figure.update_layout(\n", " **plotly_treemap_layout_base_settings,\n", - " title='Number of distinct commit authors (red/black = ony one or very few authors)',\n", + " title='Number of distinct commit authors (red/black = only one or very few authors)',\n", ")\n", "\n", "figure.show(**plotly_treemap_figure_show_settings)" @@ -1143,7 +1143,7 @@ "metadata": {}, "outputs": [], "source": [ - "pairwise_changed_git_files = query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_unwinded.cypher\")\n", + "pairwise_changed_git_files =
query_cypher_to_data_frame(\"../cypher/GitLog/List_git_files_that_were_changed_together_with_another_file.cypher\")\n", "\n", "# Debug\n", "# display(\"1. pairwise changed files --------------\")\n", diff --git a/scripts/importGit.sh b/scripts/importGit.sh index 183051371..7ed281310 100755 --- a/scripts/importGit.sh +++ b/scripts/importGit.sh @@ -125,6 +125,10 @@ commonPostGitImport() { execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Java.cypher" execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_RESOLVES_TO_relationships_to_git_files_for_Typescript.cypher" + echo "importGit: Creating relationships to file nodes that where changed together..." + execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_git_files.cypher" + execute_cypher "${GIT_LOG_CYPHER_DIR}/Add_CHANGED_TOGETHER_WITH_relationships_to_code_files.cypher" + # Since it's currently not possible to rule out ambiguity in git<->code file matching, # the following verifications are only an additional info in the log rather than an error. echo "importGit: Running verification queries for troubleshooting (non failing)..."