Skip to content

Commit eb4d764

Browse files
committed
Create sub directories for each anomaly detected code unit
1 parent ead9f54 commit eb4d764

File tree

4 files changed

67 additions and 57 deletions (lines changed)

4 files changed

67 additions and 57 deletions (lines changed)

domains/anomaly-detection/anomalyDetectionCsv.sh

Lines changed: 21 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -76,18 +76,22 @@ anomaly_detection_queries() {
7676

7777
local language
7878
language=$( extractQueryParameter "projection_language" "${@}" )
79-
79+
80+
# Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
81+
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
82+
mkdir -p "${detail_report_directory}"
83+
8084
echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Queries for ${nodeLabel} nodes..."
81-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialImbalancedRoles.csv"
82-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
85+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialImbalancedRoles.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialImbalancedRoles.csv"
86+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPotentialOverEngineerOrIsolated.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PotentialOverEngineerOrIsolated.csv"
8387

84-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_HiddenBridgeNodes.csv"
85-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_PopularBottlenecks.csv"
86-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_SilentCoordinators.csv"
87-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_OverReferencesUtilities.csv"
88-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_FragileStructuralBridges.csv"
89-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_DependencyHungryOrchestrators.csv"
90-
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyDetection_UnexpectedCentralNodes.csv"
88+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionHiddenBridgeNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_HiddenBridgeNodes.csv"
89+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionPopularBottlenecks.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_PopularBottlenecks.csv"
90+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionSilentCoordinators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_SilentCoordinators.csv"
91+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionOverReferencesUtilities.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_OverReferencesUtilities.csv"
92+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionFragileStructuralBridges.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_FragileStructuralBridges.csv"
93+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionDependencyHungryOrchestrators.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_DependencyHungryOrchestrators.csv"
94+
execute_cypher "${ANOMALY_DETECTION_QUERY_CYPHER_DIR}/AnomalyDetectionUnexpectedCentralNodes.cypher" "${@}" > "${detail_report_directory}/AnomalyDetection_UnexpectedCentralNodes.csv"
9195
}
9296

9397
# Label code units with top anomalies by archetype.
@@ -102,11 +106,15 @@ anomaly_detection_labels() {
102106
local language
103107
language=$( extractQueryParameter "projection_language" "${@}" )
104108

109+
# Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
110+
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
111+
mkdir -p "${detail_report_directory}"
112+
105113
echo "anomalyDetectionCsv: $(date +'%Y-%m-%dT%H:%M:%S%z') Labelling ${language} ${nodeLabel} anomalies..."
106114
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeRemoveLabels.cypher" "${@}"
107-
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopAuthority.csv"
108-
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopBottleneck.csv"
109-
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}_AnomalyArchetypeTopHub.csv"
115+
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeAuthority.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopAuthority.csv"
116+
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBottleneck.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopBottleneck.csv"
117+
execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeHub.cypher" "${@}" > "${detail_report_directory}/AnomalyArchetypeTopHub.csv"
110118
# The following two label types require Python scripts to run first and are skipped here intentionally:
111119
# execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeBridge.cypher" "${@}"
112120
# execute_cypher "${ANOMALY_DETECTION_LABEL_CYPHER_DIR}/AnomalyDetectionArchetypeOutlier.cypher" "${@}"

domains/anomaly-detection/anomalyDetectionFeaturePlots.py

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def __is_code_language_available(self) -> bool:
9898
def __get_projection_language(self) -> str:
9999
return self.query_parameters_["projection_language"] if self.__is_code_language_available() else ""
100100

101-
def get_plot_prefix(self) -> str:
101+
def get_title_prefix(self) -> str:
102102
if self.__is_code_language_available():
103103
return self.__get_projection_language() + " " + self.__get_projection_node_label()
104104
return self.__get_projection_node_label()
@@ -815,7 +815,7 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
815815
# ------------------------------------------------------------------------------------------------------------
816816

817817
parameters = parse_input_parameters()
818-
plot_prefix = parameters.get_plot_prefix()
818+
title_prefix = parameters.get_title_prefix()
819819
report_directory = parameters.get_report_directory()
820820

821821
driver = get_graph_database_driver()
@@ -828,21 +828,21 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
828828
data['pageRank'],
829829
data['articleRank'],
830830
data['shortCodeUnitName'],
831-
title=f"{plot_prefix} distribution of PageRank - ArticleRank differences",
832-
plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
831+
title=f"{title_prefix} distribution of PageRank - ArticleRank differences",
832+
plot_file_path=get_file_path("PageRank_Minus_ArticleRank_Distribution", parameters)
833833
)
834834

835835
plot_feature_distribution(
836836
feature_values=data['clusteringCoefficient'],
837837
feature_name='Clustering Coefficient',
838-
title=f"{plot_prefix} clustering coefficient distribution",
838+
title=f"{title_prefix} clustering coefficient distribution",
839839
plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
840840
)
841841

842842
plot_feature_distribution(
843843
feature_values=data['betweenness'],
844844
feature_name='Betweenness',
845-
title=f"{plot_prefix} betweenness centrality distribution",
845+
title=f"{title_prefix} betweenness centrality distribution",
846846
plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
847847
)
848848

@@ -851,15 +851,15 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
851851
data['pageRank'],
852852
data['shortCodeUnitName'],
853853
data['clusterNoise'],
854-
title=f"{plot_prefix} clustering coefficient versus PageRank",
855-
plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_versus_PageRank", parameters)
854+
title=f"{title_prefix} clustering coefficient versus PageRank",
855+
plot_file_path=get_file_path("ClusteringCoefficient_versus_PageRank", parameters)
856856
)
857857

858858
if (overall_cluster_count < 20):
859859
print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.")
860860
plot_clusters(
861861
clustering_visualization_dataframe=data,
862-
title=f"{plot_prefix} all clusters overall",
862+
title=f"{title_prefix} all clusters overall",
863863
plot_file_path=get_file_path("Clusters_Overall", parameters)
864864
)
865865
else:
@@ -869,57 +869,57 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
869869
)
870870
plot_clusters(
871871
clustering_visualization_dataframe=clusters_by_largest_size,
872-
title=f"{plot_prefix} clusters with the largest size",
873-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_size", parameters)
872+
title=f"{title_prefix} clusters with the largest size",
873+
plot_file_path=get_file_path("Clusters_largest_size", parameters)
874874
)
875875

876876
clusters_by_largest_max_radius = get_clusters_by_criteria(
877877
data, by='clusterRadiusMax', ascending=False, cluster_count=20
878878
)
879879
plot_clusters(
880880
clustering_visualization_dataframe=clusters_by_largest_max_radius,
881-
title=f"{plot_prefix} clusters with the largest max radius",
882-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_max_radius", parameters)
881+
title=f"{title_prefix} clusters with the largest max radius",
882+
plot_file_path=get_file_path("Clusters_largest_max_radius", parameters)
883883
)
884884

885885
clusters_by_largest_average_radius = get_clusters_by_criteria(
886886
data, by='clusterRadiusAverage', ascending=False, cluster_count=20
887887
)
888888
plot_clusters(
889889
clustering_visualization_dataframe=clusters_by_largest_average_radius,
890-
title=f"{plot_prefix} clusters with the largest average radius",
891-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_largest_average_radius", parameters)
890+
title=f"{title_prefix} clusters with the largest average radius",
891+
plot_file_path=get_file_path("Clusters_largest_average_radius", parameters)
892892
)
893893

894894
plot_clusters_probabilities(
895895
clustering_visualization_dataframe=data,
896-
title=f"{plot_prefix} clustering probabilities (red=high uncertainty)",
897-
plot_file_path=get_file_path(f"{plot_prefix}_Cluster_probabilities", parameters)
896+
title=f"{title_prefix} clustering probabilities (red=high uncertainty)",
897+
plot_file_path=get_file_path("Cluster_probabilities", parameters)
898898
)
899899

900900
plot_cluster_noise(
901901
clustering_visualization_dataframe=data,
902-
title=f"{plot_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
902+
title=f"{title_prefix} clustering noise points that are surprisingly central (red) or popular (size)",
903903
size_column_name='degree',
904904
color_column_name='pageRank',
905-
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_highly_central_and_popular", parameters)
905+
plot_file_path=get_file_path("ClusterNoise_highly_central_and_popular", parameters)
906906
)
907907

908908
plot_cluster_noise(
909909
clustering_visualization_dataframe=data,
910-
title=f"{plot_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
910+
title=f"{title_prefix} clustering noise points that bridge flow (red) and are poorly integrated (size)",
911911
size_column_name='inverseClusteringCoefficient',
912912
color_column_name='betweenness',
913-
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_poorly_integrated_bridges", parameters),
913+
plot_file_path=get_file_path("ClusterNoise_poorly_integrated_bridges", parameters),
914914
downscale_normal_sizes=0.4
915915
)
916916

917917
plot_cluster_noise(
918918
clustering_visualization_dataframe=data,
919-
title=f"{plot_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
919+
title=f"{title_prefix} clustering noise points with role inversion (size) possibly violating layering or dependency direction (red)",
920920
size_column_name='pageToArticleRankDifference',
921921
color_column_name='betweenness',
922-
plot_file_path=get_file_path(f"{plot_prefix}_ClusterNoise_role_inverted_bridges", parameters)
922+
plot_file_path=get_file_path("ClusterNoise_role_inverted_bridges", parameters)
923923
)
924924

925925
driver.close()

domains/anomaly-detection/anomalyDetectionPython.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,18 +128,22 @@ anomaly_detection_using_python() {
128128

129129
echo "anomalyDetectionPipeline: $(date +'%Y-%m-%dT%H:%M:%S%z') Executing Python scripts for ${language} ${nodeLabel} nodes..."
130130

131+
# Within the absolute (full) report directory for anomaly detection, create a sub directory for every detailed type (Java_Package, Java_Type,...)
132+
local detail_report_directory="${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}"
133+
mkdir -p "${detail_report_directory}"
134+
131135
# Get tuned Leiden communities as a reference to tune clustering
132136
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedLeidenCommunityDetection.py" "${@}" ${verboseMode}
133137
# Tuned Fast Random Projection and tuned HDBSCAN clustering
134138
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedNodeEmbeddingClustering.py" "${@}" ${verboseMode}
135139
# Reduce the dimensionality of the node embeddings down to 2D for visualization using UMAP
136140
time "${ANOMALY_DETECTION_SCRIPT_DIR}/umap2dNodeEmbeddings.py" "${@}" ${verboseMode}
137141
# Plot the results with clustering and UMAP embeddings to reveal anomalies in rare feature combinations
138-
time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
142+
time "${ANOMALY_DETECTION_SCRIPT_DIR}/anomalyDetectionFeaturePlots.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode}
139143
# Run an unsupervised anomaly detection algorithm including tuning and explainability
140-
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${FULL_REPORT_DIRECTORY}" ${verboseMode}
144+
time "${ANOMALY_DETECTION_SCRIPT_DIR}/tunedAnomalyDetectionExplained.py" "${@}" "--report_directory" "${detail_report_directory}" ${verboseMode}
141145
# Query Results: Output all collected features into a CSV file.
142-
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${FULL_REPORT_DIRECTORY}/${language}_${nodeLabel}AnomalyDetection_Features.csv"
146+
execute_cypher "${ANOMALY_DETECTION_FEATURE_CYPHER_DIR}/AnomalyDetectionFeatures.cypher" "${@}" > "${detail_report_directory}/Anomaly_Features.csv"
143147
}
144148

145149
# Label code units with top anomalies by archetype.

0 commit comments

Comments
 (0)