JohT
diff --git a/.gitignore b/.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/domains/anomaly-detection/anomalyDetectionFeaturePlots.py b/domains/anomaly-detection/anomalyDetectionFeaturePlots.py
Lines changed: 100 additions & 82 deletions
@@ -93,6 +93,9 @@ coverage/
 .ipynb_checkpoints
 *.nbconvert*
 
+# Python
+__pycache__/
+
 # Python environments
 .conda
 
 
@@ -28,6 +28,7 @@
 import matplotlib.pyplot as plot
 import seaborn
 
+from visualization import plot_annotation_style, annotate_each, annotate_each_with_index, zoom_into_center, zoom_into_center_while_preserving_scores_above_threshold, zoom_into_center_while_preserving_top_scores
 
 class Parameters:
     required_parameters_ = ["projection_node_label"]
@@ -256,19 +257,6 @@ def get_clusters_by_criteria(
     return data[(data[by] >= threshold) | (data[label_column_name] == -1)]
 
 
-plot_annotation_style: dict = {
-    'textcoords': 'offset points',
-    'arrowprops': dict(arrowstyle='->', color='black', alpha=0.3),
-    'fontsize': 6,
-    'backgroundcolor': 'white',
-    'bbox': dict(boxstyle='round,pad=0.4',
-                 edgecolor='silver',
-                 facecolor='whitesmoke',
-                 alpha=1
-                 )
-}
-
-
 def get_file_path(name: str, parameters: Parameters, extension: str = 'svg') -> str:
     name = parameters.get_report_directory() + '/' + name.replace(' ', '_') + '.' + extension
     if parameters.is_verbose():
@@ -460,33 +448,35 @@ def plot_clustering_coefficient_vs_page_rank(
         'clusterNoise': clustering_noise,
     }, index=clustering_coefficients.index)
 
+    common_column_names_for_annotations = {
+        "name_column": 'shortName',
+        "x_position_column": 'clusteringCoefficient', 
+        "y_position_column": 'pageRank'
+    }
+
     # Annotate points with their names. Filter out values with a page rank smaller than 1.5 standard deviations
     mean_page_rank = page_ranks.mean()
     standard_deviation_page_rank = page_ranks.std()
     threshold_page_rank = mean_page_rank + 1.5 * standard_deviation_page_rank
-    significant_points = combined_data[combined_data['pageRank'] > threshold_page_rank].reset_index(drop=True).head(10)
-    for dataframe_index, row in significant_points.iterrows():
-        index = typing.cast(int, dataframe_index)
-        plot.annotate(
-            text=row['shortName'],
-            xy=(row['clusteringCoefficient'], row['pageRank']),
-            xytext=(5, 5 + index * 10),  # Offset y position for better visibility
-            **plot_annotation_style
-        )
+    significant_points = combined_data[combined_data['pageRank'] > threshold_page_rank].sort_values(by='pageRank', ascending=False).reset_index(drop=True).head(10)
+    annotate_each_with_index(
+        significant_points, 
+        using=plot.annotate, 
+        value_column='pageRank',
+        **common_column_names_for_annotations
+    )
 
     # Annotate points with the highest clustering coefficients (top 20) and only show the lowest 5 page ranks
     combined_data['page_rank_ranking'] = combined_data['pageRank'].rank(ascending=False).astype(int)
     combined_data['clustering_coefficient_ranking'] = combined_data['clusteringCoefficient'].rank(ascending=False).astype(int)
     top_clustering_coefficients = combined_data.sort_values(by='clusteringCoefficient', ascending=False).reset_index(drop=True).head(20)
     top_clustering_coefficients = top_clustering_coefficients.sort_values(by='pageRank', ascending=True).reset_index(drop=True).head(5)
-    for dataframe_index, row in top_clustering_coefficients.iterrows():
-        index = typing.cast(int, dataframe_index)
-        plot.annotate(
-            text=f"{row['shortName']} (score {row['pageRank']:.4f})",
-            xy=(row['clusteringCoefficient'], row['pageRank']),
-            xytext=(5, 5 + index * 10),  # Offset y position for better visibility
-            **plot_annotation_style
-        )
+    annotate_each_with_index(
+        top_clustering_coefficients, 
+        using=plot.annotate, 
+        value_column='clusteringCoefficient',
+        **common_column_names_for_annotations
+    )
 
     # plot.yscale('log')  # Use logarithmic scale for better visibility of differences
     plot.grid(True)
@@ -523,9 +513,16 @@ def truncate(text: str, max_length: int):
     # Setup columns
     node_size_column = centrality_column_name
 
+    clustering_visualization_dataframe_zoomed=zoom_into_center(
+        clustering_visualization_dataframe, 
+        x_position_column, 
+        y_position_column,
+        percentile_of_distance_to_center=0.9
+    )
+
     # Separate HDBSCAN non-noise and noise nodes
-    node_embeddings_without_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] != -1]
-    node_embeddings_noise_only = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]
+    node_embeddings_without_noise = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column_name] != -1]
+    node_embeddings_noise_only = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column_name] == -1]
 
     # ------------------------------------------
     # Subplot: HDBSCAN Clustering with KDE
@@ -586,13 +583,15 @@ def truncate(text: str, max_length: int):
 
         # Annotate medoids of the cluster
         medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]
-        for index, row in medoids.iterrows():
-            plot.annotate(
-                text=f"{truncate(row[code_unit_column_name], 30)} ({row[cluster_label_column_name]})",
-                xy=(row[x_position_column], row[y_position_column]),
-                xytext=(5, 5),  # Offset for better visibility
-                **plot_annotation_style
-            )
+        annotate_each(
+            medoids,
+            using=plot.annotate,
+            name_column=code_unit_column_name,
+            x_position_column=x_position_column,
+            y_position_column=y_position_column,
+            cluster_label_column=cluster_label_column_name,
+            alpha=0.6
+        )
 
     plot.savefig(plot_file_path)
 
@@ -609,40 +608,48 @@ def plot_clusters_probabilities(
     size_column: str = "pageRank",
     x_position_column: str = 'embeddingVisualizationX',
     y_position_column: str = 'embeddingVisualizationY',
+    annotate_n_lowest_probabilities: int = 10
 ) -> None:
 
     if clustering_visualization_dataframe.empty:
         print("No projected data to plot available")
         return
 
-    def truncate(text: str, max_length: int = 22):
-        if len(text) <= max_length:
-            return text
-        return text[:max_length - 3] + "..."
+    clustering_visualization_dataframe_zoomed=zoom_into_center_while_preserving_top_scores(
+        clustering_visualization_dataframe, 
+        x_position_column, 
+        y_position_column, 
+        cluster_probability_column,
+        annotate_n_lowest_probabilities,
+        lowest_scores=True
+    )
+
+    cluster_noise = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column] == -1]
+    cluster_non_noise = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column] != -1]
+    cluster_even_labels = cluster_non_noise[cluster_non_noise[cluster_label_column] % 2 == 0]
+    cluster_odd_labels = cluster_non_noise[cluster_non_noise[cluster_label_column] % 2 == 1]  # -1 % 2 == 1: select from non-noise so noise (-1) is not plotted a second time as an odd cluster
 
-    cluster_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] == -1]
-    cluster_non_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] != -1]
-    cluster_even_labels = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] % 2 == 0]
-    cluster_odd_labels = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] % 2 == 1]
+    def get_common_plot_parameters(data: pd.DataFrame) -> dict:
+        """Return the scatter arguments (x, y and marker size) shared by the noise, even and odd layers."""
+        horizontal_positions = data[x_position_column]
+        vertical_positions = data[y_position_column]
+        marker_sizes = data[size_column] * 10 + 2
+        return dict(x=horizontal_positions, y=vertical_positions, s=marker_sizes)
 
     plot.figure(figsize=(10, 10))
     plot.title(title)
 
     # Plot noise
     plot.scatter(
-        x=cluster_noise[x_position_column],
-        y=cluster_noise[y_position_column],
-        s=cluster_noise[size_column] * 10 + 2,
+        **get_common_plot_parameters(cluster_noise),
         color='lightgrey',
         alpha=0.4,
         label='Noise'
     )
 
     # Plot even labels
     plot.scatter(
-        x=cluster_even_labels[x_position_column],
-        y=cluster_even_labels[y_position_column],
-        s=cluster_even_labels[size_column] * 10 + 2,
+        **get_common_plot_parameters(cluster_even_labels),
         c=cluster_even_labels[cluster_probability_column],
         vmin=0.6,
         vmax=1.0,
@@ -653,9 +660,7 @@ def truncate(text: str, max_length: int = 22):
 
     # Plot odd labels
     plot.scatter(
-        x=cluster_odd_labels[x_position_column],
-        y=cluster_odd_labels[y_position_column],
-        s=cluster_odd_labels[size_column] * 10 + 2,
+        **get_common_plot_parameters(cluster_odd_labels),
         c=cluster_odd_labels[cluster_probability_column],
         vmin=0.6,
         vmax=1.0,
@@ -665,28 +670,33 @@ def truncate(text: str, max_length: int = 22):
     )
 
     # Annotate medoids of the cluster
-    cluster_medoids = cluster_non_noise[cluster_non_noise[cluster_medoid_column] == 1].sort_values(by=cluster_size_column, ascending=False).head(20)
-    for index, row in cluster_medoids.iterrows():
-        mean_cluster_probability = cluster_non_noise[cluster_non_noise[cluster_label_column] == row[cluster_label_column]][cluster_probability_column].mean()
-        plot.annotate(
-            text=f"{truncate(row[code_unit_column])} (cluster {row[cluster_label_column]}) (p={mean_cluster_probability:.4f})",
-            xy=(row[x_position_column], row[y_position_column]),
-            xytext=(5, 5),
-            alpha=0.4,
-            **plot_annotation_style
-        )
+    # Take the medoid (center node) of each cluster, keep those of the 20 largest clusters and attach each cluster's mean probability
+    mean_probability_per_cluster = cluster_non_noise.groupby(cluster_label_column)[cluster_probability_column].mean().rename('mean_cluster_probability')
+    is_medoid = cluster_non_noise[cluster_medoid_column] == 1
+    largest_cluster_medoids = cluster_non_noise[is_medoid].sort_values(by=cluster_size_column, ascending=False).head(20)
+    cluster_medoids_with_mean_probabilities = largest_cluster_medoids.merge(mean_probability_per_cluster, on=cluster_label_column, how='left')
+
+    annotate_each(
+        cluster_medoids_with_mean_probabilities,
+        using=plot.annotate,
+        name_column=code_unit_column,
+        x_position_column=x_position_column,
+        y_position_column=y_position_column,
+        cluster_label_column=cluster_label_column,
+        probability_column='mean_cluster_probability',
+        alpha=0.4
+    )
 
-    lowest_probabilities = cluster_non_noise.sort_values(by=cluster_probability_column, ascending=True).reset_index().head(10)
-    lowest_probabilities_in_reverse_order = lowest_probabilities.iloc[::-1] # plot most important annotations last to overlap less important ones
-    for dataframe_index, row in lowest_probabilities_in_reverse_order.iterrows():
-        index = typing.cast(int, dataframe_index)
-        plot.annotate(
-            text=f"#{index}:{truncate(row[code_unit_column], 20)} ({row[cluster_probability_column]:.4f})",
-            xy=(row[x_position_column], row[y_position_column]),
-            xytext=(5, 5 + index * 10),
-            color='red',
-            **plot_annotation_style
-        )
+    lowest_probabilities = cluster_non_noise.sort_values(by=cluster_probability_column, ascending=True).reset_index().head(annotate_n_lowest_probabilities)
+    annotate_each_with_index(
+        lowest_probabilities,
+        using=plot.annotate,
+        name_column=code_unit_column,
+        x_position_column=x_position_column,
+        y_position_column=y_position_column,
+        probability_column=cluster_probability_column,
+        color="red"
+    )
 
     plot.savefig(plot_file_path)
 
@@ -722,23 +732,31 @@ def plot_cluster_noise(
     color_90_quantile = noise_points[color_column_name].quantile(0.90)
     color_threshold = max(color_10th_highest_value, color_90_quantile)
 
+    noise_points_zoomed = zoom_into_center_while_preserving_scores_above_threshold(
+        noise_points,
+        x_position_column,
+        y_position_column,
+        color_column_name,
+        color_threshold
+    )
+
     # Color the color column values above the 90% quantile threshold red, the rest light grey
-    colors = noise_points[color_column_name].apply(
+    colors = noise_points_zoomed[color_column_name].apply(
         lambda x: "red" if x >= color_threshold else "lightgrey"
     )
-    normalized_size = noise_points[size_column_name] / noise_points[size_column_name].max()
+    normalized_size = noise_points_zoomed[size_column_name] / noise_points_zoomed[size_column_name].max()
 
     # Scatter plot for noise points
     plot.scatter(
-        x=noise_points[x_position_column],
-        y=noise_points[y_position_column],
+        x=noise_points_zoomed[x_position_column],
+        y=noise_points_zoomed[y_position_column],
         s=normalized_size.clip(lower=0.01) * 200 + 2,
         c=colors,
         alpha=0.6
     )
 
     # Annotate the largest 10 points and all colored ones with their names
-    for index, row in noise_points.iterrows():
+    for index, row in noise_points_zoomed.iterrows():
         index = typing.cast(int, index)
         if colors[index] != 'red' and index >= 10:
             continue