Skip to content

Commit c7b0581

Browse files
committed
Reuse common visualization functions for more unified plots
1 parent b524736 commit c7b0581

File tree

4 files changed

+432
-177
lines changed

4 files changed

+432
-177
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ coverage/
9393
.ipynb_checkpoints
9494
*.nbconvert*
9595

96+
# Python
97+
__pycache__/
98+
9699
# Python environments
97100
.conda
98101

domains/anomaly-detection/anomalyDetectionFeaturePlots.py

Lines changed: 126 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import matplotlib.pyplot as plot
2929
import seaborn
3030

31+
from visualization import plot_annotation_style, annotate_each, annotate_each_with_index, scale_marker_sizes, zoom_into_center, zoom_into_center_while_preserving_scores_above_threshold, zoom_into_center_while_preserving_top_scores
3132

3233
class Parameters:
3334
required_parameters_ = ["projection_node_label"]
@@ -256,19 +257,6 @@ def get_clusters_by_criteria(
256257
return data[(data[by] >= threshold) | (data[label_column_name] == -1)]
257258

258259

259-
plot_annotation_style: dict = {
260-
'textcoords': 'offset points',
261-
'arrowprops': dict(arrowstyle='->', color='black', alpha=0.3),
262-
'fontsize': 6,
263-
'backgroundcolor': 'white',
264-
'bbox': dict(boxstyle='round,pad=0.4',
265-
edgecolor='silver',
266-
facecolor='whitesmoke',
267-
alpha=1
268-
)
269-
}
270-
271-
272260
def get_file_path(name: str, parameters: Parameters, extension: str = 'svg') -> str:
273261
name = parameters.get_report_directory() + '/' + name.replace(' ', '_') + '.' + extension
274262
if parameters.is_verbose():
@@ -322,7 +310,7 @@ def plot_difference_between_article_and_page_rank(
322310

323311
plot.figure(figsize=(10, 6))
324312
plot.hist(page_to_article_rank_difference, bins=50, color='blue', alpha=0.7, edgecolor='black')
325-
plot.title(title)
313+
plot.title(title, pad=20)
326314
plot.xlabel('Absolute difference between Page Rank and Article Rank')
327315
plot.ylabel('Frequency')
328316
plot.xlim(left=page_to_article_rank_difference.min(), right=page_to_article_rank_difference.max())
@@ -394,7 +382,7 @@ def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series,
394382
plot.figure(figsize=(10, 6))
395383
plot.figure(figsize=(10, 6))
396384
plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')
397-
plot.title(title)
385+
plot.title(title, pad=20)
398386
plot.xlabel('Clustering Coefficient')
399387
plot.ylabel('Frequency')
400388
plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())
@@ -443,7 +431,7 @@ def plot_clustering_coefficient_vs_page_rank(
443431

444432
plot.figure(figsize=(10, 6))
445433
plot.scatter(x=clustering_coefficients, y=page_ranks, alpha=0.7, color=color)
446-
plot.title(title)
434+
plot.title(title, pad=20)
447435
plot.xlabel('Clustering Coefficient')
448436
plot.ylabel('Page Rank')
449437

@@ -460,33 +448,35 @@ def plot_clustering_coefficient_vs_page_rank(
460448
'clusterNoise': clustering_noise,
461449
}, index=clustering_coefficients.index)
462450

451+
common_column_names_for_annotations = {
452+
"name_column": 'shortName',
453+
"x_position_column": 'clusteringCoefficient',
454+
"y_position_column": 'pageRank'
455+
}
456+
463457
# Annotate points with their names. Only keep points whose page rank exceeds the mean by more than 1.5 standard deviations
464458
mean_page_rank = page_ranks.mean()
465459
standard_deviation_page_rank = page_ranks.std()
466460
threshold_page_rank = mean_page_rank + 1.5 * standard_deviation_page_rank
467-
significant_points = combined_data[combined_data['pageRank'] > threshold_page_rank].reset_index(drop=True).head(10)
468-
for dataframe_index, row in significant_points.iterrows():
469-
index = typing.cast(int, dataframe_index)
470-
plot.annotate(
471-
text=row['shortName'],
472-
xy=(row['clusteringCoefficient'], row['pageRank']),
473-
xytext=(5, 5 + index * 10), # Offset y position for better visibility
474-
**plot_annotation_style
475-
)
461+
significant_points = combined_data[combined_data['pageRank'] > threshold_page_rank].sort_values(by='pageRank', ascending=False).reset_index(drop=True).head(10)
462+
annotate_each_with_index(
463+
significant_points,
464+
using=plot.annotate,
465+
value_column='pageRank',
466+
**common_column_names_for_annotations
467+
)
476468

477469
# Annotate points with the highest clustering coefficients (top 20) and only show the lowest 5 page ranks
478470
combined_data['page_rank_ranking'] = combined_data['pageRank'].rank(ascending=False).astype(int)
479471
combined_data['clustering_coefficient_ranking'] = combined_data['clusteringCoefficient'].rank(ascending=False).astype(int)
480472
top_clustering_coefficients = combined_data.sort_values(by='clusteringCoefficient', ascending=False).reset_index(drop=True).head(20)
481473
top_clustering_coefficients = top_clustering_coefficients.sort_values(by='pageRank', ascending=True).reset_index(drop=True).head(5)
482-
for dataframe_index, row in top_clustering_coefficients.iterrows():
483-
index = typing.cast(int, dataframe_index)
484-
plot.annotate(
485-
text=f"{row['shortName']} (score {row['pageRank']:.4f})",
486-
xy=(row['clusteringCoefficient'], row['pageRank']),
487-
xytext=(5, 5 + index * 10), # Offset y position for better visibility
488-
**plot_annotation_style
489-
)
474+
annotate_each_with_index(
475+
top_clustering_coefficients,
476+
using=plot.annotate,
477+
value_column='clusteringCoefficient',
478+
**common_column_names_for_annotations
479+
)
490480

491481
# plot.yscale('log') # Use logarithmic scale for better visibility of differences
492482
plot.grid(True)
@@ -523,14 +513,30 @@ def truncate(text: str, max_length: int):
523513
# Setup columns
524514
node_size_column = centrality_column_name
525515

516+
clustering_visualization_dataframe_zoomed=zoom_into_center(
517+
clustering_visualization_dataframe,
518+
x_position_column,
519+
y_position_column
520+
)
521+
522+
# Add column with scaled version of "node_size_column" for uniform marker scaling
523+
clustering_visualization_dataframe_zoomed[node_size_column + '_scaled'] = scale_marker_sizes(clustering_visualization_dataframe_zoomed[node_size_column])
524+
525+
def get_common_plot_parameters(data: pd.DataFrame) -> dict:
526+
return {
527+
"x": data[x_position_column],
528+
"y": data[y_position_column],
529+
"s": data[node_size_column + '_scaled'],
530+
}
531+
526532
# Separate HDBSCAN non-noise and noise nodes
527-
node_embeddings_without_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] != -1]
528-
node_embeddings_noise_only = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column_name] == -1]
533+
node_embeddings_without_noise = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column_name] != -1]
534+
node_embeddings_noise_only = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column_name] == -1]
529535

530536
# ------------------------------------------
531537
# Subplot: HDBSCAN Clustering with KDE
532538
# ------------------------------------------
533-
plot.title(title)
539+
plot.title(title, pad=20)
534540

535541
unique_cluster_labels = node_embeddings_without_noise[cluster_label_column_name].unique()
536542
hdbscan_color_palette = seaborn.color_palette(main_color_map, len(unique_cluster_labels))
@@ -541,9 +547,7 @@ def truncate(text: str, max_length: int):
541547

542548
# Plot noise points in gray
543549
plot.scatter(
544-
x=node_embeddings_noise_only[x_position_column],
545-
y=node_embeddings_noise_only[y_position_column],
546-
s=node_embeddings_noise_only[node_size_column] * 80 + 2,
550+
**get_common_plot_parameters(node_embeddings_noise_only),
547551
color='lightgrey',
548552
alpha=0.4,
549553
label="Noise"
@@ -576,23 +580,23 @@ def truncate(text: str, max_length: int):
576580

577581
# Node scatter points
578582
plot.scatter(
579-
x=cluster_nodes[x_position_column],
580-
y=cluster_nodes[y_position_column],
581-
s=cluster_nodes[node_size_column] * 80 + 2,
583+
**get_common_plot_parameters(cluster_nodes),
582584
color=hdbscan_cluster_to_color[cluster_label],
583585
alpha=0.9,
584586
label=f"Cluster {cluster_label}"
585587
)
586588

587589
# Annotate medoids of the cluster
588590
medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]
589-
for index, row in medoids.iterrows():
590-
plot.annotate(
591-
text=f"{truncate(row[code_unit_column_name], 30)} ({row[cluster_label_column_name]})",
592-
xy=(row[x_position_column], row[y_position_column]),
593-
xytext=(5, 5), # Offset for better visibility
594-
**plot_annotation_style
595-
)
591+
annotate_each(
592+
medoids,
593+
using=plot.annotate,
594+
name_column=code_unit_column_name,
595+
x_position_column=x_position_column,
596+
y_position_column=y_position_column,
597+
cluster_label_column=cluster_label_column_name,
598+
alpha=0.6
599+
)
596600

597601
plot.savefig(plot_file_path)
598602

@@ -609,40 +613,51 @@ def plot_clusters_probabilities(
609613
size_column: str = "pageRank",
610614
x_position_column: str = 'embeddingVisualizationX',
611615
y_position_column: str = 'embeddingVisualizationY',
616+
annotate_n_lowest_probabilities: int = 10
612617
) -> None:
613618

614619
if clustering_visualization_dataframe.empty:
615620
print("No projected data to plot available")
616621
return
617622

618-
def truncate(text: str, max_length: int = 22):
619-
if len(text) <= max_length:
620-
return text
621-
return text[:max_length - 3] + "..."
623+
clustering_visualization_dataframe_zoomed=zoom_into_center_while_preserving_top_scores(
624+
clustering_visualization_dataframe,
625+
x_position_column,
626+
y_position_column,
627+
cluster_probability_column,
628+
annotate_n_lowest_probabilities,
629+
lowest_scores=True
630+
)
622631

623-
cluster_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] == -1]
624-
cluster_non_noise = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] != -1]
625-
cluster_even_labels = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] % 2 == 0]
626-
cluster_odd_labels = clustering_visualization_dataframe[clustering_visualization_dataframe[cluster_label_column] % 2 == 1]
632+
# Add column with scaled version of "size_column" for uniform marker scaling
633+
clustering_visualization_dataframe_zoomed[size_column + '_scaled'] = scale_marker_sizes(clustering_visualization_dataframe_zoomed[size_column])
634+
635+
def get_common_plot_parameters(data: pd.DataFrame) -> dict:
636+
return {
637+
"x": data[x_position_column],
638+
"y": data[y_position_column],
639+
"s": data[size_column + '_scaled'],
640+
}
641+
642+
cluster_noise = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column] == -1]
643+
cluster_non_noise = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column] != -1]
644+
cluster_even_labels = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column] % 2 == 0]
645+
cluster_odd_labels = clustering_visualization_dataframe_zoomed[clustering_visualization_dataframe_zoomed[cluster_label_column] % 2 == 1]
627646

628647
plot.figure(figsize=(10, 10))
629-
plot.title(title)
648+
plot.title(title, pad=20)
630649

631650
# Plot noise
632651
plot.scatter(
633-
x=cluster_noise[x_position_column],
634-
y=cluster_noise[y_position_column],
635-
s=cluster_noise[size_column] * 10 + 2,
652+
**get_common_plot_parameters(cluster_noise),
636653
color='lightgrey',
637654
alpha=0.4,
638655
label='Noise'
639656
)
640657

641658
# Plot even labels
642659
plot.scatter(
643-
x=cluster_even_labels[x_position_column],
644-
y=cluster_even_labels[y_position_column],
645-
s=cluster_even_labels[size_column] * 10 + 2,
660+
**get_common_plot_parameters(cluster_even_labels),
646661
c=cluster_even_labels[cluster_probability_column],
647662
vmin=0.6,
648663
vmax=1.0,
@@ -653,9 +668,7 @@ def truncate(text: str, max_length: int = 22):
653668

654669
# Plot odd labels
655670
plot.scatter(
656-
x=cluster_odd_labels[x_position_column],
657-
y=cluster_odd_labels[y_position_column],
658-
s=cluster_odd_labels[size_column] * 10 + 2,
671+
**get_common_plot_parameters(cluster_odd_labels),
659672
c=cluster_odd_labels[cluster_probability_column],
660673
vmin=0.6,
661674
vmax=1.0,
@@ -665,28 +678,33 @@ def truncate(text: str, max_length: int = 22):
665678
)
666679

667680
# Annotate medoids of the cluster
668-
cluster_medoids = cluster_non_noise[cluster_non_noise[cluster_medoid_column] == 1].sort_values(by=cluster_size_column, ascending=False).head(20)
669-
for index, row in cluster_medoids.iterrows():
670-
mean_cluster_probability = cluster_non_noise[cluster_non_noise[cluster_label_column] == row[cluster_label_column]][cluster_probability_column].mean()
671-
plot.annotate(
672-
text=f"{truncate(row[code_unit_column])} (cluster {row[cluster_label_column]}) (p={mean_cluster_probability:.4f})",
673-
xy=(row[x_position_column], row[y_position_column]),
674-
xytext=(5, 5),
675-
alpha=0.4,
676-
**plot_annotation_style
677-
)
681+
# Find center node of each cluster (medoid), sort them by cluster size descending and add a mean cluster probability column
682+
cluster_medoids = cluster_non_noise[cluster_non_noise[cluster_medoid_column] == 1]
683+
cluster_medoids_by_cluster_size = cluster_medoids.sort_values(by=cluster_size_column, ascending=False).head(20)
684+
mean_probabilities = cluster_non_noise.groupby(cluster_label_column)[cluster_probability_column].mean().rename('mean_cluster_probability')
685+
cluster_medoids_with_mean_probabilites = cluster_medoids_by_cluster_size.merge(mean_probabilities, on=cluster_label_column, how='left')
686+
687+
annotate_each(
688+
cluster_medoids_with_mean_probabilites,
689+
using=plot.annotate,
690+
name_column=code_unit_column,
691+
x_position_column=x_position_column,
692+
y_position_column=y_position_column,
693+
cluster_label_column=cluster_label_column,
694+
probability_column='mean_cluster_probability',
695+
alpha=0.4
696+
)
678697

679-
lowest_probabilities = cluster_non_noise.sort_values(by=cluster_probability_column, ascending=True).reset_index().head(10)
680-
lowest_probabilities_in_reverse_order = lowest_probabilities.iloc[::-1] # plot most important annotations last to overlap less important ones
681-
for dataframe_index, row in lowest_probabilities_in_reverse_order.iterrows():
682-
index = typing.cast(int, dataframe_index)
683-
plot.annotate(
684-
text=f"#{index}:{truncate(row[code_unit_column], 20)} ({row[cluster_probability_column]:.4f})",
685-
xy=(row[x_position_column], row[y_position_column]),
686-
xytext=(5, 5 + index * 10),
687-
color='red',
688-
**plot_annotation_style
689-
)
698+
lowest_probabilities = cluster_non_noise.sort_values(by=cluster_probability_column, ascending=True).reset_index().head(annotate_n_lowest_probabilities)
699+
annotate_each_with_index(
700+
lowest_probabilities,
701+
using=plot.annotate,
702+
name_column=code_unit_column,
703+
x_position_column=x_position_column,
704+
y_position_column=y_position_column,
705+
probability_column=cluster_probability_column,
706+
color="red"
707+
)
690708

691709
plot.savefig(plot_file_path)
692710

@@ -715,35 +733,45 @@ def plot_cluster_noise(
715733
return
716734

717735
plot.figure(figsize=(10, 10))
718-
plot.title(title)
736+
plot.title(title, pad=20)
719737

720738
# Determine the color threshold for noise points
721739
color_10th_highest_value = noise_points[color_column_name].nlargest(10).iloc[-1] # Get the 10th largest value
722740
color_90_quantile = noise_points[color_column_name].quantile(0.90)
723741
color_threshold = max(color_10th_highest_value, color_90_quantile)
724742

743+
noise_points_zoomed = zoom_into_center_while_preserving_scores_above_threshold(
744+
noise_points,
745+
x_position_column,
746+
y_position_column,
747+
color_column_name,
748+
color_threshold
749+
)
750+
725751
# Color points whose color column value is at or above the threshold (the larger of the 10th highest value and the 90% quantile) red, the rest light grey
726-
colors = noise_points[color_column_name].apply(
752+
colors = noise_points_zoomed[color_column_name].apply(
727753
lambda x: "red" if x >= color_threshold else "lightgrey"
728754
)
729-
normalized_size = noise_points[size_column_name] / noise_points[size_column_name].max()
755+
756+
# Add column with scaled version of "size_column_name" for uniform marker scaling
757+
noise_points_zoomed[size_column_name + '_scaled'] = scale_marker_sizes(noise_points_zoomed[size_column_name])
730758

731759
# Scatter plot for noise points
732760
plot.scatter(
733-
x=noise_points[x_position_column],
734-
y=noise_points[y_position_column],
735-
s=normalized_size.clip(lower=0.01) * 200 + 2,
761+
x=noise_points_zoomed[x_position_column],
762+
y=noise_points_zoomed[y_position_column],
763+
s=noise_points_zoomed[size_column_name + '_scaled'],
736764
c=colors,
737765
alpha=0.6
738766
)
739767

740768
# Annotate all red points (color value at or above the threshold) with their index and name
741-
for index, row in noise_points.iterrows():
769+
top_red_noise_points = noise_points_zoomed[noise_points_zoomed[color_column_name] >= color_threshold].reset_index(drop=True)
770+
top_red_noise_points_reversed_order = top_red_noise_points.iloc[::-1]
771+
for index, row in top_red_noise_points_reversed_order.iterrows():
742772
index = typing.cast(int, index)
743-
if colors[index] != 'red' and index >= 10:
744-
continue
745773
plot.annotate(
746-
text=row[code_unit_column_name],
774+
text=f"#{index}: {row[code_unit_column_name]}",
747775
xy=(row[x_position_column], row[y_position_column]),
748776
xytext=(5, 5 + (index % 2) * 20), # Offset for better visibility
749777
**plot_annotation_style

0 commit comments

Comments
 (0)