2828import matplotlib .pyplot as plot
2929import seaborn
3030
31+ from visualization import plot_annotation_style , annotate_each , annotate_each_with_index , scale_marker_sizes , zoom_into_center , zoom_into_center_while_preserving_scores_above_threshold , zoom_into_center_while_preserving_top_scores
3132
3233class Parameters :
3334 required_parameters_ = ["projection_node_label" ]
@@ -256,19 +257,6 @@ def get_clusters_by_criteria(
256257 return data [(data [by ] >= threshold ) | (data [label_column_name ] == - 1 )]
257258
258259
259- plot_annotation_style : dict = {
260- 'textcoords' : 'offset points' ,
261- 'arrowprops' : dict (arrowstyle = '->' , color = 'black' , alpha = 0.3 ),
262- 'fontsize' : 6 ,
263- 'backgroundcolor' : 'white' ,
264- 'bbox' : dict (boxstyle = 'round,pad=0.4' ,
265- edgecolor = 'silver' ,
266- facecolor = 'whitesmoke' ,
267- alpha = 1
268- )
269- }
270-
271-
272260def get_file_path (name : str , parameters : Parameters , extension : str = 'svg' ) -> str :
273261 name = parameters .get_report_directory () + '/' + name .replace (' ' , '_' ) + '.' + extension
274262 if parameters .is_verbose ():
@@ -322,7 +310,7 @@ def plot_difference_between_article_and_page_rank(
322310
323311 plot .figure (figsize = (10 , 6 ))
324312 plot .hist (page_to_article_rank_difference , bins = 50 , color = 'blue' , alpha = 0.7 , edgecolor = 'black' )
325- plot .title (title )
313+ plot .title (title , pad = 20 )
326314 plot .xlabel ('Absolute difference between Page Rank and Article Rank' )
327315 plot .ylabel ('Frequency' )
328316 plot .xlim (left = page_to_article_rank_difference .min (), right = page_to_article_rank_difference .max ())
@@ -394,7 +382,7 @@ def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series,
394382 plot .figure (figsize = (10 , 6 ))
395383 plot .figure (figsize = (10 , 6 ))
396384 plot .hist (clustering_coefficients , bins = 40 , color = 'blue' , alpha = 0.7 , edgecolor = 'black' )
397- plot .title (title )
385+ plot .title (title , pad = 20 )
398386 plot .xlabel ('Clustering Coefficient' )
399387 plot .ylabel ('Frequency' )
400388 plot .xlim (left = clustering_coefficients .min (), right = clustering_coefficients .max ())
@@ -443,7 +431,7 @@ def plot_clustering_coefficient_vs_page_rank(
443431
444432 plot .figure (figsize = (10 , 6 ))
445433 plot .scatter (x = clustering_coefficients , y = page_ranks , alpha = 0.7 , color = color )
446- plot .title (title )
434+ plot .title (title , pad = 20 )
447435 plot .xlabel ('Clustering Coefficient' )
448436 plot .ylabel ('Page Rank' )
449437
@@ -460,33 +448,35 @@ def plot_clustering_coefficient_vs_page_rank(
460448 'clusterNoise' : clustering_noise ,
461449 }, index = clustering_coefficients .index )
462450
451+ common_column_names_for_annotations = {
452+ "name_column" : 'shortName' ,
453+ "x_position_column" : 'clusteringCoefficient' ,
454+ "y_position_column" : 'pageRank'
455+ }
456+
463457 # Annotate points with their names. Only keep values with a page rank more than 1.5 standard deviations above the mean (at most 10, highest first)
464458 mean_page_rank = page_ranks .mean ()
465459 standard_deviation_page_rank = page_ranks .std ()
466460 threshold_page_rank = mean_page_rank + 1.5 * standard_deviation_page_rank
467- significant_points = combined_data [combined_data ['pageRank' ] > threshold_page_rank ].reset_index (drop = True ).head (10 )
468- for dataframe_index , row in significant_points .iterrows ():
469- index = typing .cast (int , dataframe_index )
470- plot .annotate (
471- text = row ['shortName' ],
472- xy = (row ['clusteringCoefficient' ], row ['pageRank' ]),
473- xytext = (5 , 5 + index * 10 ), # Offset y position for better visibility
474- ** plot_annotation_style
475- )
461+ significant_points = combined_data [combined_data ['pageRank' ] > threshold_page_rank ].sort_values (by = 'pageRank' , ascending = False ).reset_index (drop = True ).head (10 )
462+ annotate_each_with_index (
463+ significant_points ,
464+ using = plot .annotate ,
465+ value_column = 'pageRank' ,
466+ ** common_column_names_for_annotations
467+ )
476468
477469 # Annotate points with the highest clustering coefficients (top 20) and only show the lowest 5 page ranks
478470 combined_data ['page_rank_ranking' ] = combined_data ['pageRank' ].rank (ascending = False ).astype (int )
479471 combined_data ['clustering_coefficient_ranking' ] = combined_data ['clusteringCoefficient' ].rank (ascending = False ).astype (int )
480472 top_clustering_coefficients = combined_data .sort_values (by = 'clusteringCoefficient' , ascending = False ).reset_index (drop = True ).head (20 )
481473 top_clustering_coefficients = top_clustering_coefficients .sort_values (by = 'pageRank' , ascending = True ).reset_index (drop = True ).head (5 )
482- for dataframe_index , row in top_clustering_coefficients .iterrows ():
483- index = typing .cast (int , dataframe_index )
484- plot .annotate (
485- text = f"{ row ['shortName' ]} (score { row ['pageRank' ]:.4f} )" ,
486- xy = (row ['clusteringCoefficient' ], row ['pageRank' ]),
487- xytext = (5 , 5 + index * 10 ), # Offset y position for better visibility
488- ** plot_annotation_style
489- )
474+ annotate_each_with_index (
475+ top_clustering_coefficients ,
476+ using = plot .annotate ,
477+ value_column = 'clusteringCoefficient' ,
478+ ** common_column_names_for_annotations
479+ )
490480
491481 # plot.yscale('log') # Use logarithmic scale for better visibility of differences
492482 plot .grid (True )
@@ -523,14 +513,30 @@ def truncate(text: str, max_length: int):
523513 # Setup columns
524514 node_size_column = centrality_column_name
525515
516+ clustering_visualization_dataframe_zoomed = zoom_into_center (
517+ clustering_visualization_dataframe ,
518+ x_position_column ,
519+ y_position_column
520+ )
521+
522+ # Add column with scaled version of "node_size_column" for uniform marker scaling
523+ clustering_visualization_dataframe_zoomed [node_size_column + '_scaled' ] = scale_marker_sizes (clustering_visualization_dataframe_zoomed [node_size_column ])
524+
525+ def get_common_plot_parameters (data : pd .DataFrame ) -> dict :
526+ return {
527+ "x" : data [x_position_column ],
528+ "y" : data [y_position_column ],
529+ "s" : data [node_size_column + '_scaled' ],
530+ }
531+
526532 # Separate HDBSCAN non-noise and noise nodes
527- node_embeddings_without_noise = clustering_visualization_dataframe [ clustering_visualization_dataframe [cluster_label_column_name ] != - 1 ]
528- node_embeddings_noise_only = clustering_visualization_dataframe [ clustering_visualization_dataframe [cluster_label_column_name ] == - 1 ]
533+ node_embeddings_without_noise = clustering_visualization_dataframe_zoomed [ clustering_visualization_dataframe_zoomed [cluster_label_column_name ] != - 1 ]
534+ node_embeddings_noise_only = clustering_visualization_dataframe_zoomed [ clustering_visualization_dataframe_zoomed [cluster_label_column_name ] == - 1 ]
529535
530536 # ------------------------------------------
531537 # Subplot: HDBSCAN Clustering with KDE
532538 # ------------------------------------------
533- plot .title (title )
539+ plot .title (title , pad = 20 )
534540
535541 unique_cluster_labels = node_embeddings_without_noise [cluster_label_column_name ].unique ()
536542 hdbscan_color_palette = seaborn .color_palette (main_color_map , len (unique_cluster_labels ))
@@ -541,9 +547,7 @@ def truncate(text: str, max_length: int):
541547
542548 # Plot noise points in gray
543549 plot .scatter (
544- x = node_embeddings_noise_only [x_position_column ],
545- y = node_embeddings_noise_only [y_position_column ],
546- s = node_embeddings_noise_only [node_size_column ] * 80 + 2 ,
550+ ** get_common_plot_parameters (node_embeddings_noise_only ),
547551 color = 'lightgrey' ,
548552 alpha = 0.4 ,
549553 label = "Noise"
@@ -576,23 +580,23 @@ def truncate(text: str, max_length: int):
576580
577581 # Node scatter points
578582 plot .scatter (
579- x = cluster_nodes [x_position_column ],
580- y = cluster_nodes [y_position_column ],
581- s = cluster_nodes [node_size_column ] * 80 + 2 ,
583+ ** get_common_plot_parameters (cluster_nodes ),
582584 color = hdbscan_cluster_to_color [cluster_label ],
583585 alpha = 0.9 ,
584586 label = f"Cluster { cluster_label } "
585587 )
586588
587589 # Annotate medoids of the cluster
588590 medoids = cluster_nodes [cluster_nodes [cluster_medoid_column_name ] == 1 ]
589- for index , row in medoids .iterrows ():
590- plot .annotate (
591- text = f"{ truncate (row [code_unit_column_name ], 30 )} ({ row [cluster_label_column_name ]} )" ,
592- xy = (row [x_position_column ], row [y_position_column ]),
593- xytext = (5 , 5 ), # Offset for better visibility
594- ** plot_annotation_style
595- )
591+ annotate_each (
592+ medoids ,
593+ using = plot .annotate ,
594+ name_column = code_unit_column_name ,
595+ x_position_column = x_position_column ,
596+ y_position_column = y_position_column ,
597+ cluster_label_column = cluster_label_column_name ,
598+ alpha = 0.6
599+ )
596600
597601 plot .savefig (plot_file_path )
598602
@@ -609,40 +613,51 @@ def plot_clusters_probabilities(
609613 size_column : str = "pageRank" ,
610614 x_position_column : str = 'embeddingVisualizationX' ,
611615 y_position_column : str = 'embeddingVisualizationY' ,
616+ annotate_n_lowest_probabilities : int = 10
612617) -> None :
613618
614619 if clustering_visualization_dataframe .empty :
615620 print ("No projected data to plot available" )
616621 return
617622
618- def truncate (text : str , max_length : int = 22 ):
619- if len (text ) <= max_length :
620- return text
621- return text [:max_length - 3 ] + "..."
623+ clustering_visualization_dataframe_zoomed = zoom_into_center_while_preserving_top_scores (
624+ clustering_visualization_dataframe ,
625+ x_position_column ,
626+ y_position_column ,
627+ cluster_probability_column ,
628+ annotate_n_lowest_probabilities ,
629+ lowest_scores = True
630+ )
622631
623- cluster_noise = clustering_visualization_dataframe [clustering_visualization_dataframe [cluster_label_column ] == - 1 ]
624- cluster_non_noise = clustering_visualization_dataframe [clustering_visualization_dataframe [cluster_label_column ] != - 1 ]
625- cluster_even_labels = clustering_visualization_dataframe [clustering_visualization_dataframe [cluster_label_column ] % 2 == 0 ]
626- cluster_odd_labels = clustering_visualization_dataframe [clustering_visualization_dataframe [cluster_label_column ] % 2 == 1 ]
632+ # Add column with scaled version of "size_column" for uniform marker scaling
633+ clustering_visualization_dataframe_zoomed [size_column + '_scaled' ] = scale_marker_sizes (clustering_visualization_dataframe_zoomed [size_column ])
634+
635+ def get_common_plot_parameters (data : pd .DataFrame ) -> dict :
636+ return {
637+ "x" : data [x_position_column ],
638+ "y" : data [y_position_column ],
639+ "s" : data [size_column + '_scaled' ],
640+ }
641+
642+ cluster_noise = clustering_visualization_dataframe_zoomed [clustering_visualization_dataframe_zoomed [cluster_label_column ] == - 1 ]
643+ cluster_non_noise = clustering_visualization_dataframe_zoomed [clustering_visualization_dataframe_zoomed [cluster_label_column ] != - 1 ]
644+ cluster_even_labels = clustering_visualization_dataframe_zoomed [clustering_visualization_dataframe_zoomed [cluster_label_column ] % 2 == 0 ]
645+ cluster_odd_labels = clustering_visualization_dataframe_zoomed [clustering_visualization_dataframe_zoomed [cluster_label_column ] % 2 == 1 ]
627646
628647 plot .figure (figsize = (10 , 10 ))
629- plot .title (title )
648+ plot .title (title , pad = 20 )
630649
631650 # Plot noise
632651 plot .scatter (
633- x = cluster_noise [x_position_column ],
634- y = cluster_noise [y_position_column ],
635- s = cluster_noise [size_column ] * 10 + 2 ,
652+ ** get_common_plot_parameters (cluster_noise ),
636653 color = 'lightgrey' ,
637654 alpha = 0.4 ,
638655 label = 'Noise'
639656 )
640657
641658 # Plot even labels
642659 plot .scatter (
643- x = cluster_even_labels [x_position_column ],
644- y = cluster_even_labels [y_position_column ],
645- s = cluster_even_labels [size_column ] * 10 + 2 ,
660+ ** get_common_plot_parameters (cluster_even_labels ),
646661 c = cluster_even_labels [cluster_probability_column ],
647662 vmin = 0.6 ,
648663 vmax = 1.0 ,
@@ -653,9 +668,7 @@ def truncate(text: str, max_length: int = 22):
653668
654669 # Plot odd labels
655670 plot .scatter (
656- x = cluster_odd_labels [x_position_column ],
657- y = cluster_odd_labels [y_position_column ],
658- s = cluster_odd_labels [size_column ] * 10 + 2 ,
671+ ** get_common_plot_parameters (cluster_odd_labels ),
659672 c = cluster_odd_labels [cluster_probability_column ],
660673 vmin = 0.6 ,
661674 vmax = 1.0 ,
@@ -665,28 +678,33 @@ def truncate(text: str, max_length: int = 22):
665678 )
666679
667680 # Annotate medoids of the cluster
668- cluster_medoids = cluster_non_noise [cluster_non_noise [cluster_medoid_column ] == 1 ].sort_values (by = cluster_size_column , ascending = False ).head (20 )
669- for index , row in cluster_medoids .iterrows ():
670- mean_cluster_probability = cluster_non_noise [cluster_non_noise [cluster_label_column ] == row [cluster_label_column ]][cluster_probability_column ].mean ()
671- plot .annotate (
672- text = f"{ truncate (row [code_unit_column ])} (cluster { row [cluster_label_column ]} ) (p={ mean_cluster_probability :.4f} )" ,
673- xy = (row [x_position_column ], row [y_position_column ]),
674- xytext = (5 , 5 ),
675- alpha = 0.4 ,
676- ** plot_annotation_style
677- )
681+ # Find center node of each cluster (medoid), sort them by cluster size descending and add a mean cluster probability column
682+ cluster_medoids = cluster_non_noise [cluster_non_noise [cluster_medoid_column ] == 1 ]
683+ cluster_medoids_by_cluster_size = cluster_medoids .sort_values (by = cluster_size_column , ascending = False ).head (20 )
684+ mean_probabilities = cluster_non_noise .groupby (cluster_label_column )[cluster_probability_column ].mean ().rename ('mean_cluster_probability' )
685+ cluster_medoids_with_mean_probabilites = cluster_medoids_by_cluster_size .merge (mean_probabilities , on = cluster_label_column , how = 'left' )
686+
687+ annotate_each (
688+ cluster_medoids_with_mean_probabilites ,
689+ using = plot .annotate ,
690+ name_column = code_unit_column ,
691+ x_position_column = x_position_column ,
692+ y_position_column = y_position_column ,
693+ cluster_label_column = cluster_label_column ,
694+ probability_column = 'mean_cluster_probability' ,
695+ alpha = 0.4
696+ )
678697
679- lowest_probabilities = cluster_non_noise .sort_values (by = cluster_probability_column , ascending = True ).reset_index ().head (10 )
680- lowest_probabilities_in_reverse_order = lowest_probabilities .iloc [::- 1 ] # plot most important annotations last to overlap less important ones
681- for dataframe_index , row in lowest_probabilities_in_reverse_order .iterrows ():
682- index = typing .cast (int , dataframe_index )
683- plot .annotate (
684- text = f"#{ index } :{ truncate (row [code_unit_column ], 20 )} ({ row [cluster_probability_column ]:.4f} )" ,
685- xy = (row [x_position_column ], row [y_position_column ]),
686- xytext = (5 , 5 + index * 10 ),
687- color = 'red' ,
688- ** plot_annotation_style
689- )
698+ lowest_probabilities = cluster_non_noise .sort_values (by = cluster_probability_column , ascending = True ).reset_index ().head (annotate_n_lowest_probabilities )
699+ annotate_each_with_index (
700+ lowest_probabilities ,
701+ using = plot .annotate ,
702+ name_column = code_unit_column ,
703+ x_position_column = x_position_column ,
704+ y_position_column = y_position_column ,
705+ probability_column = cluster_probability_column ,
706+ color = "red"
707+ )
690708
691709 plot .savefig (plot_file_path )
692710
@@ -715,35 +733,45 @@ def plot_cluster_noise(
715733 return
716734
717735 plot .figure (figsize = (10 , 10 ))
718- plot .title (title )
736+ plot .title (title , pad = 20 )
719737
720738 # Determine the color threshold for noise points
721739 color_10th_highest_value = noise_points [color_column_name ].nlargest (10 ).iloc [- 1 ] # Get the 10th largest value
722740 color_90_quantile = noise_points [color_column_name ].quantile (0.90 )
723741 color_threshold = max (color_10th_highest_value , color_90_quantile )
724742
743+ noise_points_zoomed = zoom_into_center_while_preserving_scores_above_threshold (
744+ noise_points ,
745+ x_position_column ,
746+ y_position_column ,
747+ color_column_name ,
748+ color_threshold
749+ )
750+
725751 # Color the color column values above the 90% quantile threshold red, the rest light grey
726- colors = noise_points [color_column_name ].apply (
752+ colors = noise_points_zoomed [color_column_name ].apply (
727753 lambda x : "red" if x >= color_threshold else "lightgrey"
728754 )
729- normalized_size = noise_points [size_column_name ] / noise_points [size_column_name ].max ()
755+
756+ # Add column with scaled version of "size_column_name" for uniform marker scaling
757+ noise_points_zoomed [size_column_name + '_scaled' ] = scale_marker_sizes (noise_points_zoomed [size_column_name ])
730758
731759 # Scatter plot for noise points
732760 plot .scatter (
733- x = noise_points [x_position_column ],
734- y = noise_points [y_position_column ],
735- s = normalized_size . clip ( lower = 0.01 ) * 200 + 2 ,
761+ x = noise_points_zoomed [x_position_column ],
762+ y = noise_points_zoomed [y_position_column ],
763+ s = noise_points_zoomed [ size_column_name + '_scaled' ] ,
736764 c = colors ,
737765 alpha = 0.6
738766 )
739767
740768 # Annotate all red points (color column value at or above the color threshold) with their index and name
741- for index , row in noise_points .iterrows ():
769+ top_red_noise_points = noise_points_zoomed [noise_points_zoomed [color_column_name ] >= color_threshold ].reset_index (drop = True )
770+ top_red_noise_points_reversed_order = top_red_noise_points .iloc [::- 1 ]
771+ for index , row in top_red_noise_points_reversed_order .iterrows ():
742772 index = typing .cast (int , index )
743- if colors [index ] != 'red' and index >= 10 :
744- continue
745773 plot .annotate (
746- text = row [code_unit_column_name ],
774+ text = f"# { index } : { row [code_unit_column_name ]} " ,
747775 xy = (row [x_position_column ], row [y_position_column ]),
748776 xytext = (5 , 5 + (index % 2 ) * 20 ), # Offset for better visibility
749777 ** plot_annotation_style
0 commit comments