|
556 | 556 | " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n", |
557 | 557 | " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n", |
558 | 558 | " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n", |
| 559 | + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", |
559 | 560 | " AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n", |
560 | 561 | " AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n", |
561 | 562 | " RETURN DISTINCT \n", |
|
568 | 569 | " ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n", |
569 | 570 | " ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n", |
570 | 571 | " ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n", |
| 572 | + " ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n", |
571 | 573 | " ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n", |
572 | 574 | " ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n", |
573 | 575 | "\"\"\"\n", |
|
604 | 606 | " clustering_visualization_dataframe: pd.DataFrame,\n", |
605 | 607 | " title: str,\n", |
606 | 608 | " main_color_map: str = \"tab20\",\n", |
| 609 | + " code_unit_column_name: str = \"shortCodeUnitName\",\n", |
607 | 610 | " cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n", |
| 611 | + " cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n", |
608 | 612 | " centrality_column_name: str = \"pageRank\",\n", |
609 | 613 | " x_position_column = 'embeddingVisualizationX',\n", |
610 | 614 | " y_position_column = 'embeddingVisualizationY'\n", |
|
641 | 645 | " # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n", |
642 | 646 | " # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n", |
643 | 647 | " cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n", |
644 | | - " alpha = max((1.0 - (cluster_diameter / max_diameter)) * 0.45 - 0.25, 0.001)\n", |
| 648 | + " alpha = max((1.0 - (cluster_diameter / max_diameter)) * 0.45 - 0.25, 0.02)\n", |
645 | 649 | "\n", |
646 | 650 | " # KDE cloud shape\n", |
647 | 651 | " if len(cluster_nodes) > 1 and (\n", |
|
668 | 672 | " label=f\"Cluster {cluster_label}\"\n", |
669 | 673 | " )\n", |
670 | 674 | "\n", |
| 675 | + " # Annotate medoids of the cluster\n", |
| 676 | + " medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n", |
| 677 | + " for index, row in medoids.iterrows():\n", |
| 678 | + " plot.annotate(\n", |
| 679 | + " text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n", |
| 680 | + " xy=(row[x_position_column], row[y_position_column]),\n", |
| 681 | + " xytext=(5, 5), # Offset for better visibility\n", |
| 682 | + " **plot_annotation_style\n", |
| 683 | + " )\n", |
| 684 | + "\n", |
671 | 685 | " # Plot noise points in gray\n", |
672 | 686 | " plot.scatter(\n", |
673 | 687 | " x=node_embeddings_noise_only[x_position_column],\n", |
|
678 | 692 | " label=\"Noise\"\n", |
679 | 693 | " )\n", |
680 | 694 | "\n", |
681 | | - " legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n", |
682 | | - " # Workaround to set all legend dots to the same size\n", |
683 | | - " for handle in legend.legend_handles:\n", |
684 | | - " handle.set_sizes([30])\n", |
| 695 | + " # Legend not needed since the clusters are now annotated at their center (medoid)\n", |
| 696 | + " # legend = plot.legend(title=\"HDBSCAN Clusters\", loc=\"best\", prop={'size': 6}, ncols=2)\n", |
| 697 | + " # # Workaround to set all legend dots to the same size\n", |
| 698 | + " # for handle in legend.legend_handles:\n", |
| 699 | + " # handle.set_sizes([30])\n", |
685 | 700 | " \n" |
686 | 701 | ] |
687 | 702 | }, |
|
696 | 711 | " clustering_visualization_dataframe: pd.DataFrame,\n", |
697 | 712 | " title: str,\n", |
698 | 713 | " main_color_map: str = \"tab20\",\n", |
| 714 | + " code_unit_column_name: str = \"shortCodeUnitName\",\n", |
699 | 715 | " cluster_label_column_name: str = \"clusteringHDBSCANLabel\",\n", |
| 716 | + " cluster_medoid_column_name: str = \"clusteringHDBSCANMedoid\",\n", |
700 | 717 | " centrality_column_name: str = \"pageRank\",\n", |
701 | 718 | " x_position_column = 'embeddingVisualizationX',\n", |
702 | 719 | " y_position_column = 'embeddingVisualizationY'\n", |
|
721 | 738 | " quartile_size = (n_clusters + 3) // 4 # ceil division\n", |
722 | 739 | " quartiles = [clusters[i*quartile_size:(i+1)*quartile_size] for i in range(4)]\n", |
723 | 740 | "\n", |
724 | | - " figure, axes = plot.subplots(4, 1, figsize=(10, 32), squeeze=False)\n", |
| 741 | + " figure, axes = plot.subplots(4, 1, figsize=(10, 40), squeeze=False)\n", |
725 | 742 | " figure.suptitle(title, fontsize=14)\n", |
726 | 743 | "\n", |
727 | 744 | " for index, cluster_group in enumerate(quartiles):\n", |
|
742 | 759 | " # we can adjust the alpha value for the KDE plot to visualize smaller clusters more clearly.\n", |
743 | 760 | " # This way, larger clusters will have a lower alpha value, making them less prominent and less prone to overshadow smaller clusters.\n", |
744 | 761 | " cluster_diameter = max_pairwise_distance(cluster_nodes[[x_position_column, y_position_column]].to_numpy())\n", |
745 | | - " alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.001)\n", |
| 762 | + " alpha = max((1.0 - (cluster_diameter / quartile_diameter)) * 0.45 - 0.25, 0.02)\n", |
746 | 763 | "\n", |
747 | 764 | " if len(cluster_nodes) > 1 and cluster_diameter > 0:\n", |
748 | 765 | " seaborn.kdeplot(\n", |
|
764 | 781 | " label=f\"Cluster {cluster_label}\"\n", |
765 | 782 | " )\n", |
766 | 783 | "\n", |
767 | | - " # Plot noise points in gray\n", |
| 784 | + " # Annotate medoids of the cluster\n", |
| 785 | + " medoids = cluster_nodes[cluster_nodes[cluster_medoid_column_name] == 1]\n", |
| 786 | + " for index, row in medoids.iterrows():\n", |
| 787 | + " axis.annotate(\n", |
| 788 | + " text=f\"{row[code_unit_column_name]} ({row[cluster_label_column_name]})\",\n", |
| 789 | + " xy=(row[x_position_column], row[y_position_column]),\n", |
| 790 | + " xytext=(5, 5), # Offset for better visibility\n", |
| 791 | + " **plot_annotation_style\n", |
| 792 | + " )\n", |
| 793 | + "\n", |
| 794 | + " # Plot noise points in gray\n", |
768 | 795 | " axis.scatter(\n", |
769 | 796 | " x=node_embeddings_noise_only[x_position_column],\n", |
770 | 797 | " y=node_embeddings_noise_only[y_position_column],\n", |
|
775 | 802 | " )\n", |
776 | 803 | "\n", |
777 | 804 | " axis.set_title(f\"Quartile {index+1}: {len(cluster_group)} clusters\")\n", |
778 | | - " legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n", |
779 | | - " # Workaround to set all legend dots to the same size\n", |
780 | | - " for handle in legend.legend_handles:\n", |
781 | | - " handle.set_sizes([50])\n", |
782 | 805 | "\n", |
783 | | - " plot.tight_layout(rect=(0, 0, 1, 0.99))\n" |
| 806 | + " # Legend not needed since the clusters are now annotated at their center (medoid)\n", |
| 807 | + " # legend = axis.legend(title=\"Cluster\", prop={'size': 6}, loc=\"best\", ncols=1)\n", |
| 808 | + " # # Workaround to set all legend dots to the same size\n", |
| 809 | + " # for handle in legend.legend_handles:\n", |
| 810 | + " # handle.set_sizes([50])\n", |
| 811 | + "\n", |
| 812 | + " plot.tight_layout(rect=(0, 0, 1, 0.98))\n" |
784 | 813 | ] |
785 | 814 | }, |
786 | 815 | { |
|
804 | 833 | " return\n", |
805 | 834 | " \n", |
806 | 835 | " number_of_distinct_clusters = clustering_visualization_dataframe[cluster_label_column_name].nunique()\n", |
807 | | - " print(f\"Number of distinct clusters: {number_of_distinct_clusters}\")\n", |
808 | 836 | " if number_of_distinct_clusters > 30:\n", |
809 | 837 | " plot_clusters_by_size_quartiles(\n", |
810 | 838 | " clustering_visualization_dataframe=clustering_visualization_dataframe,\n", |
|
986 | 1014 | " AND codeUnit.clusteringHDBSCANLabel IS NOT NULL\n", |
987 | 1015 | " AND codeUnit.clusteringHDBSCANProbability IS NOT NULL\n", |
988 | 1016 | " AND codeUnit.clusteringHDBSCANNoise IS NOT NULL\n", |
| 1017 | + " AND codeUnit.clusteringHDBSCANMedoid IS NOT NULL\n", |
989 | 1018 | " AND codeUnit.embeddingFastRandomProjectionVisualizationX IS NOT NULL\n", |
990 | 1019 | " AND codeUnit.embeddingFastRandomProjectionVisualizationY IS NOT NULL\n", |
991 | 1020 | " RETURN DISTINCT \n", |
|
998 | 1027 | " ,codeUnit.clusteringHDBSCANLabel AS clusteringHDBSCANLabel\n", |
999 | 1028 | " ,codeUnit.clusteringHDBSCANProbability AS clusteringHDBSCANProbability\n", |
1000 | 1029 | " ,codeUnit.clusteringHDBSCANNoise AS clusteringHDBSCANNoise\n", |
| 1030 | + " ,codeUnit.clusteringHDBSCANMedoid AS clusteringHDBSCANMedoid\n", |
1001 | 1031 | " ,codeUnit.embeddingFastRandomProjectionVisualizationX AS embeddingVisualizationX\n", |
1002 | 1032 | " ,codeUnit.embeddingFastRandomProjectionVisualizationY AS embeddingVisualizationY\n", |
1003 | 1033 | "\"\"\"\n", |
|
0 commit comments