Add betweenness centrality distribution

JohT · JohT · commit ead9f542aff9 · 2025-10-05T08:12:12.000+02:00
diff --git a/domains/anomaly-detection/anomalyDetectionFeaturePlots.py b/domains/anomaly-detection/anomalyDetectionFeaturePlots.py
@@ -378,37 +378,40 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
     plot.savefig(plot_file_path)
 
 
-def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title: str, plot_file_path: str) -> None:
+def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
     """
-    Plots the distribution of clustering coefficients.
-
+    Plots the distribution of a feature's values.
+    
     Parameters
     ----------
-    clustering_coefficients : pd.Series
-        Series containing clustering coefficient values.
+    feature_values : pd.Series
+        Series containing feature values.
+    feature_name, title : str
+        Label of the x-axis and title of the plot, respectively.
     """
-    if clustering_coefficients.empty:
+    if feature_values.empty:
         print("No data available to plot.")
         return
 
     plot.figure(figsize=(10, 6))
     plot.figure(figsize=(10, 6))
-    plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')
+    plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')
     plot.title(title, pad=20)
-    plot.xlabel('Clustering Coefficient')
+    plot.xlabel(feature_name)
     plot.ylabel('Frequency')
-    plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())
+    plot.xlim(left=feature_values.min(), right=feature_values.max())
     # plot.yscale('log')  # Use logarithmic scale for better visibility of differences
     plot.grid(True)
-    plot.tight_layout()
 
-    mean = clustering_coefficients.mean()
-    standard_deviation = clustering_coefficients.std()
+    mean = feature_values.mean()
+    standard_deviation = feature_values.std()
 
     # Vertical line for the mean
     plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
     # Vertical line for 1 x standard deviations + mean (=z-score of 1)
-    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)
+    plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
+    # Vertical line for 2 x standard deviations + mean (=z-score of 2)
+    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)
 
     plot.tight_layout()
     plot.savefig(plot_file_path)
@@ -829,10 +832,18 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
 )
 
-plot_clustering_coefficient_distribution(
-    data['clusteringCoefficient'],
-    title=f"{plot_prefix} distribution of clustering coefficients",
-    plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_distribution", parameters)
+plot_feature_distribution(
+    feature_values=data['clusteringCoefficient'],
+    feature_name='Clustering Coefficient',
+    title=f"{plot_prefix} clustering coefficient distribution",
+    plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
+)
+
+plot_feature_distribution(
+    feature_values=data['betweenness'],
+    feature_name='Betweenness',
+    title=f"{plot_prefix} betweenness centrality distribution",
+    plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
 )
 
 plot_clustering_coefficient_vs_page_rank(
@@ -848,8 +859,8 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
     print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.")
     plot_clusters(
         clustering_visualization_dataframe=data,
-        title=f"{plot_prefix} all clusters overall (less than 20)",
-        plot_file_path=get_file_path(f"{plot_prefix}_Clusters_Overall", parameters)
+        title=f"{plot_prefix} all clusters overall",
+        plot_file_path=get_file_path("Clusters_Overall", parameters)
     )
 else:
     print(f"anomalyDetectionFeaturePlots: More than 20 clusters: {overall_cluster_count}. Different plots focussing on different features like cluster size will be created.")
diff --git a/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb b/domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb
@@ -640,55 +640,78 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ed900c59",
+   "id": "d7b587c9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title_prefix: str) -> None:\n",
+    "def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title_prefix: str) -> None:\n",
     "    \"\"\"\n",
-    "    Plots the distribution of clustering coefficients.\n",
+    "    Plots the distribution of a feature's values.\n",
     "    \n",
     "    Parameters\n",
     "    ----------\n",
-    "    clustering_coefficients : pd.Series\n",
-    "        Series containing clustering coefficient values.\n",
+    "    feature_values : pd.Series\n",
+    "        Series containing feature values.\n",
     "    text_prefix: str\n",
     "        Text at the beginning of the title\n",
     "    \"\"\"\n",
-    "    if clustering_coefficients.empty:\n",
+    "    if feature_values.empty:\n",
     "        print(\"No data available to plot.\")\n",
     "        return\n",
     "\n",
     "    plot.figure(figsize=(10, 6))\n",
     "    plot.figure(figsize=(10, 6))\n",
-    "    plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')\n",
-    "    plot.title(f\"{title_prefix} Distribution of Clustering Coefficients\", pad=20)\n",
-    "    plot.xlabel('Clustering Coefficient')\n",
+    "    plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')\n",
+    "    plot.title(f\"{title_prefix} Distribution of the feature '{feature_name}'\", pad=20)\n",
+    "    plot.xlabel(feature_name)\n",
     "    plot.ylabel('Frequency')\n",
-    "    plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())\n",
+    "    plot.xlim(left=feature_values.min(), right=feature_values.max())\n",
     "    # plot.yscale('log')  # Use logarithmic scale for better visibility of differences\n",
     "    plot.grid(True)\n",
     "    plot.tight_layout()\n",
     "\n",
-    "    mean = clustering_coefficients.mean()\n",
-    "    standard_deviation = clustering_coefficients.std()\n",
+    "    mean = feature_values.mean()\n",
+    "    standard_deviation = feature_values.std()\n",
     "\n",
     "    # Vertical line for the mean\n",
     "    plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n",
     "    # Vertical line for 1 x standard deviations + mean (=z-score of 1)\n",
-    "    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)\n",
+    "    plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)\n",
+    "    # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n",
+    "    plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)\n",
     "\n",
     "    plot.show()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ed900c59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title_prefix: str) -> None:\n",
+    "    \"\"\"\n",
+    "    Plots the distribution of clustering coefficients.\n",
+    "    \n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    clustering_coefficients : pd.Series\n",
+    "        Series containing clustering coefficient values.\n",
+    "    title_prefix: str\n",
+    "        Text at the beginning of the title\n",
+    "    \"\"\"\n",
+    "    plot_feature_distribution(clustering_coefficients, 'Clustering Coefficient', title_prefix)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "92aff8d9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_clustering_coefficient_distribution(java_package_features['clusteringCoefficient'], title_prefix=\"Java Package\")"
+    "plot_feature_distribution(java_package_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Package\")"
    ]
   },
   {
@@ -798,6 +821,24 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "38cad9cb",
+   "metadata": {},
+   "source": [
+    "### 1.2b Betweenness Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01c0ea0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_feature_distribution(java_package_features['betweenness'], 'Betweenness', title_prefix=\"Java Package\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "630f5e4b",
@@ -1402,7 +1443,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_clustering_coefficient_distribution(java_type_features['clusteringCoefficient'], title_prefix=\"Java Package\")"
+    "plot_feature_distribution(java_type_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Type\")"
    ]
   },
   {
@@ -1421,6 +1462,24 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "dfb7560d",
+   "metadata": {},
+   "source": [
+    "### 2.2b Betweenness Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1082ef81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_feature_distribution(java_type_features['betweenness'], 'Betweenness', title_prefix=\"Java Type\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "69256999",