Skip to content

Commit ead9f54

Browse files
committed
Add betweenness centrality distribution
1 parent 9ee47df commit ead9f54

File tree

2 files changed

+104
-34
lines changed

2 files changed

+104
-34
lines changed

domains/anomaly-detection/anomalyDetectionFeaturePlots.py

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -378,37 +378,40 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
378378
plot.savefig(plot_file_path)
379379

380380

381-
def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title: str, plot_file_path: str) -> None:
381+
def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
382382
"""
383-
Plots the distribution of clustering coefficients.
384-
383+
Plots the distribution of a feature's values.
384+
385385
Parameters
386386
----------
387-
clustering_coefficients : pd.Series
388-
Series containing clustering coefficient values.
387+
feature_values : pd.Series
388+
Series containing feature values.
389+
feature_name : str
390+
Name of the feature, used as the x-axis label.
389391
"""
390-
if clustering_coefficients.empty:
392+
if feature_values.empty:
391393
print("No data available to plot.")
392394
return
393395

394396
plot.figure(figsize=(10, 6))
395397
plot.figure(figsize=(10, 6))
396-
plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')
398+
plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')
397399
plot.title(title, pad=20)
398-
plot.xlabel('Clustering Coefficient')
400+
plot.xlabel(feature_name)
399401
plot.ylabel('Frequency')
400-
plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())
402+
plot.xlim(left=feature_values.min(), right=feature_values.max())
401403
# plot.yscale('log') # Use logarithmic scale for better visibility of differences
402404
plot.grid(True)
403-
plot.tight_layout()
404405

405-
mean = clustering_coefficients.mean()
406-
standard_deviation = clustering_coefficients.std()
406+
mean = feature_values.mean()
407+
standard_deviation = feature_values.std()
407408

408409
# Vertical line for the mean
409410
plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
410411
# Vertical line for 1 x standard deviations + mean (=z-score of 1)
411-
plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)
412+
plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
413+
# Vertical line for 2 x standard deviations + mean (=z-score of 2)
414+
plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)
412415

413416
plot.tight_layout()
414417
plot.savefig(plot_file_path)
@@ -829,10 +832,18 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
829832
plot_file_path=get_file_path(f"{plot_prefix}_PageRank_Minus_ArticleRank_Distribution", parameters)
830833
)
831834

832-
plot_clustering_coefficient_distribution(
833-
data['clusteringCoefficient'],
834-
title=f"{plot_prefix} distribution of clustering coefficients",
835-
plot_file_path=get_file_path(f"{plot_prefix}_ClusteringCoefficient_distribution", parameters)
835+
plot_feature_distribution(
836+
feature_values=data['clusteringCoefficient'],
837+
feature_name='Clustering Coefficient',
838+
title=f"{plot_prefix} clustering coefficient distribution",
839+
plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
840+
)
841+
842+
plot_feature_distribution(
843+
feature_values=data['betweenness'],
844+
feature_name='Betweenness',
845+
title=f"{plot_prefix} betweenness centrality distribution",
846+
plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
836847
)
837848

838849
plot_clustering_coefficient_vs_page_rank(
@@ -848,8 +859,8 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
848859
print(f"anomalyDetectionFeaturePlots: Less than 20 clusters: {overall_cluster_count}. Only one plot containing all clusters will be created.")
849860
plot_clusters(
850861
clustering_visualization_dataframe=data,
851-
title=f"{plot_prefix} all clusters overall (less than 20)",
852-
plot_file_path=get_file_path(f"{plot_prefix}_Clusters_Overall", parameters)
862+
title=f"{plot_prefix} all clusters overall",
863+
plot_file_path=get_file_path("Clusters_Overall", parameters)
853864
)
854865
else:
855866
print(f"anomalyDetectionFeaturePlots: More than 20 clusters: {overall_cluster_count}. Different plots focussing on different features like cluster size will be created.")

domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb

Lines changed: 74 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -640,55 +640,78 @@
640640
{
641641
"cell_type": "code",
642642
"execution_count": null,
643-
"id": "ed900c59",
643+
"id": "d7b587c9",
644644
"metadata": {},
645645
"outputs": [],
646646
"source": [
647-
"def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title_prefix: str) -> None:\n",
647+
"def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title_prefix: str) -> None:\n",
648648
" \"\"\"\n",
649-
" Plots the distribution of clustering coefficients.\n",
649+
"    Plots the distribution of a feature's values.\n",
650650
" \n",
651651
" Parameters\n",
652652
" ----------\n",
653-
" clustering_coefficients : pd.Series\n",
654-
" Series containing clustering coefficient values.\n",
653+
" feature_values : pd.Series\n",
654+
" Series containing feature values.\n",
655655
"    title_prefix : str\n",
656656
" Text at the beginning of the title\n",
657657
" \"\"\"\n",
658-
" if clustering_coefficients.empty:\n",
658+
" if feature_values.empty:\n",
659659
" print(\"No data available to plot.\")\n",
660660
" return\n",
661661
"\n",
662662
" plot.figure(figsize=(10, 6))\n",
663663
" plot.figure(figsize=(10, 6))\n",
664-
" plot.hist(clustering_coefficients, bins=40, color='blue', alpha=0.7, edgecolor='black')\n",
665-
" plot.title(f\"{title_prefix} Distribution of Clustering Coefficients\", pad=20)\n",
666-
" plot.xlabel('Clustering Coefficient')\n",
664+
" plot.hist(feature_values, bins=40, color='blue', alpha=0.7, edgecolor='black')\n",
665+
" plot.title(f\"{title_prefix} Distribution of the feature '{feature_name}'\", pad=20)\n",
666+
" plot.xlabel(feature_name)\n",
667667
" plot.ylabel('Frequency')\n",
668-
" plot.xlim(left=clustering_coefficients.min(), right=clustering_coefficients.max())\n",
668+
" plot.xlim(left=feature_values.min(), right=feature_values.max())\n",
669669
" # plot.yscale('log') # Use logarithmic scale for better visibility of differences\n",
670670
" plot.grid(True)\n",
671671
" plot.tight_layout()\n",
672672
"\n",
673-
" mean = clustering_coefficients.mean()\n",
674-
" standard_deviation = clustering_coefficients.std()\n",
673+
" mean = feature_values.mean()\n",
674+
" standard_deviation = feature_values.std()\n",
675675
"\n",
676676
" # Vertical line for the mean\n",
677677
" plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n",
678678
" # Vertical line for 1 x standard deviations + mean (=z-score of 1)\n",
679-
" plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=1)\n",
679+
" plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)\n",
680+
" # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n",
681+
" plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)\n",
680682
"\n",
681683
" plot.show()"
682684
]
683685
},
686+
{
687+
"cell_type": "code",
688+
"execution_count": null,
689+
"id": "ed900c59",
690+
"metadata": {},
691+
"outputs": [],
692+
"source": [
693+
"def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title_prefix: str) -> None:\n",
694+
" \"\"\"\n",
695+
" Plots the distribution of clustering coefficients.\n",
696+
" \n",
697+
" Parameters\n",
698+
" ----------\n",
699+
" clustering_coefficients : pd.Series\n",
700+
" Series containing clustering coefficient values.\n",
701+
"    title_prefix : str\n",
702+
" Text at the beginning of the title\n",
703+
" \"\"\"\n",
704+
" plot_feature_distribution(clustering_coefficients, 'Clustering Coefficient', title_prefix)"
705+
]
706+
},
684707
{
685708
"cell_type": "code",
686709
"execution_count": null,
687710
"id": "92aff8d9",
688711
"metadata": {},
689712
"outputs": [],
690713
"source": [
691-
"plot_clustering_coefficient_distribution(java_package_features['clusteringCoefficient'], title_prefix=\"Java Package\")"
714+
"plot_feature_distribution(java_package_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Package\")"
692715
]
693716
},
694717
{
@@ -798,6 +821,24 @@
798821
")"
799822
]
800823
},
824+
{
825+
"cell_type": "markdown",
826+
"id": "38cad9cb",
827+
"metadata": {},
828+
"source": [
829+
"### 1.2b Betweenness Distribution"
830+
]
831+
},
832+
{
833+
"cell_type": "code",
834+
"execution_count": null,
835+
"id": "01c0ea0d",
836+
"metadata": {},
837+
"outputs": [],
838+
"source": [
839+
"plot_feature_distribution(java_package_features['betweenness'], 'Betweenness', title_prefix=\"Java Package\")"
840+
]
841+
},
801842
{
802843
"cell_type": "markdown",
803844
"id": "630f5e4b",
@@ -1402,7 +1443,7 @@
14021443
"metadata": {},
14031444
"outputs": [],
14041445
"source": [
1405-
"plot_clustering_coefficient_distribution(java_type_features['clusteringCoefficient'], title_prefix=\"Java Package\")"
1446+
"plot_feature_distribution(java_type_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Type\")"
14061447
]
14071448
},
14081449
{
@@ -1421,6 +1462,24 @@
14211462
")"
14221463
]
14231464
},
1465+
{
1466+
"cell_type": "markdown",
1467+
"id": "dfb7560d",
1468+
"metadata": {},
1469+
"source": [
1470+
"### 2.2b Betweenness Distribution"
1471+
]
1472+
},
1473+
{
1474+
"cell_type": "code",
1475+
"execution_count": null,
1476+
"id": "1082ef81",
1477+
"metadata": {},
1478+
"outputs": [],
1479+
"source": [
1480+
"plot_feature_distribution(java_type_features['betweenness'], 'Betweenness', title_prefix=\"Java Type\")"
1481+
]
1482+
},
14241483
{
14251484
"cell_type": "markdown",
14261485
"id": "69256999",

0 commit comments

Comments
 (0)