Skip to content

Commit 6434c58

Browse files
committed
Hierarchical Clustering added
1 parent b4e6d05 commit 6434c58

File tree

4 files changed

+342
-0
lines changed

4 files changed

+342
-0
lines changed

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ add_executable(KNNRegressor tests/clustering/KNNRegressorTest.cpp)
6565
target_compile_definitions(KNNRegressor PRIVATE TEST_KNN_REGRESSOR)
6666
target_link_libraries(KNNRegressor cpp_ml_library)
6767

68+
add_executable(HierarchicalClustering tests/clustering/HierarchicalClusteringTest.cpp)
69+
target_compile_definitions(HierarchicalClustering PRIVATE TEST_HIERARCHICAL_CLUSTERING)
70+
target_link_libraries(HierarchicalClustering cpp_ml_library)
71+
6872
# Register individual tests
6973
add_test(NAME LogisticRegressionTest COMMAND LogisticRegressionTest)
7074
add_test(NAME PolynomialRegressionTest COMMAND PolynomialRegressionTest)
@@ -76,6 +80,7 @@ add_test(NAME RandomForestClassifier COMMAND RandomForestClassifier)
7680
add_test(NAME KMeansClustering COMMAND KMeansClustering)
7781
add_test(NAME KNNClassifier COMMAND KNNClassifier)
7882
add_test(NAME KNNRegressor COMMAND KNNRegressor)
83+
add_test(NAME HierarchicalClustering COMMAND HierarchicalClustering)
7984

8085

8186
# Add example executables if BUILD_EXAMPLES is ON
@@ -109,6 +114,8 @@ if(BUILD_EXAMPLES)
109114
target_compile_definitions(${EXAMPLE_TARGET} PRIVATE TEST_KKN_CLASSIFIER)
110115
elseif(EXAMPLE_NAME STREQUAL "KNNRegressorExample")
111116
target_compile_definitions(${EXAMPLE_TARGET} PRIVATE TEST_KNN_REGRESSOR)
117+
elseif(EXAMPLE_NAME STREQUAL "HierarchicalClusteringExample")
118+
target_compile_definitions(${EXAMPLE_TARGET} PRIVATE TEST_HIERARCHICAL_CLUSTERING)
112119
endif()
113120
endforeach()
114121
endif()
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#include "../ml_library_include/ml/clustering/HierarchicalClustering.hpp"
2+
#include <iostream>
3+
4+
int testHierarchicalClustering() {
5+
// Sample data
6+
std::vector<std::vector<double>> data = {
7+
{1.0, 2.0},
8+
{1.5, 1.8},
9+
{5.0, 8.0},
10+
{6.0, 9.0},
11+
{1.0, 0.6},
12+
{9.0, 11.0},
13+
{8.0, 2.0},
14+
{10.0, 2.0},
15+
{9.0, 3.0}
16+
};
17+
18+
// Create and fit the model
19+
HierarchicalClustering hc(3, HierarchicalClustering::Linkage::AVERAGE);
20+
hc.fit(data);
21+
22+
// Get cluster labels
23+
std::vector<int> labels = hc.predict();
24+
25+
// Output cluster labels
26+
for (size_t i = 0; i < labels.size(); ++i) {
27+
std::cout << "Data point " << i << " is in cluster " << labels[i] << std::endl;
28+
}
29+
30+
return 0;
31+
}
32+
33+
/**
 * @brief Program entry point for the hierarchical clustering example.
 *
 * Runs the demo and forwards its status code as the process exit code
 * instead of discarding it.
 *
 * @return The value returned by testHierarchicalClustering().
 */
int main() {
    return testHierarchicalClustering();
}
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
#ifndef HIERARCHICAL_CLUSTERING_HPP
2+
#define HIERARCHICAL_CLUSTERING_HPP
3+
4+
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <memory>
#include <stdexcept>
#include <utility>
#include <vector>
9+
10+
/**
11+
* @file HierarchicalClustering.hpp
12+
* @brief Implementation of Agglomerative Hierarchical Clustering.
13+
*/
14+
15+
/**
 * @class HierarchicalClustering
 * @brief Agglomerative (bottom-up) hierarchical clustering.
 *
 * Every data point starts as its own cluster. The two closest clusters,
 * as measured by the selected linkage over Euclidean point distances, are
 * merged repeatedly until at most `n_clusters` clusters remain.
 *
 * All member functions are defined `inline` so that this header can be
 * included from several translation units without multiple-definition
 * link errors.
 */
class HierarchicalClustering {
public:
    /**
     * @brief Linkage criteria for clustering.
     */
    enum class Linkage {
        SINGLE,   ///< Smallest distance between any pair of points of the two clusters.
        COMPLETE, ///< Largest distance between any pair of points of the two clusters.
        AVERAGE   ///< Mean distance over all pairs of points of the two clusters.
    };

    /**
     * @brief Constructs a HierarchicalClustering instance.
     * @param n_clusters The number of clusters to form (validated in fit()).
     * @param linkage The linkage criterion to use.
     */
    HierarchicalClustering(int n_clusters = 2, Linkage linkage = Linkage::AVERAGE);

    /**
     * @brief Destructor for HierarchicalClustering.
     */
    ~HierarchicalClustering() = default;

    /**
     * @brief Fits the clustering algorithm to the data.
     *
     * If X holds fewer points than `n_clusters`, every point simply stays in
     * its own cluster. Arguments are validated before any state is changed,
     * so a failed call leaves a previously fitted model untouched.
     *
     * @param X A vector of feature vectors (data points); all rows must have
     *          the same length.
     * @throws std::invalid_argument if `n_clusters` is smaller than 1 or the
     *         rows of X differ in length.
     */
    void fit(const std::vector<std::vector<double>>& X);

    /**
     * @brief Returns the cluster label of every fitted data point.
     * @return One label per point, in the order the points were passed to
     *         fit(). A label is the index of the point's cluster, which is
     *         also the index of its centroid in get_cluster_centers().
     *         Empty if fit() has not been called.
     */
    std::vector<int> predict() const;

    /**
     * @brief Retrieves the cluster centers (centroids) after fitting.
     * @return The mean of the member points of each cluster, one centroid per
     *         cluster, ordered by cluster label. Empty if fit() has not been
     *         called.
     */
    std::vector<std::vector<double>> get_cluster_centers() const;

private:
    int n_clusters; ///< Number of clusters to form.
    Linkage linkage; ///< Linkage criterion.
    std::vector<std::vector<double>> data; ///< Data points (copy of the fit() input).

    struct Cluster {
        int id; ///< Unique identifier for the cluster (index of its seed point).
        std::vector<int> points; ///< Indices of data points in this cluster.
    };

    std::vector<Cluster> clusters; ///< Current clusters, stored by value.

    /**
     * @brief Computes the Euclidean distance between two data points.
     * @param a Index of the first data point.
     * @param b Index of the second data point.
     * @return The Euclidean distance.
     */
    double euclidean_distance(int a, int b) const;

    /**
     * @brief Computes the distance between two clusters based on the linkage criterion.
     * @param cluster_a The first cluster.
     * @param cluster_b The second cluster.
     * @return The distance between the two clusters.
     */
    double cluster_distance(const Cluster& cluster_a, const Cluster& cluster_b) const;

    /**
     * @brief Merges the two closest clusters; does nothing if fewer than two exist.
     */
    void merge_clusters();

    /**
     * @brief Finds the pair of clusters with the minimum distance.
     * @return A pair of indices (first < second) of the clusters to merge, or
     *         {-1, -1} if there are fewer than two clusters.
     */
    std::pair<int, int> find_closest_clusters() const;
};

inline HierarchicalClustering::HierarchicalClustering(int n_clusters, Linkage linkage)
    : n_clusters(n_clusters), linkage(linkage) {}

inline void HierarchicalClustering::fit(const std::vector<std::vector<double>>& X) {
    // Validate first so a failed call cannot corrupt an already fitted model.
    if (n_clusters < 1) {
        throw std::invalid_argument("HierarchicalClustering::fit: n_clusters must be at least 1");
    }
    const bool ragged = std::any_of(X.begin(), X.end(),
                                    [&X](const std::vector<double>& row) {
                                        return row.size() != X.front().size();
                                    });
    if (ragged) {
        throw std::invalid_argument("HierarchicalClustering::fit: all data points must have the same dimension");
    }

    data = X;

    // Initialize each data point as a separate cluster.
    clusters.clear();
    clusters.reserve(data.size());
    for (std::size_t i = 0; i < data.size(); ++i) {
        Cluster singleton;
        singleton.id = static_cast<int>(i);
        singleton.points.push_back(static_cast<int>(i));
        clusters.push_back(std::move(singleton));
    }

    // Agglomerative clustering: each merge removes exactly one cluster.
    // n_clusters >= 1 guarantees at least two clusters exist inside the loop.
    const std::size_t target = static_cast<std::size_t>(n_clusters);
    while (clusters.size() > target) {
        merge_clusters();
    }
}

inline std::vector<int> HierarchicalClustering::predict() const {
    std::vector<int> labels(data.size(), -1);
    for (std::size_t label = 0; label < clusters.size(); ++label) {
        for (const int point_idx : clusters[label].points) {
            labels[point_idx] = static_cast<int>(label);
        }
    }
    return labels;
}

inline std::vector<std::vector<double>> HierarchicalClustering::get_cluster_centers() const {
    std::vector<std::vector<double>> centers;
    centers.reserve(clusters.size());

    // fit() guarantees every row has the same length as the first one.
    const std::size_t dims = data.empty() ? 0 : data.front().size();

    for (const Cluster& cluster : clusters) {
        std::vector<double> centroid(dims, 0.0);
        for (const int idx : cluster.points) {
            const std::vector<double>& point = data[idx];
            for (std::size_t d = 0; d < dims; ++d) {
                centroid[d] += point[d];
            }
        }
        // Divide by the number of points to get the mean.
        const double member_count = static_cast<double>(cluster.points.size());
        for (double& val : centroid) {
            val /= member_count;
        }
        centers.push_back(std::move(centroid));
    }

    return centers;
}

inline double HierarchicalClustering::euclidean_distance(int a, int b) const {
    const std::vector<double>& point_a = data[a];
    const std::vector<double>& point_b = data[b];
    double squared = 0.0;
    for (std::size_t i = 0; i < point_a.size(); ++i) {
        const double diff = point_a[i] - point_b[i];
        squared += diff * diff;
    }
    return std::sqrt(squared);
}

inline double HierarchicalClustering::cluster_distance(const Cluster& cluster_a,
                                                       const Cluster& cluster_b) const {
    switch (linkage) {
    case Linkage::SINGLE: {
        // Minimum distance between any two points in the clusters.
        double nearest = std::numeric_limits<double>::max();
        for (const int idx_a : cluster_a.points) {
            for (const int idx_b : cluster_b.points) {
                nearest = std::min(nearest, euclidean_distance(idx_a, idx_b));
            }
        }
        return nearest;
    }
    case Linkage::COMPLETE: {
        // Maximum distance between any two points in the clusters.
        double farthest = 0.0;
        for (const int idx_a : cluster_a.points) {
            for (const int idx_b : cluster_b.points) {
                farthest = std::max(farthest, euclidean_distance(idx_a, idx_b));
            }
        }
        return farthest;
    }
    case Linkage::AVERAGE: {
        // Average distance between all pairs of points in the clusters.
        double total = 0.0;
        for (const int idx_a : cluster_a.points) {
            for (const int idx_b : cluster_b.points) {
                total += euclidean_distance(idx_a, idx_b);
            }
        }
        // The pair count is computed in floating point: an int counter
        // would overflow once the product of the sizes exceeds INT_MAX.
        const double pair_count = static_cast<double>(cluster_a.points.size()) *
                                  static_cast<double>(cluster_b.points.size());
        return total / pair_count;
    }
    }

    // Unknown linkage value (e.g. produced by a cast): treated as distance 0.
    return 0.0;
}

inline void HierarchicalClustering::merge_clusters() {
    // Nothing can be merged without at least two clusters.
    if (clusters.size() < 2) {
        return;
    }

    const auto [idx_a, idx_b] = find_closest_clusters();

    // Merge cluster b into cluster a.
    std::vector<int>& kept = clusters[idx_a].points;
    const std::vector<int>& absorbed = clusters[idx_b].points;
    kept.insert(kept.end(), absorbed.begin(), absorbed.end());

    // Remove cluster b; idx_a < idx_b, so cluster a keeps its position.
    clusters.erase(clusters.begin() + idx_b);
}

inline std::pair<int, int> HierarchicalClustering::find_closest_clusters() const {
    double min_distance = std::numeric_limits<double>::max();
    std::pair<int, int> closest{-1, -1};

    for (std::size_t i = 0; i < clusters.size(); ++i) {
        for (std::size_t j = i + 1; j < clusters.size(); ++j) {
            const double dist = cluster_distance(clusters[i], clusters[j]);
            if (dist < min_distance) {
                // Strict '<' keeps the first pair encountered on ties.
                min_distance = dist;
                closest = {static_cast<int>(i), static_cast<int>(j)};
            } else if (closest.first < 0) {
                // Fallback for non-comparable distances (NaN/inf): remember a
                // valid pair so the caller never receives {-1, -1} when two
                // clusters exist, while still preferring any finite distance.
                closest = {static_cast<int>(i), static_cast<int>(j)};
            }
        }
    }

    return closest;
}
237+
238+
#endif // HIERARCHICAL_CLUSTERING_HPP
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#include "../ml_library_include/ml/clustering/HierarchicalClustering.hpp"
2+
#include <iostream>
3+
#include <vector>
4+
#include <cassert>
5+
#include "../TestUtils.hpp" // Utility file for approxEqual or similar functions
6+
7+
int main() {
8+
// Sample dataset with three distinct groups
9+
std::vector<std::vector<double>> data = {
10+
{1.0, 2.0}, {1.5, 1.8}, {1.0, 0.6}, // Group 1
11+
{5.0, 8.0}, {6.0, 9.0}, // Group 2
12+
{9.0, 11.0}, {8.0, 2.0}, {10.0, 2.0}, {9.0, 3.0} // Group 3
13+
};
14+
15+
// Initialize HierarchicalClustering with 3 clusters
16+
HierarchicalClustering hc(3, HierarchicalClustering::Linkage::AVERAGE);
17+
hc.fit(data);
18+
19+
// Predict cluster labels
20+
std::vector<int> labels = hc.predict();
21+
22+
// Ensure there are three unique clusters
23+
std::vector<size_t> actual_cluster_counts(3, 0);
24+
for (const int label : labels) {
25+
assert(label >= 0 && label < 3 && "Cluster label out of expected range.");
26+
actual_cluster_counts[label]++;
27+
}
28+
29+
// Check that no cluster is empty
30+
for (size_t count : actual_cluster_counts) {
31+
assert(count > 0 && "One of the clusters is empty.");
32+
}
33+
34+
// Expected cluster centers (approximately, for validation)
35+
std::vector<std::vector<double>> expected_centers = {
36+
{1.17, 1.47}, {5.5, 8.5}, {9.0, 4.5} // Approximate expected values
37+
};
38+
39+
// Get actual centers and validate against expected centers
40+
const auto& centers = hc.get_cluster_centers();
41+
bool centers_match = true;
42+
std::cout << "Hierarchical Clustering Centers:" << std::endl;
43+
for (const auto& center : centers) {
44+
std::cout << "Cluster center: (" << center[0] << ", " << center[1] << ")" << std::endl;
45+
bool matched = false;
46+
for (const auto& expected : expected_centers) {
47+
if (approxEqual(center[0], expected[0], 1.0) && approxEqual(center[1], expected[1], 1.0)) {
48+
matched = true;
49+
break;
50+
}
51+
}
52+
centers_match &= matched;
53+
}
54+
55+
assert(centers_match && "Cluster centers do not match expected locations within tolerance.");
56+
57+
// Inform user of successful test
58+
std::cout << "Hierarchical Clustering Test passed." << std::endl;
59+
60+
return 0;
61+
}

0 commit comments

Comments
 (0)