From 34f688a5ea8c22fd7a9661673ceade0ee4c0bfdd Mon Sep 17 00:00:00 2001
From: Martin Ritzert
Date: Mon, 11 Aug 2025 15:02:52 +0200
Subject: [PATCH 1/5] black formatting

---
 bhc/api.py        | 27 +++++++++++++--------------
 bhc/core/bhc.py   | 44 ++++++++++++++++++++++++++------------------
 bhc/core/prior.py | 16 +++++++---------
 3 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/bhc/api.py b/bhc/api.py
index 75d4d46..39bd4fa 100644
--- a/bhc/api.py
+++ b/bhc/api.py
@@ -6,13 +6,15 @@
 
 
 class Result(object):
-    def __init__(self,
-                 arc_list,
-                 node_ids,
-                 last_log_p,
-                 weights,
-                 hierarchy_cut,
-                 n_clusters):
+    def __init__(
+        self,
+        arc_list,
+        node_ids,
+        last_log_p,
+        weights,
+        hierarchy_cut,
+        n_clusters,
+    ):
         self.arc_list = arc_list
         self.node_ids = node_ids
         self.last_log_p = last_log_p
@@ -30,23 +32,20 @@ def __eq__(self, other):
         return self.source == other.source and self.target == other.target
 
     def __repr__(self):
-        return '{0} -> {1}'.format(str(self.source), str(self.target))
+        return "{0} -> {1}".format(str(self.source), str(self.target))
 
 
 class AbstractPrior(ABC):
     @abstractmethod
-    def calc_log_mlh(self, x_mat):
-        ...
+    def calc_log_mlh(self, x_mat): ...
 
 
 class AbstractHierarchicalClustering(ABC):
     @abstractmethod
-    def build(self):
-        ...
+    def build(self): ...
 
 
-class AbstractBayesianBasedHierarchicalClustering(
-        AbstractHierarchicalClustering, ABC):
+class AbstractBayesianBasedHierarchicalClustering(AbstractHierarchicalClustering, ABC):
     def __init__(self, data, model, alpha, cut_allowed):
         self.data = data
         self.model = model
diff --git a/bhc/core/bhc.py b/bhc/core/bhc.py
index 7f13189..c10e4c2 100644
--- a/bhc/core/bhc.py
+++ b/bhc/core/bhc.py
@@ -8,8 +8,7 @@
 import bhc.api as api
 
 
-class BayesianHierarchicalClustering(
-        api.AbstractBayesianBasedHierarchicalClustering):
+class BayesianHierarchicalClustering(api.AbstractBayesianBasedHierarchicalClustering):
     """
     Reference: HELLER, Katherine A.; GHAHRAMANI, Zoubin.
     Bayesian hierarchical clustering.
@@ -26,7 +25,7 @@ def build(self):
 
         weights = []
 
-        # active nodes
+        # active nodes (all)
         active_nodes = np.arange(n_objects)
         # assignments - starting each point in its own cluster
         assignments = np.arange(n_objects)
@@ -41,7 +40,8 @@ def build(self):
         for i in range(n_objects):
             # compute log(d_k)
             log_d[i] = BayesianHierarchicalClustering.__calc_log_d(
-                self.alpha, n[i], None)
+                self.alpha, n[i], None
+            )
             # compute log(p_i)
             log_p[i] = self.model.calc_log_mlh(self.data[i])
 
@@ -54,7 +54,8 @@ def build(self):
                 n_ch = n[i] + n[j]
                 log_d_ch = log_d[i] + log_d[j]
                 log_dk = BayesianHierarchicalClustering.__calc_log_d(
-                    self.alpha, n_ch, log_d_ch)
+                    self.alpha, n_ch, log_d_ch
+                )
                 # compute log(pi_k)
                 log_pik = np.log(self.alpha) + gammaln(n_ch) - log_dk
                 # compute log(p_k)
@@ -67,8 +68,11 @@ def build(self):
                 log_r = r1 - r2
                 # store results
                 merge_info = [i, j, log_r, r1, r2]
-                tmp_merge = merge_info if tmp_merge is None \
+                tmp_merge = (
+                    merge_info
+                    if tmp_merge is None
                     else np.vstack((tmp_merge, merge_info))
+                )
 
         # find clusters to merge
         arc_list = np.empty(0, dtype=api.Arc)
@@ -100,7 +104,8 @@ def build(self):
             # compute log(d_ij)
             log_d_ch = log_d[i] + log_d[j]
             log_d_ij = BayesianHierarchicalClustering.__calc_log_d(
-                self.alpha, n[ij], log_d_ch)
+                self.alpha, n[ij], log_d_ch
+            )
             log_d = np.append(log_d, log_d_ij)
             # update assignments
             assignments[np.argwhere(assignments == i)] = ij
@@ -129,14 +134,15 @@ def build(self):
                 n_ch = n[k] + n[ij]
                 log_d_ch = log_d[k] + log_d[ij]
                 log_dij = BayesianHierarchicalClustering.__calc_log_d(
-                    self.alpha, n_ch, log_d_ch)
+                    self.alpha, n_ch, log_d_ch
+                )
                 # compute log(pi_k)
                 log_pik = np.log(self.alpha) + gammaln(n_ch) - log_dij
                 # compute log(p_k)
-                data_merged = self.data[np.argwhere(
-                    assignments == active_nodes[k]).flatten()]
-                log_p_ij = self.model.calc_log_mlh(
-                    np.vstack((x_mat_ij, data_merged)))
+                data_merged = self.data[
+                    np.argwhere(assignments == active_nodes[k]).flatten()
+                ]
+                log_p_ij = self.model.calc_log_mlh(np.vstack((x_mat_ij, data_merged)))
                 # compute log(r_k)
                 log_p_ch = log_p[ij] + log_p[active_nodes[k]]
                 r1 = log_pik + log_p_ij
@@ -146,12 +152,14 @@ def build(self):
                 merge_info = [ij, active_nodes[k], log_r, r1, r2]
                 tmp_merge = np.vstack((tmp_merge, merge_info))
 
-        return api.Result(arc_list,
-                          np.arange(0, ij + 1),
-                          log_p[-1],
-                          np.array(weights),
-                          hierarchy_cut,
-                          len(np.unique(assignments)))
+        return api.Result(
+            arc_list,
+            np.arange(0, ij + 1),
+            log_p[-1],
+            np.array(weights),
+            hierarchy_cut,
+            len(np.unique(assignments)),
+        )
 
     @staticmethod
     def __calc_log_d(alpha, nk, log_d_ch):
diff --git a/bhc/core/prior.py b/bhc/core/prior.py
index 3390363..52ea4c6 100644
--- a/bhc/core/prior.py
+++ b/bhc/core/prior.py
@@ -31,7 +31,8 @@ def calc_log_mlh(self, x_mat):
         x_mat_l = x_mat_l[np.newaxis] if x_mat_l.ndim == 1 else x_mat_l
         n, d = x_mat_l.shape
         s_mat_p, rp, vp = NormalInverseWishart.__calc_posterior(
-            x_mat_l, self.s_mat, self.r, self.v, self.m)
+            x_mat_l, self.s_mat, self.r, self.v, self.m
+        )
         log_prior = NormalInverseWishart.__calc_log_prior(s_mat_p, rp, vp)
         return log_prior - self.log_prior0 - LOG2PI * (n * d / 2.0)
 
@@ -39,8 +40,7 @@ def __calc_log_prior(s_mat, r, v):
         d = s_mat.shape[0]
         log_prior = LOG2 * (v * d / 2.0) + (d / 2.0) * np.log(2.0 * np.pi / r)
-        log_prior += multigammaln(v / 2.0, d) - \
-            (v / 2.0) * np.log(np.linalg.det(s_mat))
+        log_prior += multigammaln(v / 2.0, d) - (v / 2.0) * np.log(np.linalg.det(s_mat))
         return log_prior
 
     @staticmethod
@@ -49,8 +49,7 @@ def __calc_posterior(x_mat, s_mat, r, v, m):
         x_bar = np.mean(x_mat, axis=0)
         rp = r + n
         vp = v + n
-        s_mat_t = np.zeros(s_mat.shape) if n == 1 else (
-            n - 1) * np.cov(x_mat.T)
+        s_mat_t = np.zeros(s_mat.shape) if n == 1 else (n - 1) * np.cov(x_mat.T)
         dt = (x_bar - m)[np.newaxis]
         s_mat_p = s_mat + s_mat_t + (r * n / rp) * np.dot(dt.T, dt)
         return s_mat_p, rp, vp
@@ -62,7 +61,6 @@ def create(data, g, scale_factor):
         data_matrix_cov = np.cov(data.T)
         scatter_matrix = (data_matrix_cov / g).T
 
-        return NormalInverseWishart(scatter_matrix,
-                                    scale_factor,
-                                    degrees_of_freedom,
-                                    data_mean)
+        return NormalInverseWishart(
+            scatter_matrix, scale_factor, degrees_of_freedom, data_mean
+        )

From b169084bab83918eaa74128242f6a6298ab1031c Mon Sep 17 00:00:00 2001
From: Martin Ritzert
Date: Mon, 11 Aug 2025 15:34:03 +0200
Subject: [PATCH 2/5] BHC speedup (memory optimization and some batching)

---
 bhc/core/bhc.py   | 54 ++++++++++++++++++++++++++---------------------
 bhc/core/prior.py | 47 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 25 deletions(-)

diff --git a/bhc/core/bhc.py b/bhc/core/bhc.py
index c10e4c2..34f8398 100644
--- a/bhc/core/bhc.py
+++ b/bhc/core/bhc.py
@@ -48,7 +48,11 @@ def build(self):
         ij = n_objects - 1
 
         # for every pair of data points
+        pair_count = n_objects * (n_objects - 1) // 2
+        tmp_merge = np.empty((pair_count, 5), dtype=float)
+        row = 0
         for i in range(n_objects):
+            log_p_k_row = self.model.row_of_log_likelihood_for_pairs(self.data, i)
             for j in range(i + 1, n_objects):
                 # compute log(d_k)
                 n_ch = n[i] + n[j]
@@ -59,28 +63,22 @@ def build(self):
                 # compute log(pi_k)
                 log_pik = np.log(self.alpha) + gammaln(n_ch) - log_dk
                 # compute log(p_k)
-                data_merged = np.vstack((self.data[i], self.data[j]))
-                log_p_k = self.model.calc_log_mlh(data_merged)
+                log_p_k = log_p_k_row[j - i - 1]  # since j starts at i + 1
                 # compute log(r_k)
                 log_p_ch = log_p[i] + log_p[j]
                 r1 = log_pik + log_p_k
                 r2 = log_d_ch - log_dk + log_p_ch
                 log_r = r1 - r2
                 # store results
-                merge_info = [i, j, log_r, r1, r2]
-                tmp_merge = (
-                    merge_info
-                    if tmp_merge is None
-                    else np.vstack((tmp_merge, merge_info))
-                )
+                tmp_merge[row] = [i, j, log_r, r1, r2]
+                row += 1
 
         # find clusters to merge
         arc_list = np.empty(0, dtype=api.Arc)
+        data_per_cluster = [np.array([self.data[i]]) for i in range(n_objects)]
         while active_nodes.size > 1:
             # find i, j with the highest probability of the merged hypothesis
-            max_log_rk = np.max(tmp_merge[:, 2])
-            ids_matched = np.argwhere(tmp_merge[:, 2] == max_log_rk)
-            position = np.min(ids_matched)
+            position = np.argmax(tmp_merge[:, 2])  # returns the first occurrence
             i, j, log_r, r1, r2 = tmp_merge[position]
             i = int(i)
             j = int(j)
@@ -91,12 +89,6 @@ def build(self):
                 hierarchy_cut = True
                 break
 
-            # turn nodes i,j off
-            tmp_merge[np.argwhere(tmp_merge[:, 0] == i).flatten(), 2] = -np.inf
-            tmp_merge[np.argwhere(tmp_merge[:, 1] == i).flatten(), 2] = -np.inf
-            tmp_merge[np.argwhere(tmp_merge[:, 0] == j).flatten(), 2] = -np.inf
-            tmp_merge[np.argwhere(tmp_merge[:, 1] == j).flatten(), 2] = -np.inf
-
             # new node ij
             ij = n.size
             n_ch = n[i] + n[j]
@@ -107,7 +99,12 @@ def build(self):
                 self.alpha, n[ij], log_d_ch
             )
             log_d = np.append(log_d, log_d_ij)
-            # update assignments
+            # update cluster assignments
+            data_per_cluster.append(
+                np.vstack((data_per_cluster[i], data_per_cluster[j]))
+            )
+            data_per_cluster[i] = None
+            data_per_cluster[j] = None
             assignments[np.argwhere(assignments == i)] = ij
             assignments[np.argwhere(assignments == j)] = ij
 
@@ -121,6 +118,12 @@ def build(self):
             j_idx = np.argwhere(active_nodes == j).flatten()
             active_nodes = np.delete(active_nodes, [i_idx, j_idx])
             active_nodes = np.append(active_nodes, ij)
+
+            # clean up tmp_merge
+            # keep rows where neither column 0 nor column 1 equals i or j
+            mask = ~np.isin(tmp_merge[:, :2], [i, j]).any(axis=1)
+            tmp_merge = tmp_merge[mask]
+
             # compute log(p_ij)
             t1 = np.maximum(r1, r2)
             t2 = np.minimum(r1, r2)
@@ -128,7 +131,7 @@ def build(self):
             log_p = np.append(log_p, log_p_ij)
 
             # for every pair ij x active
-            x_mat_ij = self.data[np.argwhere(assignments == ij).flatten()]
+            collected_merge_info = np.empty((len(active_nodes) - 1, 5), dtype=float)
             for k in range(active_nodes.size - 1):
                 # compute log(d_k)
                 n_ch = n[k] + n[ij]
@@ -139,18 +142,19 @@ def build(self):
                 # compute log(pi_k)
                 log_pik = np.log(self.alpha) + gammaln(n_ch) - log_dij
                 # compute log(p_k)
-                data_merged = self.data[
-                    np.argwhere(assignments == active_nodes[k]).flatten()
-                ]
-                log_p_ij = self.model.calc_log_mlh(np.vstack((x_mat_ij, data_merged)))
+                data_merged = np.vstack(
+                    (data_per_cluster[ij], data_per_cluster[active_nodes[k]])
+                )
+                log_p_ij = self.model.calc_log_mlh(data_merged)
                 # compute log(r_k)
                 log_p_ch = log_p[ij] + log_p[active_nodes[k]]
                 r1 = log_pik + log_p_ij
                 r2 = log_d_ch - log_dij + log_p_ch
                 log_r = r1 - r2
                 # store results
-                merge_info = [ij, active_nodes[k], log_r, r1, r2]
-                tmp_merge = np.vstack((tmp_merge, merge_info))
+                collected_merge_info[k] = [ij, active_nodes[k], log_r, r1, r2]
+
+            tmp_merge = np.vstack((tmp_merge, collected_merge_info))
 
         return api.Result(
             arc_list,
diff --git a/bhc/core/prior.py b/bhc/core/prior.py
index 52ea4c6..0c17da5 100644
--- a/bhc/core/prior.py
+++ b/bhc/core/prior.py
@@ -36,6 +36,53 @@ def calc_log_mlh(self, x_mat):
         log_prior = NormalInverseWishart.__calc_log_prior(s_mat_p, rp, vp)
         return log_prior - self.log_prior0 - LOG2PI * (n * d / 2.0)
 
+    def row_of_log_likelihood_for_pairs(
+        self,
+        X,  # (N, d) data matrix
+        i,  # index of the row you want (int)
+    ):
+        """
+        Returns 1D array containing the log-likelihoods for pairs of points needed for the
+        initialization of bhc. This function combines i with all other points j > i and returns
+        the log-likelihood of those clusters (containing two points each).
+        """
+        N, d = X.shape
+        if d != self.s_mat.shape[0]:
+            raise ValueError("data dimension and prior scale matrix do not match")
+
+        # ------------------------------------------------------------------
+        # Pairwise sufficient statistics – only for j > i (batched)
+        # ------------------------------------------------------------------
+        # slice of points that matter
+        Xj = X[i + 1 :]  # shape (N-i-1, d)
+        diff = X[i] - Xj  # broadcasted automatically
+        x_bar = 0.5 * (X[i] + Xj)  # (N-i-1, d)
+
+        # Scatter matrix S = ½ diff·diffᵀ → (N-i-1, d, d)
+        S = 0.5 * np.einsum("...i,...j->...ij", diff, diff)
+        # Term (r·2/(r+2))·(x̄‑m)(x̄‑m)ᵀ
+        dt = x_bar - self.m  # (N-i-1, d)
+        outer_dt = np.einsum("...i,...j->...ij", dt, dt)  # (N-i-1, d, d)
+        term = (self.r * 2.0 / (self.r + 2.0)) * outer_dt
+        # Posterior scale matrix for each pair
+        s_mat_p = self.s_mat[None, :, :] + S + term  # (N-i-1, d, d)
+
+        # ------------------------------------------------------------------
+        # Log‑posterior for each pair (batched)
+        # ------------------------------------------------------------------
+        rp = self.r + 2.0  # each cluster has two points
+        vp = self.v + 2.0
+        sign, logdet = slogdet(s_mat_p)  # (N-i-1,)
+        log_prior_post = (
+            LOG2 * (vp * d / 2.0)
+            + (d / 2.0) * np.log(2.0 * np.pi / rp)
+            + multigammaln(vp / 2.0, d)
+            - (vp / 2.0) * logdet
+        )  # (N-i-1,)
+
+        # Final log-likelihood for each pair
+        return log_prior_post - self.log_prior0 - LOG2PI * d  # (N-i-1,)
+
     @staticmethod
     def __calc_log_prior(s_mat, r, v):
         d = s_mat.shape[0]

From 9e6c6c86fae24dde2101d36edf68a932f017a895 Mon Sep 17 00:00:00 2001
From: Martin Ritzert
Date: Mon, 1 Sep 2025 10:25:15 +0200
Subject: [PATCH 3/5] formatting issues

---
 bhc/api.py        |  4 +++-
 bhc/core/bhc.py   | 16 ++++++++++++----
 bhc/core/prior.py | 24 ++++++++++++++--------
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/bhc/api.py b/bhc/api.py
index 39bd4fa..c712907 100644
--- a/bhc/api.py
+++ b/bhc/api.py
@@ -45,7 +45,9 @@ class AbstractHierarchicalClustering(ABC):
     def build(self): ...
 
 
-class AbstractBayesianBasedHierarchicalClustering(AbstractHierarchicalClustering, ABC):
+class AbstractBayesianBasedHierarchicalClustering(
+    AbstractHierarchicalClustering, ABC
+):
     def __init__(self, data, model, alpha, cut_allowed):
         self.data = data
         self.model = model
diff --git a/bhc/core/bhc.py b/bhc/core/bhc.py
index 34f8398..0690d05 100644
--- a/bhc/core/bhc.py
+++ b/bhc/core/bhc.py
@@ -8,7 +8,9 @@
 import bhc.api as api
 
 
-class BayesianHierarchicalClustering(api.AbstractBayesianBasedHierarchicalClustering):
+class BayesianHierarchicalClustering(
+    api.AbstractBayesianBasedHierarchicalClustering
+):
     """
     Reference: HELLER, Katherine A.; GHAHRAMANI, Zoubin.
     Bayesian hierarchical clustering.
@@ -52,7 +54,9 @@ def build(self):
         tmp_merge = np.empty((pair_count, 5), dtype=float)
         row = 0
         for i in range(n_objects):
-            log_p_k_row = self.model.row_of_log_likelihood_for_pairs(self.data, i)
+            log_p_k_row = self.model.row_of_log_likelihood_for_pairs(
+                self.data, i
+            )
             for j in range(i + 1, n_objects):
                 # compute log(d_k)
                 n_ch = n[i] + n[j]
@@ -78,7 +82,9 @@ def build(self):
         data_per_cluster = [np.array([self.data[i]]) for i in range(n_objects)]
         while active_nodes.size > 1:
             # find i, j with the highest probability of the merged hypothesis
-            position = np.argmax(tmp_merge[:, 2])  # returns the first occurrence
+            position = np.argmax(
+                tmp_merge[:, 2]
+            )  # returns the first occurrence
             i, j, log_r, r1, r2 = tmp_merge[position]
             i = int(i)
             j = int(j)
@@ -131,7 +137,9 @@ def build(self):
             log_p = np.append(log_p, log_p_ij)
 
             # for every pair ij x active
-            collected_merge_info = np.empty((len(active_nodes) - 1, 5), dtype=float)
+            collected_merge_info = np.empty(
+                (len(active_nodes) - 1, 5), dtype=float
+            )
             for k in range(active_nodes.size - 1):
                 # compute log(d_k)
                 n_ch = n[k] + n[ij]
diff --git a/bhc/core/prior.py b/bhc/core/prior.py
index 0c17da5..5131ff2 100644
--- a/bhc/core/prior.py
+++ b/bhc/core/prior.py
@@ -16,7 +16,8 @@ class NormalInverseWishart(AbstractPrior):
     Reference: MURPHY, Kevin P.
     Conjugate Bayesian analysis of the Gaussian distribution.
     def, v. 1, n. 2σ2, p. 16, 2007.
-    https://www.cse.iitk.ac.in/users/piyush/courses/tpmi_winter19/readings/bayesGauss.pdf
+    https://www.cse.iitk.ac.in/users/piyush/courses/
+    tpmi_winter19/readings/bayesGauss.pdf
     """
 
     def __init__(self, s_mat, r, v, m):
@@ -42,13 +43,16 @@ def row_of_log_likelihood_for_pairs(
         i,  # index of the row you want (int)
     ):
         """
-        Returns 1D array containing the log-likelihoods for pairs of points needed for the
-        initialization of bhc. This function combines i with all other points j > i and returns
-        the log-likelihood of those clusters (containing two points each).
+        Returns 1D array containing the log-likelihoods for pairs of points
+        needed for the initialization of bhc. This function combines i with
+        all other points j > i and returns the log-likelihood of those
+        clusters (containing two points each).
         """
         N, d = X.shape
         if d != self.s_mat.shape[0]:
-            raise ValueError("data dimension and prior scale matrix do not match")
+            raise ValueError(
+                "data dimension and prior scale matrix do not match"
+            )
 
         # ------------------------------------------------------------------
         # Pairwise sufficient statistics – only for j > i (batched)
@@ -72,7 +76,7 @@ def row_of_log_likelihood_for_pairs(
         # ------------------------------------------------------------------
         rp = self.r + 2.0  # each cluster has two points
         vp = self.v + 2.0
-        sign, logdet = slogdet(s_mat_p)  # (N-i-1,)
+        sign, logdet = np.linalg.slogdet(s_mat_p)  # (N-i-1,)
         log_prior_post = (
             LOG2 * (vp * d / 2.0)
             + (d / 2.0) * np.log(2.0 * np.pi / rp)
@@ -87,7 +91,9 @@ def __calc_log_prior(s_mat, r, v):
         d = s_mat.shape[0]
         log_prior = LOG2 * (v * d / 2.0) + (d / 2.0) * np.log(2.0 * np.pi / r)
-        log_prior += multigammaln(v / 2.0, d) - (v / 2.0) * np.log(np.linalg.det(s_mat))
+        log_prior += multigammaln(v / 2.0, d) - (v / 2.0) * np.log(
+            np.linalg.det(s_mat)
+        )
         return log_prior
 
     @staticmethod
@@ -96,7 +102,9 @@ def __calc_posterior(x_mat, s_mat, r, v, m):
         x_bar = np.mean(x_mat, axis=0)
         rp = r + n
         vp = v + n
-        s_mat_t = np.zeros(s_mat.shape) if n == 1 else (n - 1) * np.cov(x_mat.T)
+        s_mat_t = (
+            np.zeros(s_mat.shape) if n == 1 else (n - 1) * np.cov(x_mat.T)
+        )
         dt = (x_bar - m)[np.newaxis]
         s_mat_p = s_mat + s_mat_t + (r * n / rp) * np.dot(dt.T, dt)
         return s_mat_p, rp, vp

From 94c00f44b793678c8162262e906af220b18acf8f Mon Sep 17 00:00:00 2001
From: Martin Ritzert
Date: Mon, 1 Sep 2025 10:54:57 +0200
Subject: [PATCH 4/5] more formatting, now ignoring flake8 W503

---
 bhc/api.py        | 6 ++++--
 bhc/core/prior.py | 2 +-
 setup.cfg         | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/bhc/api.py b/bhc/api.py
index c712907..42cb614 100644
--- a/bhc/api.py
+++ b/bhc/api.py
@@ -37,12 +37,14 @@ def __repr__(self):
 
 class AbstractPrior(ABC):
     @abstractmethod
-    def calc_log_mlh(self, x_mat): ...
+    def calc_log_mlh(self, x_mat):
+        ...
 
 
 class AbstractHierarchicalClustering(ABC):
     @abstractmethod
-    def build(self): ...
+    def build(self):
+        ...
 
 
 class AbstractBayesianBasedHierarchicalClustering(
diff --git a/bhc/core/prior.py b/bhc/core/prior.py
index 5131ff2..53db134 100644
--- a/bhc/core/prior.py
+++ b/bhc/core/prior.py
@@ -58,7 +58,7 @@ def row_of_log_likelihood_for_pairs(
         # Pairwise sufficient statistics – only for j > i (batched)
         # ------------------------------------------------------------------
         # slice of points that matter
-        Xj = X[i + 1 :]  # shape (N-i-1, d)
+        Xj = X[i + 1:]  # shape (N-i-1, d)
         diff = X[i] - Xj  # broadcasted automatically
         x_bar = 0.5 * (X[i] + Xj)  # (N-i-1, d)
diff --git a/setup.cfg b/setup.cfg
index 73c30c6..d7289e9 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,7 +9,7 @@ parentdir_prefix = bhc-
 [flake8]
 max-line-length = 79
 exclude = dist,build,versioneer.py,bhc/_*,setup.py
-ignore = BLK100
+extend-ignore = BLK100,W503
 max_complexity = 15
 
 [tool:pytest]

From 8fc0bf5f9655f0470e906bd0c9f15517576875ab Mon Sep 17 00:00:00 2001
From: Martin Ritzert
Date: Mon, 1 Sep 2025 11:08:30 +0200
Subject: [PATCH 5/5] flake8 config

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index d7289e9..0ba231e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -9,7 +9,7 @@ parentdir_prefix = bhc-
 [flake8]
 max-line-length = 79
 exclude = dist,build,versioneer.py,bhc/_*,setup.py
-extend-ignore = BLK100,W503
+extend-ignore = BLK100
 max_complexity = 15
 
 [tool:pytest]