21 changes: 12 additions & 9 deletions bhc/api.py
@@ -6,13 +6,15 @@


class Result(object):
def __init__(self,
arc_list,
node_ids,
last_log_p,
weights,
hierarchy_cut,
n_clusters):
def __init__(
self,
arc_list,
node_ids,
last_log_p,
weights,
hierarchy_cut,
n_clusters,
):
self.arc_list = arc_list
self.node_ids = node_ids
self.last_log_p = last_log_p
@@ -30,7 +32,7 @@ def __eq__(self, other):
return self.source == other.source and self.target == other.target

def __repr__(self):
return '{0} -> {1}'.format(str(self.source), str(self.target))
return "{0} -> {1}".format(str(self.source), str(self.target))


class AbstractPrior(ABC):
@@ -46,7 +48,8 @@ def build(self):


class AbstractBayesianBasedHierarchicalClustering(
AbstractHierarchicalClustering, ABC):
AbstractHierarchicalClustering, ABC
):
def __init__(self, data, model, alpha, cut_allowed):
self.data = data
self.model = model
90 changes: 55 additions & 35 deletions bhc/core/bhc.py
@@ -9,7 +9,8 @@


class BayesianHierarchicalClustering(
api.AbstractBayesianBasedHierarchicalClustering):
api.AbstractBayesianBasedHierarchicalClustering
):
"""
Reference: HELLER, Katherine A.; GHAHRAMANI, Zoubin.
Bayesian hierarchical clustering.
@@ -26,7 +27,7 @@ def build(self):

weights = []

# active nodes
# active nodes (all)
active_nodes = np.arange(n_objects)
# assignments - starting each point in its own cluster
assignments = np.arange(n_objects)
@@ -41,42 +42,49 @@
for i in range(n_objects):
# compute log(d_k)
log_d[i] = BayesianHierarchicalClustering.__calc_log_d(
self.alpha, n[i], None)
self.alpha, n[i], None
)
# compute log(p_i)
log_p[i] = self.model.calc_log_mlh(self.data[i])

ij = n_objects - 1

# for every pair of data points
pair_count = n_objects * (n_objects - 1) // 2
tmp_merge = np.empty((pair_count, 5), dtype=float)
row = 0
for i in range(n_objects):
log_p_k_row = self.model.row_of_log_likelihood_for_pairs(
self.data, i
)
for j in range(i + 1, n_objects):
# compute log(d_k)
n_ch = n[i] + n[j]
log_d_ch = log_d[i] + log_d[j]
log_dk = BayesianHierarchicalClustering.__calc_log_d(
self.alpha, n_ch, log_d_ch)
self.alpha, n_ch, log_d_ch
)
# compute log(pi_k)
log_pik = np.log(self.alpha) + gammaln(n_ch) - log_dk
# compute log(p_k)
data_merged = np.vstack((self.data[i], self.data[j]))
log_p_k = self.model.calc_log_mlh(data_merged)
log_p_k = log_p_k_row[j - i - 1] # since j starts at i + 1
# compute log(r_k)
log_p_ch = log_p[i] + log_p[j]
r1 = log_pik + log_p_k
r2 = log_d_ch - log_dk + log_p_ch
log_r = r1 - r2
# store results
merge_info = [i, j, log_r, r1, r2]
tmp_merge = merge_info if tmp_merge is None \
else np.vstack((tmp_merge, merge_info))
tmp_merge[row] = [i, j, log_r, r1, r2]
row += 1

# find clusters to merge
arc_list = np.empty(0, dtype=api.Arc)
data_per_cluster = [np.array([self.data[i]]) for i in range(n_objects)]
while active_nodes.size > 1:
# find i, j with the highest probability of the merged hypothesis
max_log_rk = np.max(tmp_merge[:, 2])
ids_matched = np.argwhere(tmp_merge[:, 2] == max_log_rk)
position = np.min(ids_matched)
position = np.argmax(
tmp_merge[:, 2]
) # returns the first occurrence
i, j, log_r, r1, r2 = tmp_merge[position]
i = int(i)
j = int(j)
@@ -87,22 +95,22 @@ def build(self):
hierarchy_cut = True
break

# turn nodes i,j off
tmp_merge[np.argwhere(tmp_merge[:, 0] == i).flatten(), 2] = -np.inf
tmp_merge[np.argwhere(tmp_merge[:, 1] == i).flatten(), 2] = -np.inf
tmp_merge[np.argwhere(tmp_merge[:, 0] == j).flatten(), 2] = -np.inf
tmp_merge[np.argwhere(tmp_merge[:, 1] == j).flatten(), 2] = -np.inf

# new node ij
ij = n.size
n_ch = n[i] + n[j]
n = np.append(n, n_ch)
# compute log(d_ij)
log_d_ch = log_d[i] + log_d[j]
log_d_ij = BayesianHierarchicalClustering.__calc_log_d(
self.alpha, n[ij], log_d_ch)
self.alpha, n[ij], log_d_ch
)
log_d = np.append(log_d, log_d_ij)
# update assignments
# update cluster assignments
data_per_cluster.append(
np.vstack((data_per_cluster[i], data_per_cluster[j]))
)
data_per_cluster[i] = None
data_per_cluster[j] = None
Comment on lines +112 to +113
Copilot AI Sep 1, 2025
Setting merged cluster data to None creates potential for accessing None values later. Consider using a dictionary or set to track active clusters instead of relying on None checks.

Author
Just checking in: I don't think either of the details found by Copilot is quite relevant, but in case you think they are, I'll happily fix them.
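
For readers weighing the suggestion, here is a minimal sketch of the dictionary-based tracking the review comment describes; the placeholder data and the merge_clusters helper are hypothetical and not part of this PR:

    import numpy as np

    # placeholder data standing in for self.data
    data = np.random.default_rng(0).normal(size=(5, 2))
    n_objects = data.shape[0]

    # per-cluster data keyed by node id: merged children are deleted outright,
    # so there are no None entries left to guard against
    data_per_cluster = {i: data[i][np.newaxis] for i in range(n_objects)}

    def merge_clusters(i, j, ij):
        # stack the children's points under the new node id, then drop the children
        data_per_cluster[ij] = np.vstack(
            (data_per_cluster[i], data_per_cluster[j])
        )
        del data_per_cluster[i], data_per_cluster[j]

That said, the list-plus-None version in the PR only ever indexes data_per_cluster with ids still present in active_nodes, so the None entries are never read in practice, which supports the author's point.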

assignments[np.argwhere(assignments == i)] = ij
assignments[np.argwhere(assignments == j)] = ij

@@ -116,42 +124,54 @@
j_idx = np.argwhere(active_nodes == j).flatten()
active_nodes = np.delete(active_nodes, [i_idx, j_idx])
active_nodes = np.append(active_nodes, ij)

# clean up tmp_merge
# keep rows where neither column 0 nor column 1 equals i or j
mask = ~np.isin(tmp_merge[:, :2], [i, j]).any(axis=1)
tmp_merge = tmp_merge[mask]

# compute log(p_ij)
t1 = np.maximum(r1, r2)
t2 = np.minimum(r1, r2)
log_p_ij = t1 + np.log(1 + np.exp(t2 - t1))
log_p = np.append(log_p, log_p_ij)

# for every pair ij x active
x_mat_ij = self.data[np.argwhere(assignments == ij).flatten()]
collected_merge_info = np.empty(
(len(active_nodes) - 1, 5), dtype=float
)
for k in range(active_nodes.size - 1):
# compute log(d_k)
n_ch = n[k] + n[ij]
log_d_ch = log_d[k] + log_d[ij]
log_dij = BayesianHierarchicalClustering.__calc_log_d(
self.alpha, n_ch, log_d_ch)
self.alpha, n_ch, log_d_ch
)
# compute log(pi_k)
log_pik = np.log(self.alpha) + gammaln(n_ch) - log_dij
# compute log(p_k)
data_merged = self.data[np.argwhere(
assignments == active_nodes[k]).flatten()]
log_p_ij = self.model.calc_log_mlh(
np.vstack((x_mat_ij, data_merged)))
data_merged = np.vstack(
(data_per_cluster[ij], data_per_cluster[active_nodes[k]])
)
log_p_ij = self.model.calc_log_mlh(data_merged)
# compute log(r_k)
log_p_ch = log_p[ij] + log_p[active_nodes[k]]
r1 = log_pik + log_p_ij
r2 = log_d_ch - log_dij + log_p_ch
log_r = r1 - r2
# store results
merge_info = [ij, active_nodes[k], log_r, r1, r2]
tmp_merge = np.vstack((tmp_merge, merge_info))

return api.Result(arc_list,
np.arange(0, ij + 1),
log_p[-1],
np.array(weights),
hierarchy_cut,
len(np.unique(assignments)))
collected_merge_info[k] = [ij, active_nodes[k], log_r, r1, r2]

tmp_merge = np.vstack((tmp_merge, collected_merge_info))

return api.Result(
arc_list,
np.arange(0, ij + 1),
log_p[-1],
np.array(weights),
hierarchy_cut,
len(np.unique(assignments)),
)

@staticmethod
def __calc_log_d(alpha, nk, log_d_ch):
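With these changes the clustering entry point is unchanged; a minimal usage sketch, assuming NormalInverseWishart.create is the static constructor shown at the end of prior.py below, that Result exposes its constructor arguments as attributes, and with illustrative (untuned) values for alpha, g, and scale_factor:

    import numpy as np

    from bhc.core.bhc import BayesianHierarchicalClustering
    from bhc.core.prior import NormalInverseWishart

    data = np.random.default_rng(0).normal(size=(20, 2))

    # conjugate prior built from the data itself
    model = NormalInverseWishart.create(data, g=10.0, scale_factor=0.001)

    bhc = BayesianHierarchicalClustering(data, model, alpha=1.0, cut_allowed=False)
    result = bhc.build()

    print(result.n_clusters, result.hierarchy_cut)
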
73 changes: 63 additions & 10 deletions bhc/core/prior.py
@@ -16,7 +16,8 @@ class NormalInverseWishart(AbstractPrior):
Reference: MURPHY, Kevin P.
Conjugate Bayesian analysis of the Gaussian distribution.
def, v. 1, n. 2σ2, p. 16, 2007.
https://www.cse.iitk.ac.in/users/piyush/courses/tpmi_winter19/readings/bayesGauss.pdf
https://www.cse.iitk.ac.in/users/piyush/courses/
tpmi_winter19/readings/bayesGauss.pdf
"""

def __init__(self, s_mat, r, v, m):
@@ -31,16 +32,68 @@ def calc_log_mlh(self, x_mat):
x_mat_l = x_mat_l[np.newaxis] if x_mat_l.ndim == 1 else x_mat_l
n, d = x_mat_l.shape
s_mat_p, rp, vp = NormalInverseWishart.__calc_posterior(
x_mat_l, self.s_mat, self.r, self.v, self.m)
x_mat_l, self.s_mat, self.r, self.v, self.m
)
log_prior = NormalInverseWishart.__calc_log_prior(s_mat_p, rp, vp)
return log_prior - self.log_prior0 - LOG2PI * (n * d / 2.0)

def row_of_log_likelihood_for_pairs(
self,
X, # (N, d) data matrix
i, # index of the row you want (int)
):
"""
Returns 1D array containing the log-likelihoods for pairs of points
needed for the initialization of bhc. This function combines i with
all other points j > i and returns the log-likelihood of those
clusters (containing two points each).
"""
N, d = X.shape
if d != self.s_mat.shape[0]:
raise ValueError(
"data dimension and prior scale matrix do not match"
)

# ------------------------------------------------------------------
# Pairwise sufficient statistics – only for j > i (batched)
# ------------------------------------------------------------------
# slice of points that matter
Xj = X[i + 1:] # shape (N-i-1, d)
diff = X[i] - Xj # broadcasted automatically
x_bar = 0.5 * (X[i] + Xj) # (N-i-1, d)

# Scatter matrix S = ½ diff·diffᵀ → (N-i-1, d, d)
S = 0.5 * np.einsum("...i,...j->...ij", diff, diff)
# Term (r·2/(r+2))·(x̄‑m)(x̄‑m)ᵀ
dt = x_bar - self.m # (N-i-1, d)
outer_dt = np.einsum("...i,...j->...ij", dt, dt) # (N-i-1, d, d)
term = (self.r * 2.0 / (self.r + 2.0)) * outer_dt
# Posterior scale matrix for each pair
s_mat_p = self.s_mat[None, :, :] + S + term # (N-i-1, d, d)

# ------------------------------------------------------------------
# Log‑posterior for each pair (batched)
# ------------------------------------------------------------------
rp = self.r + 2.0 # each cluster has two points
vp = self.v + 2.0
sign, logdet = np.linalg.slogdet(s_mat_p) # (N-i-1,)
Copilot AI Sep 1, 2025
The variable sign is computed but never used. If the determinant can be negative or zero, this could lead to incorrect log-likelihood calculations since the subsequent computation assumes positive determinants.

Suggested change
sign, logdet = np.linalg.slogdet(s_mat_p) # (N-i-1,)
sign, logdet = np.linalg.slogdet(s_mat_p) # (N-i-1,)
if not np.all(sign > 0):
raise ValueError("Posterior scale matrix is not positive-definite for some pairs (determinant <= 0).")

log_prior_post = (
LOG2 * (vp * d / 2.0)
+ (d / 2.0) * np.log(2.0 * np.pi / rp)
+ multigammaln(vp / 2.0, d)
- (vp / 2.0) * logdet
) # (N-i-1,)

# Final log-likelihood for each pair
return log_prior_post - self.log_prior0 - LOG2PI * d # (N-i-1,)

@staticmethod
def __calc_log_prior(s_mat, r, v):
d = s_mat.shape[0]
log_prior = LOG2 * (v * d / 2.0) + (d / 2.0) * np.log(2.0 * np.pi / r)
log_prior += multigammaln(v / 2.0, d) - \
(v / 2.0) * np.log(np.linalg.det(s_mat))
log_prior += multigammaln(v / 2.0, d) - (v / 2.0) * np.log(
np.linalg.det(s_mat)
)
return log_prior

@staticmethod
@@ -49,8 +102,9 @@ def __calc_posterior(x_mat, s_mat, r, v, m):
x_bar = np.mean(x_mat, axis=0)
rp = r + n
vp = v + n
s_mat_t = np.zeros(s_mat.shape) if n == 1 else (
n - 1) * np.cov(x_mat.T)
s_mat_t = (
np.zeros(s_mat.shape) if n == 1 else (n - 1) * np.cov(x_mat.T)
)
dt = (x_bar - m)[np.newaxis]
s_mat_p = s_mat + s_mat_t + (r * n / rp) * np.dot(dt.T, dt)
return s_mat_p, rp, vp
@@ -62,7 +116,6 @@ def create(data, g, scale_factor):
data_matrix_cov = np.cov(data.T)
scatter_matrix = (data_matrix_cov / g).T

return NormalInverseWishart(scatter_matrix,
scale_factor,
degrees_of_freedom,
data_mean)
return NormalInverseWishart(
scatter_matrix, scale_factor, degrees_of_freedom, data_mean
)
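
A quick way to check that the new batched helper agrees with the existing per-pair path is to compare the two directly; a minimal sketch with illustrative data and hyperparameters:

    import numpy as np

    from bhc.core.prior import NormalInverseWishart

    rng = np.random.default_rng(0)
    data = rng.normal(size=(8, 3))
    prior = NormalInverseWishart.create(data, g=10.0, scale_factor=0.001)

    i = 0
    row = prior.row_of_log_likelihood_for_pairs(data, i)
    for offset, j in enumerate(range(i + 1, data.shape[0])):
        pair = np.vstack((data[i], data[j]))
        # each batched entry should match calc_log_mlh on the two-point cluster
        assert np.isclose(row[offset], prior.calc_log_mlh(pair))
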
2 changes: 1 addition & 1 deletion setup.cfg
@@ -9,7 +9,7 @@ parentdir_prefix = bhc-
[flake8]
max-line-length = 79
exclude = dist,build,versioneer.py,bhc/_*,setup.py
ignore = BLK100
extend-ignore = BLK100
max_complexity = 15

[tool:pytest]