Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions rework_pysatl_mpest/initializers/__init__.py
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In examples we use imports from modules, not files inside module. That's why there is __init__.py files

Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@

.. code-block:: python

>>> from rework_pysatl_mpest import Exponential
>>> import numpy as np
>>> from rework_pysatl_mpest.distributions.exponential import Exponential
>>> from sklearn.cluster import KMeans
>>> from rework_pysatl_mpest.initializers import ClusterizeInitializer
>>> from rework_pysatl_mpest.initializers.clusterize_initializer import ClusterizeInitializer
>>> from rework_pysatl_mpest.initializers import ClusterMatchStrategy, EstimationStrategy
>>> from rework_pysatl_mpest.core.mixture import MixtureModel

>>> # Create initializer with KMeans clustering
>>> initializer_cluster = ClusterizeInitializer(
Expand All @@ -32,7 +32,8 @@
>>>Exponential(loc=5.0, rate=0.05), Exponential(loc=10.0, rate=0.01)]

>>> # Generate sample data
>>> X = np.linspace(0.01, 25.0, 300)
>>> mixture = MixtureModel(distributions, [0.3, 0.4, 0.3])
>>> X = mixture.generate(300)

>>> # Perform initialization
>>> mixture_model = initializer_cluster.perform(
Expand Down
136 changes: 64 additions & 72 deletions rework_pysatl_mpest/initializers/cluster_match_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,43 @@
from rework_pysatl_mpest.optimizers import Optimizer, ScipyNelderMead


def _validate_clusters_distributions(
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missed docstrings

H: np.ndarray, models_count: int, estimation_strategies_count: int, min_samples: int
) -> tuple[list[int], list[float]]:
if not np.allclose(np.sum(H, axis=1), 1, atol=1e-10):
raise ValueError("Sum of H matrix weights must be equal to 1")

n_clusters = H.shape[1]

if estimation_strategies_count != models_count:
raise ValueError("Number of estimation functions must match number of models")

cluster_weights: list[float] = np.sum(H, axis=0)

valid_clusters = [k for k in range(n_clusters) if cluster_weights[k] >= min_samples]
if len(valid_clusters) != models_count:
return [], []
return valid_clusters, cluster_weights


def _calculate_cluster_fit(
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missed docstrings

temp_model: ContinuousDistribution,
estimation_func: Callable,
X: np.ndarray,
H_k: np.ndarray,
optimizer: Optimizer,
) -> tuple[dict[str, float], float]:
new_params = estimation_func(temp_model, X, H_k, optimizer)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
new_params = estimation_func(temp_model, X, H_k, optimizer)
param_names, param_values = estimation_func(temp_model, X, H_k, optimizer).items()

param_names = new_params.keys()
param_values = new_params.values()
temp_model.set_params_from_vector(param_names, param_values)

log_probs = np.clip(temp_model.lpdf(X), -1e9, -1e-9)
weighted_log_likelihood = np.sum(H_k * log_probs)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
weighted_log_likelihood = np.sum(H_k * log_probs)
weighted_log_likelihood = np.dot(H_k, log_probs)


return new_params, weighted_log_likelihood


def match_clusters_for_models_log_likelihood(
models: list[ContinuousDistribution],
X: np.ndarray,
Expand Down Expand Up @@ -77,52 +114,32 @@ def match_clusters_for_models_log_likelihood(
If insufficient valid clusters are found, returns default parameters and equal weights.
"""

if not np.allclose(np.sum(H, axis=1), 1, atol=1e-10):
raise ValueError("Sum of H matrix weights must be equal to 1")

X = X.flatten()
n_clusters = H.shape[1]
n_models = len(models)

if len(estimation_strategies) != n_models:
raise ValueError("Number of estimation functions must match number of models")

updated_params_list = []
model_weights = []

cluster_weights = np.sum(H, axis=0)

valid_clusters = [k for k in range(n_clusters) if cluster_weights[k] >= min_samples]
if len(valid_clusters) != n_models:
valid_clusters, cluster_weights = _validate_clusters_distributions(
H, n_models, len(estimation_strategies), min_samples
)
if not (valid_clusters or cluster_weights):
default_params: list[dict] = [{} for _ in range(n_models)]
equal_weights = [1.0 / n_models] * n_models
return models, default_params, equal_weights

updated_params_list = []
model_weights = []
used_clusters = set()

for i, (model, estimation_func) in enumerate(zip(models, estimation_strategies)):
for model, estimation_func in zip(models, estimation_strategies):
best_score = -np.inf
best_params = {}
best_cluster_weight = 0.0
best_cluster = None
temp_model = copy(model)
default_params_names, default_params_values = (
list(temp_model.params),
temp_model.get_params_vector(list(temp_model.params)),
)

for k in valid_clusters:
if k in used_clusters:
continue
H_k = H[:, k]

new_params = estimation_func(temp_model, X, H_k, optimizer)
param_names = new_params.keys()
param_values = new_params.values()
temp_model.set_params_from_vector(param_names, param_values)

log_probs = np.clip(temp_model.lpdf(X), -1e9, -1e-9)
weighted_log_likelihood = np.sum(H_k * log_probs)
H_k = H[:, k]
new_params, weighted_log_likelihood = _calculate_cluster_fit(
copy(model), estimation_func, X, H_k, optimizer
)

effective_n = cluster_weights[k]
score = weighted_log_likelihood / effective_n
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is there an additional division here?

Expand All @@ -133,8 +150,6 @@ def match_clusters_for_models_log_likelihood(
best_cluster_weight = cluster_weights[k] / len(X)
best_cluster = k

temp_model.set_params_from_vector(default_params_names, default_params_values)

used_clusters.add(best_cluster)
updated_params_list.append(best_params)
model_weights.append(float(best_cluster_weight))
Expand Down Expand Up @@ -201,65 +216,42 @@ def match_clusters_for_models_akaike(
AIC is calculated as: ``2 * k - 2 * log_likelihood``, where ``k`` is the number of parameters.
"""

if not np.allclose(np.sum(H, axis=1), 1, atol=1e-10):
raise ValueError("Sum of H matrix weights must be equal to 1")

n_clusters = H.shape[1]
n_models = len(models)

if len(estimation_strategies) != n_models:
raise ValueError("Number of estimation functions must match number of models")

aic_scores_dict = {}

cluster_weights = np.sum(H, axis=0)
valid_clusters = [k for k in range(n_clusters) if cluster_weights[k] >= min_samples]

if len(valid_clusters) != n_models:
default_params: list[dict] = [{} for _ in range(n_models)]
valid_clusters, cluster_weights = _validate_clusters_distributions(
H, n_models, len(estimation_strategies), min_samples
)
if not (valid_clusters or cluster_weights):
default_params: list[dict[str, float]] = [{} for _ in range(n_models)]
equal_weights = [1.0 / n_models] * n_models
return models, default_params, equal_weights

aic_scores_dict: dict[str, dict] = {}

for i, (model, estimation_func) in enumerate(zip(models, estimation_strategies)):
temp_model = copy(model)
default_params_names, default_params_values = (
list(temp_model.params),
temp_model.get_params_vector(list(temp_model.params)),
)
for k in valid_clusters:
H_k = H[:, k]

new_params = estimation_func(temp_model, X, H_k, optimizer)
param_names = new_params.keys()
param_values = new_params.values()
temp_model.set_params_from_vector(param_names, param_values)

log_probs = np.clip(temp_model.lpdf(X), -1e9, -1e-9)
weighted_log_likelihood = np.sum(H_k * log_probs)
new_params, weighted_log_likelihood = _calculate_cluster_fit(
copy(model), estimation_func, X, H_k, optimizer
)

k_params = len(model.params)

aic_score = 2 * k_params - 2 * weighted_log_likelihood

key = f"{i}_{k}"
aic_scores_dict[key] = {
"aic_score": aic_score,
"params": new_params,
"cluster_weight": cluster_weights[k] / len(X),
"model_idx": i,
"cluster_idx": k,
}
temp_model.set_params_from_vector(default_params_names, default_params_values)

best_total_aic = np.inf
best_params_assignment = []
best_weights_assignment = []
best_params_assignment: list[dict[str, float]] = []
best_weights_assignment: list[float] = []

for cluster_perm in permutations(valid_clusters, n_models):
total_aic = 0
params_assignment = []
weights_assignment = []
valid_assignment = True
total_aic = 0.0
params_assignment: list[dict[str, float]] = []
weights_assignment: list[float] = []

for i, cluster_idx in enumerate(cluster_perm):
key = f"{i}_{cluster_idx}"
Expand All @@ -268,7 +260,7 @@ def match_clusters_for_models_akaike(
params_assignment.append(data["params"])
weights_assignment.append(data["cluster_weight"])

if valid_assignment and total_aic < best_total_aic:
if total_aic < best_total_aic:
best_total_aic = total_aic
best_params_assignment = params_assignment
best_weights_assignment = weights_assignment
Expand Down
2 changes: 2 additions & 0 deletions rework_pysatl_mpest/initializers/clusterize_initializer.py
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactor the constructor and the perform signature:
The user should pass the default optimizer and clusterer to the constructor, and pass them to perform if they want to use other ones.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is necessary to add a check that the clusterer has the required methods in the constructor and in perform, since the object type is Any

Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,8 @@ def perform(
5. Returns the initialized mixture model
"""
X = np.asarray(X, dtype=np.float64)
if X.ndim == 1:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this inside self._clusterize(...)
Otherwise, your two-dimensional array X will be passed to the rest of the class's internal methods, which seem not to be suited for this.

X = X.reshape(-1, 1)
self.models = dists
self.n_components = len(dists)
H = self._clusterize(X, self.clusterizer)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clusterizer is available from the internal state of the object, why it is in the signature of the method self._clusterize??

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def test_perform_accurate_init_normal_path(self):
def test_perform_accurate_init_fallback_to_fast_init(self):
initializer = ClusterizeInitializer(is_accurate=True, is_soft=True, clusterizer=self.mock_clusterizer)

X = np.array([1.0, 2.0, 3.0])
X = np.array([[1.0], [2.0], [3.0]])
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For what?
This test is not intended to check various X formats.

Besides, why this particular test? It seems to me that tests should be written in the format the user will work with, assuming a one-dimensional array will be passed.

H = np.array([[0.8, 0.2], [0.7, 0.3], [0.9, 0.1]])
dists = [self.mock_distributions[0], self.mock_distributions[1]]

Expand Down