PySATL · VITYANA · Nov 3, 2025 · Nov 3, 2025 · Nov 3, 2025 · Nov 3, 2025
diff --git a/rework_pysatl_mpest/initializers/__init__.py b/rework_pysatl_mpest/initializers/__init__.py
@@ -14,11 +14,11 @@
 
 .. code-block:: python
 
-    >>> from rework_pysatl_mpest import Exponential
-    >>> import numpy as np
+    >>> from rework_pysatl_mpest.distributions.exponential import Exponential
     >>> from sklearn.cluster import KMeans
-    >>> from rework_pysatl_mpest.initializers import ClusterizeInitializer
+    >>> from rework_pysatl_mpest.initializers.clusterize_initializer import ClusterizeInitializer
     >>> from rework_pysatl_mpest.initializers import ClusterMatchStrategy, EstimationStrategy
+    >>> from rework_pysatl_mpest.core.mixture import MixtureModel
 
     >>> # Create initializer with KMeans clustering
     >>> initializer_cluster = ClusterizeInitializer(
@@ -32,7 +32,8 @@
     >>>Exponential(loc=5.0, rate=0.05), Exponential(loc=10.0, rate=0.01)]
 
     >>> # Generate sample data
-    >>> X = np.linspace(0.01, 25.0, 300)
+    >>> mixture = MixtureModel(distributions, [0.3, 0.4, 0.3])
+    >>> X = mixture.generate(300)
 
     >>> # Perform initialization
     >>> mixture_model = initializer_cluster.perform(

diff --git a/rework_pysatl_mpest/initializers/cluster_match_strategy.py b/rework_pysatl_mpest/initializers/cluster_match_strategy.py
@@ -16,6 +16,43 @@
 from rework_pysatl_mpest.optimizers import Optimizer, ScipyNelderMead
 
 
+def _validate_clusters_distributions(
+    H: np.ndarray, models_count: int, estimation_strategies_count: int, min_samples: int
+) -> tuple[list[int], list[float]]:
+    if not np.allclose(np.sum(H, axis=1), 1, atol=1e-10):
+        raise ValueError("Sum of H matrix weights must be equal to 1")
+
+    n_clusters = H.shape[1]
+
+    if estimation_strategies_count != models_count:
+        raise ValueError("Number of estimation functions must match number of models")
+
+    cluster_weights: list[float] = np.sum(H, axis=0)
+
+    valid_clusters = [k for k in range(n_clusters) if cluster_weights[k] >= min_samples]
+    if len(valid_clusters) != models_count:
+        return [], []
+    return valid_clusters, cluster_weights
+
+
+def _calculate_cluster_fit(
+    temp_model: ContinuousDistribution,
+    estimation_func: Callable,
+    X: np.ndarray,
+    H_k: np.ndarray,
+    optimizer: Optimizer,
+) -> tuple[dict[str, float], float]:
+    new_params = estimation_func(temp_model, X, H_k, optimizer)
-    new_params = estimation_func(temp_model, X, H_k, optimizer)
+    param_names, param_values = estimation_func(temp_model, X, H_k, optimizer).items()
-    new_params = estimation_func(temp_model, X, H_k, optimizer)
+    param_names, param_values = estimation_func(temp_model, X, H_k, optimizer).items()
+    param_names = new_params.keys()
+    param_values = new_params.values()
+    temp_model.set_params_from_vector(param_names, param_values)
+
+    log_probs = np.clip(temp_model.lpdf(X), -1e9, -1e-9)
+    weighted_log_likelihood = np.sum(H_k * log_probs)
-    weighted_log_likelihood = np.sum(H_k * log_probs)
+    weighted_log_likelihood = np.dot(H_k, log_probs)
-    weighted_log_likelihood = np.sum(H_k * log_probs)
+    weighted_log_likelihood = np.dot(H_k, log_probs)
+
+    return new_params, weighted_log_likelihood
+
+
 def match_clusters_for_models_log_likelihood(
     models: list[ContinuousDistribution],
     X: np.ndarray,
@@ -77,52 +114,32 @@ def match_clusters_for_models_log_likelihood(
     If insufficient valid clusters are found, returns default parameters and equal weights.
     """
 
-    if not np.allclose(np.sum(H, axis=1), 1, atol=1e-10):
-        raise ValueError("Sum of H matrix weights must be equal to 1")
-
-    X = X.flatten()
-    n_clusters = H.shape[1]
     n_models = len(models)
-
-    if len(estimation_strategies) != n_models:
-        raise ValueError("Number of estimation functions must match number of models")
-
-    updated_params_list = []
-    model_weights = []
-
-    cluster_weights = np.sum(H, axis=0)
-
-    valid_clusters = [k for k in range(n_clusters) if cluster_weights[k] >= min_samples]
-    if len(valid_clusters) != n_models:
+    valid_clusters, cluster_weights = _validate_clusters_distributions(
+        H, n_models, len(estimation_strategies), min_samples
+    )
+    if not (valid_clusters or cluster_weights):
         default_params: list[dict] = [{} for _ in range(n_models)]
         equal_weights = [1.0 / n_models] * n_models
         return models, default_params, equal_weights
-
+    updated_params_list = []
+    model_weights = []
     used_clusters = set()
 
-    for i, (model, estimation_func) in enumerate(zip(models, estimation_strategies)):
+    for model, estimation_func in zip(models, estimation_strategies):
         best_score = -np.inf
         best_params = {}
         best_cluster_weight = 0.0
         best_cluster = None
-        temp_model = copy(model)
-        default_params_names, default_params_values = (
-            list(temp_model.params),
-            temp_model.get_params_vector(list(temp_model.params)),
-        )
 
         for k in valid_clusters:
             if k in used_clusters:
                 continue
-            H_k = H[:, k]
 
-            new_params = estimation_func(temp_model, X, H_k, optimizer)
-            param_names = new_params.keys()
-            param_values = new_params.values()
-            temp_model.set_params_from_vector(param_names, param_values)
-
-            log_probs = np.clip(temp_model.lpdf(X), -1e9, -1e-9)
-            weighted_log_likelihood = np.sum(H_k * log_probs)
+            H_k = H[:, k]
+            new_params, weighted_log_likelihood = _calculate_cluster_fit(
+                copy(model), estimation_func, X, H_k, optimizer
+            )
 
             effective_n = cluster_weights[k]
             score = weighted_log_likelihood / effective_n
@@ -133,8 +150,6 @@ def match_clusters_for_models_log_likelihood(
                 best_cluster_weight = cluster_weights[k] / len(X)
                 best_cluster = k
 
-            temp_model.set_params_from_vector(default_params_names, default_params_values)
-
         used_clusters.add(best_cluster)
         updated_params_list.append(best_params)
         model_weights.append(float(best_cluster_weight))
@@ -201,65 +216,42 @@ def match_clusters_for_models_akaike(
     AIC is calculated as: ``2 * k - 2 * log_likelihood``, where ``k`` is the number of parameters.
     """
 
-    if not np.allclose(np.sum(H, axis=1), 1, atol=1e-10):
-        raise ValueError("Sum of H matrix weights must be equal to 1")
-
-    n_clusters = H.shape[1]
     n_models = len(models)
-
-    if len(estimation_strategies) != n_models:
-        raise ValueError("Number of estimation functions must match number of models")
-
-    aic_scores_dict = {}
-
-    cluster_weights = np.sum(H, axis=0)
-    valid_clusters = [k for k in range(n_clusters) if cluster_weights[k] >= min_samples]
-
-    if len(valid_clusters) != n_models:
-        default_params: list[dict] = [{} for _ in range(n_models)]
+    valid_clusters, cluster_weights = _validate_clusters_distributions(
+        H, n_models, len(estimation_strategies), min_samples
+    )
+    if not (valid_clusters or cluster_weights):
+        default_params: list[dict[str, float]] = [{} for _ in range(n_models)]
         equal_weights = [1.0 / n_models] * n_models
         return models, default_params, equal_weights
 
+    aic_scores_dict: dict[str, dict] = {}
+
     for i, (model, estimation_func) in enumerate(zip(models, estimation_strategies)):
-        temp_model = copy(model)
-        default_params_names, default_params_values = (
-            list(temp_model.params),
-            temp_model.get_params_vector(list(temp_model.params)),
-        )
         for k in valid_clusters:
             H_k = H[:, k]
-
-            new_params = estimation_func(temp_model, X, H_k, optimizer)
-            param_names = new_params.keys()
-            param_values = new_params.values()
-            temp_model.set_params_from_vector(param_names, param_values)
-
-            log_probs = np.clip(temp_model.lpdf(X), -1e9, -1e-9)
-            weighted_log_likelihood = np.sum(H_k * log_probs)
+            new_params, weighted_log_likelihood = _calculate_cluster_fit(
+                copy(model), estimation_func, X, H_k, optimizer
+            )
 
             k_params = len(model.params)
-
             aic_score = 2 * k_params - 2 * weighted_log_likelihood
 
             key = f"{i}_{k}"
             aic_scores_dict[key] = {
                 "aic_score": aic_score,
                 "params": new_params,
                 "cluster_weight": cluster_weights[k] / len(X),
-                "model_idx": i,
-                "cluster_idx": k,
             }
-            temp_model.set_params_from_vector(default_params_names, default_params_values)
 
     best_total_aic = np.inf
-    best_params_assignment = []
-    best_weights_assignment = []
+    best_params_assignment: list[dict[str, float]] = []
+    best_weights_assignment: list[float] = []
 
     for cluster_perm in permutations(valid_clusters, n_models):
-        total_aic = 0
-        params_assignment = []
-        weights_assignment = []
-        valid_assignment = True
+        total_aic = 0.0
+        params_assignment: list[dict[str, float]] = []
+        weights_assignment: list[float] = []
 
         for i, cluster_idx in enumerate(cluster_perm):
             key = f"{i}_{cluster_idx}"
@@ -268,7 +260,7 @@ def match_clusters_for_models_akaike(
             params_assignment.append(data["params"])
             weights_assignment.append(data["cluster_weight"])
 
-        if valid_assignment and total_aic < best_total_aic:
+        if total_aic < best_total_aic:
             best_total_aic = total_aic
             best_params_assignment = params_assignment
             best_weights_assignment = weights_assignment

diff --git a/rework_pysatl_mpest/initializers/clusterize_initializer.py b/rework_pysatl_mpest/initializers/clusterize_initializer.py
@@ -320,6 +320,8 @@ def perform(
         5. Returns the initialized mixture model
         """
         X = np.asarray(X, dtype=np.float64)
+        if X.ndim == 1:
+            X = X.reshape(-1, 1)
         self.models = dists
         self.n_components = len(dists)
         H = self._clusterize(X, self.clusterizer)

diff --git a/rework_tests/unit/test_initializers/test_clusterize_initializer.py b/rework_tests/unit/test_initializers/test_clusterize_initializer.py
@@ -183,7 +183,7 @@ def test_perform_accurate_init_normal_path(self):
     def test_perform_accurate_init_fallback_to_fast_init(self):
         initializer = ClusterizeInitializer(is_accurate=True, is_soft=True, clusterizer=self.mock_clusterizer)
 
-        X = np.array([1.0, 2.0, 3.0])
+        X = np.array([[1.0], [2.0], [3.0]])
         H = np.array([[0.8, 0.2], [0.7, 0.3], [0.9, 0.1]])
         dists = [self.mock_distributions[0], self.mock_distributions[1]]