From 4862341e3b1ec84a9dead15852df55c179f7fe92 Mon Sep 17 00:00:00 2001
From: alistkova
Date: Wed, 26 Mar 2025 19:41:48 +0300
Subject: [PATCH 01/31] feat(kliep): rewrite kliep algorithm implementation

---
 pysatl_cpd/core/algorithms/kliep_algorithm.py | 176 ++++++++++++------
 1 file changed, 119 insertions(+), 57 deletions(-)

diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py
index 39eb255c..beffca43 100644
--- a/pysatl_cpd/core/algorithms/kliep_algorithm.py
+++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py
@@ -1,82 +1,144 @@
-from typing import cast
+"""
+Module for implementation of CPD algorithm using KLIEP-based divergence estimation.
+"""
+
+__author__ = "Aleksandra Listkova"
+__copyright__ = "Copyright (c) 2025 Aleksandra Listkova"
+__license__ = "SPDX-License-Identifier: MIT"
 
 import numpy as np
 import numpy.typing as npt
-from numpy import dtype, float64, ndarray
+from scipy.optimize import minimize
 
+from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm
 from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm
 
 
-class KliepAlgorithm(DensityBasedAlgorithm):
-    """Kullback-Leibler Importance Estimation Procedure (KLIEP) algorithm
-    for change point detection.
-
-    KLIEP estimates the density ratio between two distributions and uses
-    the importance weights for detecting changes in the data distribution.
-    """
+class KliepAlgorithm(Algorithm):
+    def __init__(
+        self,
+        bandwidth: float = 1.0,
+        regularization: float = 0.1,
+        threshold: float = 1.1,
+        max_iter: int = 100,
+        min_window_size: int = 10
+    ):
+        """
+        Initializes a new instance of KLIEP based change point detection algorithm.
 
-    def __init__(self, bandwidth: float, regularization_coef: float, threshold: float = 1.1):
-        """Initialize the KLIEP algorithm.
+        :param bandwidth: the bandwidth parameter for the kernel density estimation.
+        :param regularization: L2 regularization coefficient for the KLIEP optimization.
+        :param threshold: detection threshold for significant change points.
+        :param max_iter: maximum number of iterations for the L-BFGS-B optimizer.
+        :param min_window_size: minimum size of data segments to consider.
+        """
+        self.bandwidth = bandwidth
+        self.regularization = regularization
+        self.threshold = threshold
+        self.max_iter = max_iter
+        self.min_window_size = min_window_size
 
-        Args:
-            bandwidth (float): bandwidth parameter for density estimation.
-            regularization_coef (float): regularization parameter.
-            threshold (float, optional): threshold for detecting change points.
-                Defaults to 1.1.
+    def detect(self, window: npt.NDArray[np.float64]) -> int:
         """
-        self.bandwidth = bandwidth
-        self.regularization_coef = regularization_coef
-        self.threshold = np.float64(threshold)
+        Finds change points in the given window.
 
-    def _loss_function(self, density_ratio: npt.NDArray[np.float64], alpha: npt.NDArray[np.float64]) -> float:
-        """Loss function for KLIEP.
+        :param window: input data window for change point detection.
+        :return: number of detected change points in the window.
+        """
+        return len(self.localize(window))
 
-        Args:
-            density_ratio (np.ndarray): estimated density ratio.
-            alpha (np.ndarray): coefficients for the density ratio.
+    def localize(self, window: npt.NDArray[np.float64]) -> list[int]:
+        """
+        Identifies and returns the locations of change points in the window.
 
-        Returns:
-            float: the computed loss value.
+        :param window: input data window for change point localization.
+        :return: list of indices where change points were detected.
         """
-        return -np.mean(density_ratio) + self.regularization_coef * np.sum(alpha**2)
+        window = self._validate_window(window)
+        if len(window) < self.min_window_size:
+            return []
 
-    def detect(self, window: npt.NDArray[np.float64]) -> int:
-        """Detect the number of change points in the given data window
-        using KLIEP.
+        scores = self._compute_kliep_scores(window)
+        return self._find_change_points(scores)
 
-        Args:
-            window (Iterable[float]): the data window to detect change points.
+    def _validate_window(self, window: np.ndarray) -> np.ndarray:
+        """
+        Validates and prepares the input window for processing.
 
-        Returns:
-            int: the number of detected change points.
+        :param window: input data window.
+        :return: validated window in 2D format.
         """
+        window = np.asarray(window)
+        if window.ndim == 1:
+            window = window.reshape(-1, 1)
+        return window
 
-        window_sample = np.array(window)
-        weights = self._calculate_weights(
-            test_value=window_sample,
-            reference_value=window_sample,
-            bandwidth=self.bandwidth,
-            objective_function=self._loss_function,
-        )
+    def _compute_kliep_scores(self, window: np.ndarray) -> np.ndarray:
+        """
+        Computes KLIEP anomaly scores for each point in the window.
 
-        return np.count_nonzero(weights > self.threshold)
+        :param window: validated input data window.
+        :return: array of KLIEP scores for each point.
+        """
+        n_points = window.shape[0]
+        scores = np.zeros(n_points)
+
+        for i in range(self.min_window_size, n_points - self.min_window_size):
+            before = window[:i]
+            after = window[i:]
+
+            before_density = DensityBasedAlgorithm._kernel_density_estimation(
+                before, self.bandwidth
+            )
+            after_density = DensityBasedAlgorithm._kernel_density_estimation(
+                after, self.bandwidth
+            )
+
+            alpha = self._optimize_alpha(after_density, before_density)
+            scores[i] = np.mean(np.exp(after_density - before_density - alpha))
+
+        return scores
+
+    def _optimize_alpha(
+        self,
+        test_density: np.ndarray,
+        ref_density: np.ndarray
+    ) -> np.ndarray:
+        """
+        Optimizes the alpha parameters for KLIEP density ratio estimation.
 
-    def localize(self, window: npt.NDArray[np.float64]) -> list[int]:
-        """Localize the change points in the given data window using KLIEP.
+        :param test_density: density estimates for the test window.
+        :param ref_density: density estimates for the reference window.
+        :return: optimized alpha parameters.
+        """
+        def loss(alpha: np.ndarray) -> float:
+            """Objective function for KLIEP optimization."""
+            ratio = np.exp(test_density - ref_density - alpha)
+            return -np.mean(np.log(ratio)) + self.regularization * np.sum(alpha**2)
+
+        res = minimize(
+            loss,
+            np.zeros_like(test_density),
+            method='L-BFGS-B',
+            options={'maxiter': self.max_iter},
+            bounds=[(0, None)] * len(test_density)
+        )
+        return res.x
 
-        Args:
-            window (Iterable[float]): the data window to localize
-            change points.
+    def _find_change_points(self, scores: np.ndarray) -> list[int]:
+        """
+        Identifies change points from computed KLIEP scores.
 
-        Returns:
-            List[int]: the indices of the detected change points.
+        :param scores: array of KLIEP scores for each point.
+        :return: list of detected change point indices.
         """
-        window_sample = np.array(window)
-        weights: ndarray[tuple[int, ...], dtype[float64]] = self._calculate_weights(
-            test_value=window_sample,
-            reference_value=window_sample,
-            bandwidth=self.bandwidth,
-            objective_function=self._loss_function,
-        )
+        candidates = np.where(scores > self.threshold)[0]
+        if len(candidates) == 0:
+            return []
+
+        change_points = [candidates[0]]
+        for point in candidates[1:]:
+            if point - change_points[-1] > self.min_window_size:
+                change_points.append(point)
 
-        return cast(list[int], np.where(weights > self.threshold)[0].tolist())
+        return change_points
From f6348e92b3cb1f2352215462218e69a4ef90f9a0 Mon Sep 17 00:00:00 2001
From: Alexey
Date: Tue, 18 Mar 2025 03:02:29 +0300
Subject: [PATCH 02/31] refactor(algorithms): rebase refactoring changes.

---
 .../algorithms/bayesian_online_algorithm.py   | 165 ++++++++++++++++++
 .../core/algorithms/online_algorithm.py       |  33 ++++
 .../test_bayesian_online_algorithm.py         | 150 ++++++++++++++++
 3 files changed, 348 insertions(+)
 create mode 100644 pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
 create mode 100644 pysatl_cpd/core/algorithms/online_algorithm.py
 create mode 100644 tests/test_core/test_algorithms/test_bayesian_online_algorithm.py

diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
new file mode 100644
index 00000000..3afa9890
--- /dev/null
+++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
@@ -0,0 +1,165 @@
+"""
+Module for Bayesian online change point detection algorithm.
+"""
+
+__author__ = "Alexey Tatyanenko"
+__copyright__ = "Copyright (c) 2025 Alexey Tatyanenko"
+__license__ = "SPDX-License-Identifier: MIT"
+
+from typing import List
+
+import numpy as np
+import numpy.typing as npt
+
+from pysatl_cpd.core.algorithms.bayesian.abstracts.idetector import IDetector
+from pysatl_cpd.core.algorithms.bayesian.abstracts.ihazard import IHazard
+from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood
+from pysatl_cpd.core.algorithms.bayesian.abstracts.ilocalizer import ILocalizer
+from pysatl_cpd.core.algorithms.online_algorithm import OnlineCpdAlgorithm
+
+
+class BayesianOnlineCpd(OnlineCpdAlgorithm):
+    """
+    Class for Bayesian online change point detection algorithm.
+    """
+
+    def __init__(
+            self,
+            hazard: IHazard,
+            likelihood: ILikelihood,
+            learning_sample_size: int,
+            detector: IDetector,
+            localizer: ILocalizer,
+    ) -> None:
+        self.detector = detector
+        self.hazard = hazard
+        self.likelihood = likelihood
+        self.localizer = localizer
+        self.window_size = learning_sample_size
+
+        self.training_data: List[np.float64] = []
+        self.data_history: List[np.float64] = []
+        self.current_time = 0
+
+        self.is_training: bool = True
+        self.run_length_probs: np.ndarray = np.array([])
+
+        self.__was_change_point = False
+        self.__change_point: int | None = None
+
+    def clear(self) -> None:
+        """
+        Clears the state of the algorithm's instance.
+        :return:
+        """
+        self.training_data = []
+        self.data_history = []
+        self.current_time = 0
+
+        self.is_training = True
+        self.run_length_probs = np.array([])
+
+        self.__was_change_point = False
+        self.__change_point = None
+
+    def __train(self, value: np.float64) -> None:
+        self.training_data.append(value)
+        if len(self.training_data) == self.window_size:
+            self.likelihood.clear()
+            self.detector.clear()
+
+            self.likelihood.learn(self.training_data)
+            self.is_training = False
+            self.run_length_probs = np.array([1.0])
+
+    def __bayesian_update(self, value: np.float64):
+        predictive_prob = self.likelihood.predict(value)
+        current_run_lengths = np.arange(len(self.run_length_probs))
+        hazards = self.hazard.hazard(current_run_lengths)
+        growth_probs = self.run_length_probs * (1 - hazards) * predictive_prob
+        reset_prob = np.sum(self.run_length_probs * hazards * predictive_prob)
+        new_probs = np.append(reset_prob, growth_probs)
+        new_probs /= np.sum(new_probs)
+        self.run_length_probs = new_probs
+        self.likelihood.update(value)
+
+    def __handle_localization(self) -> None:
+        run_length = self.localizer.localize(self.run_length_probs)
+        change_point_location = self.current_time - run_length
+        self.training_data = self.data_history[-run_length:]
+        self.data_history = self.data_history[-run_length:]
+        self.__change_point = change_point_location
+
+        self.likelihood.clear()
+        self.detector.clear()
+        self.is_training = True
+
+        if len(self.training_data) >= self.window_size:
+            self.training_data = self.training_data[:self.window_size]
+            self.likelihood.learn(self.training_data)
+            self.is_training = False
+            self.run_length_probs = np.array([1.0])
+
+        for value in self.data_history[self.window_size + 1:]:
+            self.__bayesian_update(value)
+
+    def __handle_detection(self) -> None:
+        self.data_history = self.data_history[-1:]
+        self.training_data = self.data_history[:]
+        self.likelihood.clear()
+        self.detector.clear()
+        self.is_training = True
+
+    def __process_point(self, value: np.float64, with_localization: bool) -> None:
+        """
+        Universal method for processing of another value of a time series.
+        :param value: new value of a time series.
+        :param with_localization: whether the method was called for localization of a change point.
+        :return:
+        """
+        self.data_history.append(value)
+        self.current_time += 1
+
+        if self.is_training:
+            self.__train(value)
+        else:
+            self.__bayesian_update(value)
+            detected = self.detector.detect(self.run_length_probs)
+
+            if not detected:
+                return
+
+            self.__was_change_point = True
+            if with_localization:
+                self.__handle_localization()
+            else:
+                self.__handle_detection()
+
+    def detect(self, value: np.float64 | npt.NDArray[np.float64]) -> bool:
+        """
+        Performs a change point detection after processing another value of a time series.
+        :param value: new value of a time series. Note: multivariate time series aren't supported for now.
+        :return: whether a change point was detected after processing the new value.
+        """
+        if isinstance(value, np.ndarray):
+            raise TypeError("Multivariate values are not supported")
+
+        self.__process_point(np.float64(value), False)
+        result = self.__was_change_point
+        self.__was_change_point = False
+        return result
+
+    def localize(self, value: np.float64 | npt.NDArray[np.float64]):
+        """
+        Performs a change point localization after processing another value of a time series.
+        :param value: new value of a time series.
+        :return: location of a change point, acquired after processing the new value, or None if there wasn't any.
+        """
+        if isinstance(value, np.ndarray):
+            raise TypeError("Multivariate values are not supported")
+
+        self.__process_point(np.float64(value), True)
+        result = self.__change_point
+        self.__was_change_point = False
+        self.__change_point = None
+        return result
diff --git a/pysatl_cpd/core/algorithms/online_algorithm.py b/pysatl_cpd/core/algorithms/online_algorithm.py
new file mode 100644
index 00000000..2731292a
--- /dev/null
+++ b/pysatl_cpd/core/algorithms/online_algorithm.py
@@ -0,0 +1,33 @@
+"""
+Module for online change point detection algorithm's interface.
+"""
+
+__author__ = "Alexey Tatyanenko"
+__copyright__ = "Copyright (c) 2025 Alexey Tatyanenko"
+__license__ = "SPDX-License-Identifier: MIT"
+
+from typing import Optional, Protocol
+
+import numpy as np
+import numpy.typing as npt
+
+
+class OnlineCpdAlgorithm(Protocol):
+    """
+    Class for online change point detection algorithm's interface.
+    """
+    def detect(self, value: np.float64 | npt.NDArray[np.float64]) -> bool:
+        """
+        Method for detection of a change point.
+        :param value: new value of a time series.
+        :return: bool value whether a change point was detected after processing the new value.
+        """
+        ...
+
+    def localize(self, value: np.float64 | npt.NDArray[np.float64]) -> Optional[int]:
+        """
+        Method for localization of a change point.
+        :param value: new value of a time series
+        :return: location of a change point, acquired after processing the new value, or None if there wasn't any.
+        """
+        ...
diff --git a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py
new file mode 100644
index 00000000..ed7549e7
--- /dev/null
+++ b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py
@@ -0,0 +1,150 @@
+import numpy as np
+import pytest
+
+from pysatl_cpd.core.algorithms.bayesian.detectors.simple import SimpleDetector
+from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard
+from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_unknown_mean_and_variance import (
+    GaussianUnknownMeanAndVariance,
+)
+from pysatl_cpd.core.algorithms.bayesian.localizers.simple import SimpleLocalizer
+from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnlineCpd
+
+
+def set_seed():
+    np.random.seed(1)
+
+
+def construct_bayesian_online_algorithm():
+    return BayesianOnlineCpd(
+        learning_sample_size=5,
+        likelihood=GaussianUnknownMeanAndVariance(),
+        hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))),
+        detector=SimpleDetector(threshold=0.04),
+        localizer=SimpleLocalizer(),
+    )
+
+
+@pytest.fixture(scope="function")
+def data_params():
+    return {
+        "num_of_tests": 10,
+        "size": 500,
+        "change_point": 250,
+        "tolerable_deviation": 25,
+    }
+
+
+@pytest.fixture
+def generate_data(data_params):
+    def _generate_data():
+        set_seed()
+        return np.concatenate(
+            [
+                np.random.normal(loc=0, scale=1, size=data_params["change_point"]),
+                np.random.normal(loc=5, scale=2, size=data_params["size"] - data_params["change_point"]),
+            ]
+        )
+
+    return _generate_data
+
+
+@pytest.fixture(scope="function")
+def outer_bayesian_algorithm():
+    return construct_bayesian_online_algorithm()
+
+
+@pytest.fixture
+def inner_algorithm_factory():
+    def _factory():
+        return construct_bayesian_online_algorithm()
+
+    return _factory
+
+
+def test_consecutive_detection(outer_bayesian_algorithm, generate_data, data_params):
+    for _ in range(data_params["num_of_tests"]):
+        data = generate_data()
+        was_change_point = False
+        for value in data:
+            result = outer_bayesian_algorithm.detect(value)
+            if result:
+                was_change_point = True
+
+        assert was_change_point, "There was undetected change point in data"
+
+
+def test_correctness_of_consecutive_detection(
+        outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params
+):
+    for _ in range(data_params["num_of_tests"]):
+        data = generate_data()
+        inner_algorithm = inner_algorithm_factory()
+        outer_result = []
+        inner_result = []
+
+        for value in data:
+            outer_result.append(outer_bayesian_algorithm.detect(value))
+            inner_result.append(inner_algorithm.detect(value))
+
+        outer_bayesian_algorithm.clear()
+        assert outer_result == inner_result, "Consecutive and independent detection should give same results"
+
+
+def test_consecutive_localization(outer_bayesian_algorithm, generate_data, data_params):
+    for _ in range(data_params["num_of_tests"]):
+        data = generate_data()
+        was_change_point = False
+
+        for value in data:
+            result = outer_bayesian_algorithm.localize(value)
+            if result is None:
+                continue
+
+            was_change_point = True
+
+            assert (
+                data_params["change_point"] - data_params["tolerable_deviation"]
+                <= result
+                <= data_params["change_point"] + data_params["tolerable_deviation"]
+            ), "Incorrect change point localization"
+
+        outer_bayesian_algorithm.clear()
+        assert was_change_point, "Actual change point was not detected at all"
+
+
+def test_correctness_of_consecutive_localization(
+        outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params
+):
+    for _ in range(data_params["num_of_tests"]):
+        data = generate_data()
+        inner_algorithm = inner_algorithm_factory()
+
+        for value in data:
+            outer_result = outer_bayesian_algorithm.localize(value)
+            inner_result = inner_algorithm.localize(value)
+            assert outer_result == inner_result, "Consecutive and independent localization should give same results"
+
+        outer_bayesian_algorithm.clear()
+
+def test_online_detection_correctness(
+    inner_algorithm_factory, generate_data, data_params
+):
+    for _ in range(data_params["num_of_tests"]):
+        algorithm = inner_algorithm_factory()
+        data = generate_data()
+        for time, value in np.ndenumerate(data):
+            result = algorithm.detect(value)
+            if result:
+                assert data_params["change_point"] <= time[0], "Change point cannot be detected beforehand"
+
+def test_online_localization_correctness(
+    inner_algorithm_factory, generate_data, data_params
+):
+    for _ in range(data_params["num_of_tests"]):
+        algorithm = inner_algorithm_factory()
+        data = generate_data()
+        for time, value in np.ndenumerate(data):
+            result = algorithm.localize(value)
+            if result:
+                assert result <= time[0], "Change point cannot be in future"
+                assert data_params["change_point"] <= time[0], "Change point cannot be detected beforehand"
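
The heart of BayesianOnlineCpd is the run-length recursion in __bayesian_update:
the current run-length distribution is weighted by the predictive probability of
the new value, split into a "growth" part (no change, factor 1 - hazard) and a
"reset" part (change, factor hazard), and renormalized. A standalone sketch of
one such step (all numbers are illustrative, not taken from the patch):

    import numpy as np

    run_length_probs = np.array([0.7, 0.2, 0.1])    # P(run length = 0, 1, 2)
    predictive_prob = np.array([0.30, 0.25, 0.20])  # predictive density of the new value
    hazards = np.full(3, 0.01)                      # constant hazard per run length

    growth_probs = run_length_probs * (1 - hazards) * predictive_prob
    reset_prob = np.sum(run_length_probs * hazards * predictive_prob)
    new_probs = np.append(reset_prob, growth_probs)
    new_probs /= np.sum(new_probs)                  # P(run length = 0, 1, 2, 3)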
From b331aa687d276f01d7593bcb54da868a3246eac3 Mon Sep 17 00:00:00 2001
From: Alexey
Date: Tue, 18 Mar 2025 03:08:59 +0300
Subject: [PATCH 03/31] fix(algorithms): typing fix from List to list.

---
 .../algorithms/bayesian_online_algorithm.py   | 22 +++++++++----------
 .../core/algorithms/online_algorithm.py       |  1 +
 .../test_bayesian_online_algorithm.py         | 20 ++++++++---------
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
index 3afa9890..3030e2ff 100644
--- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
+++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
@@ -6,8 +6,6 @@
 __copyright__ = "Copyright (c) 2025 Alexey Tatyanenko"
 __license__ = "SPDX-License-Identifier: MIT"
 
-from typing import List
-
 import numpy as np
 import numpy.typing as npt
 
@@ -24,12 +22,12 @@ class BayesianOnlineCpd(OnlineCpdAlgorithm):
     """
 
     def __init__(
-            self,
-            hazard: IHazard,
-            likelihood: ILikelihood,
-            learning_sample_size: int,
-            detector: IDetector,
-            localizer: ILocalizer,
+        self,
+        hazard: IHazard,
+        likelihood: ILikelihood,
+        learning_sample_size: int,
+        detector: IDetector,
+        localizer: ILocalizer,
     ) -> None:
         self.detector = detector
         self.hazard = hazard
@@ -37,8 +35,8 @@ def __init__(
         self.localizer = localizer
         self.window_size = learning_sample_size
 
-        self.training_data: List[np.float64] = []
-        self.data_history: List[np.float64] = []
+        self.training_data: list[np.float64] = []
+        self.data_history: list[np.float64] = []
         self.current_time = 0
 
         self.is_training: bool = True
@@ -95,12 +93,12 @@ def __handle_localization(self) -> None:
         self.likelihood.clear()
         self.detector.clear()
         self.is_training = True
 
         if len(self.training_data) >= self.window_size:
-            self.training_data = self.training_data[:self.window_size]
+            self.training_data = self.training_data[: self.window_size]
             self.likelihood.learn(self.training_data)
             self.is_training = False
             self.run_length_probs = np.array([1.0])
 
-        for value in self.data_history[self.window_size + 1:]:
+        for value in self.data_history[self.window_size + 1 :]:
             self.__bayesian_update(value)
 
     def __handle_detection(self) -> None:
diff --git a/pysatl_cpd/core/algorithms/online_algorithm.py b/pysatl_cpd/core/algorithms/online_algorithm.py
index 2731292a..80a483f1 100644
--- a/pysatl_cpd/core/algorithms/online_algorithm.py
+++ b/pysatl_cpd/core/algorithms/online_algorithm.py
@@ -16,6 +16,7 @@ class OnlineCpdAlgorithm(Protocol):
     """
     Class for online change point detection algorithm's interface.
     """
+
     def detect(self, value: np.float64 | npt.NDArray[np.float64]) -> bool:
         """
         Method for detection of a change point.
diff --git a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py
index ed7549e7..0f17c817 100644
--- a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py
+++ b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py
@@ -74,7 +74,7 @@ def test_consecutive_detection(outer_bayesian_algorithm, generate_data, data_par
 
 
 def test_correctness_of_consecutive_detection(
-        outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params
+    outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params
 ):
     for _ in range(data_params["num_of_tests"]):
         data = generate_data()
@@ -103,9 +103,9 @@ def test_consecutive_localization(outer_bayesian_algorithm, generate_data, data_
             was_change_point = True
 
             assert (
-                data_params["change_point"] - data_params["tolerable_deviation"]
-                <= result
-                <= data_params["change_point"] + data_params["tolerable_deviation"]
+                    data_params["change_point"] - data_params["tolerable_deviation"]
+                    <= result
+                    <= data_params["change_point"] + data_params["tolerable_deviation"]
             ), "Incorrect change point localization"
 
         outer_bayesian_algorithm.clear()
@@ -113,7 +113,7 @@ def test_consecutive_localization(outer_bayesian_algorithm, generate_data, data_
 
 def test_correctness_of_consecutive_localization(
-        outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params
+    outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params
 ):
     for _ in range(data_params["num_of_tests"]):
         data = generate_data()
@@ -126,9 +126,8 @@ def test_correctness_of_consecutive_localization(
 
         outer_bayesian_algorithm.clear()
 
-def test_online_detection_correctness(
-    inner_algorithm_factory, generate_data, data_params
-):
+
+def test_online_detection_correctness(inner_algorithm_factory, generate_data, data_params):
     for _ in range(data_params["num_of_tests"]):
         algorithm = inner_algorithm_factory()
         data = generate_data()
@@ -137,9 +136,8 @@ def test_online_detection_correctness(
             if result:
                 assert data_params["change_point"] <= time[0], "Change point cannot be detected beforehand"
 
-def test_online_localization_correctness(
-    inner_algorithm_factory, generate_data, data_params
-):
+
+def test_online_localization_correctness(inner_algorithm_factory, generate_data, data_params):
     for _ in range(data_params["num_of_tests"]):
         algorithm = inner_algorithm_factory()
         data = generate_data()
From 02ebabab38fd820eef029cee5163c6fb7fa04f6a Mon Sep 17 00:00:00 2001
From: Alexey
Date: Tue, 18 Mar 2025 03:19:06 +0300
Subject: [PATCH 04/31] refactor(algorithms): change of visibility modifiers.

---
 .../algorithms/bayesian_online_algorithm.py   | 102 +++++++++---------
 1 file changed, 51 insertions(+), 51 deletions(-)

diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
index 3030e2ff..a6e01f56 100644
--- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
+++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
@@ -29,18 +29,18 @@ def __init__(
         detector: IDetector,
         localizer: ILocalizer,
     ) -> None:
-        self.detector = detector
-        self.hazard = hazard
-        self.likelihood = likelihood
-        self.localizer = localizer
-        self.window_size = learning_sample_size
+        self.__detector = detector
+        self.__hazard = hazard
+        self.__likelihood = likelihood
+        self.__localizer = localizer
+        self.__window_size = learning_sample_size
 
-        self.training_data: list[np.float64] = []
-        self.data_history: list[np.float64] = []
-        self.current_time = 0
+        self.__training_data: list[np.float64] = []
+        self.__data_history: list[np.float64] = []
+        self.__current_time = 0
 
-        self.is_training: bool = True
-        self.run_length_probs: np.ndarray = np.array([])
+        self.__is_training: bool = True
+        self.__run_length_probs: np.ndarray = np.array([])
 
         self.__was_change_point = False
         self.__change_point: int | None = None
@@ -50,63 +50,63 @@ def clear(self) -> None:
         Clears the state of the algorithm's instance.
         :return:
         """
-        self.training_data = []
-        self.data_history = []
-        self.current_time = 0
+        self.__training_data = []
+        self.__data_history = []
+        self.__current_time = 0
 
-        self.is_training = True
-        self.run_length_probs = np.array([])
+        self.__is_training = True
+        self.__run_length_probs = np.array([])
 
         self.__was_change_point = False
         self.__change_point = None
 
     def __train(self, value: np.float64) -> None:
-        self.training_data.append(value)
-        if len(self.training_data) == self.window_size:
-            self.likelihood.clear()
-            self.detector.clear()
+        self.__training_data.append(value)
+        if len(self.__training_data) == self.__window_size:
+            self.__likelihood.clear()
+            self.__detector.clear()
 
-            self.likelihood.learn(self.training_data)
-            self.is_training = False
-            self.run_length_probs = np.array([1.0])
+            self.__likelihood.learn(self.__training_data)
+            self.__is_training = False
+            self.__run_length_probs = np.array([1.0])
 
     def __bayesian_update(self, value: np.float64):
-        predictive_prob = self.likelihood.predict(value)
-        current_run_lengths = np.arange(len(self.run_length_probs))
-        hazards = self.hazard.hazard(current_run_lengths)
-        growth_probs = self.run_length_probs * (1 - hazards) * predictive_prob
-        reset_prob = np.sum(self.run_length_probs * hazards * predictive_prob)
+        predictive_prob = self.__likelihood.predict(value)
+        current_run_lengths = np.arange(len(self.__run_length_probs))
+        hazards = self.__hazard.hazard(current_run_lengths)
+        growth_probs = self.__run_length_probs * (1 - hazards) * predictive_prob
+        reset_prob = np.sum(self.__run_length_probs * hazards * predictive_prob)
         new_probs = np.append(reset_prob, growth_probs)
         new_probs /= np.sum(new_probs)
-        self.run_length_probs = new_probs
-        self.likelihood.update(value)
+        self.__run_length_probs = new_probs
+        self.__likelihood.update(value)
 
     def __handle_localization(self) -> None:
-        run_length = self.localizer.localize(self.run_length_probs)
-        change_point_location = self.current_time - run_length
-        self.training_data = self.data_history[-run_length:]
-        self.data_history = self.data_history[-run_length:]
+        run_length = self.__localizer.localize(self.__run_length_probs)
+        change_point_location = self.__current_time - run_length
+        self.__training_data = self.__data_history[-run_length:]
+        self.__data_history = self.__data_history[-run_length:]
         self.__change_point = change_point_location
 
-        self.likelihood.clear()
-        self.detector.clear()
-        self.is_training = True
+        self.__likelihood.clear()
+        self.__detector.clear()
+        self.__is_training = True
 
-        if len(self.training_data) >= self.window_size:
-            self.training_data = self.training_data[: self.window_size]
-            self.likelihood.learn(self.training_data)
-            self.is_training = False
-            self.run_length_probs = np.array([1.0])
+        if len(self.__training_data) >= self.__window_size:
+            self.__training_data = self.__training_data[: self.__window_size]
+            self.__likelihood.learn(self.__training_data)
+            self.__is_training = False
+            self.__run_length_probs = np.array([1.0])
 
-        for value in self.data_history[self.window_size + 1 :]:
+        for value in self.__data_history[self.__window_size + 1 :]:
             self.__bayesian_update(value)
 
     def __handle_detection(self) -> None:
-        self.data_history = self.data_history[-1:]
-        self.training_data = self.data_history[:]
-        self.likelihood.clear()
-        self.detector.clear()
-        self.is_training = True
+        self.__data_history = self.__data_history[-1:]
+        self.__training_data = self.__data_history[:]
+        self.__likelihood.clear()
+        self.__detector.clear()
+        self.__is_training = True
 
     def __process_point(self, value: np.float64, with_localization: bool) -> None:
         """
@@ -115,14 +115,14 @@ def __process_point(self, value: np.float64, with_localization: bool) -> None:
         :param with_localization: whether the method was called for localization of a change point.
         :return:
         """
-        self.data_history.append(value)
-        self.current_time += 1
+        self.__data_history.append(value)
+        self.__current_time += 1
 
-        if self.is_training:
+        if self.__is_training:
             self.__train(value)
         else:
             self.__bayesian_update(value)
-            detected = self.detector.detect(self.run_length_probs)
+            detected = self.__detector.detect(self.__run_length_probs)
 
             if not detected:
                 return
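
The double leading underscore used throughout this patch relies on Python's name
mangling, which is what makes these attributes effectively private; a standalone
illustration (not part of the patch):

    class Example:
        def __init__(self) -> None:
            self.__state = 0  # stored on the instance as _Example__state

    e = Example()
    # e.__state           -> AttributeError
    # e._Example__state   -> 0; the mangled name stays reachable, privacy is by convention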
From 7b49a81d9dce6e42f725cf410c31ffc5087d6168 Mon Sep 17 00:00:00 2001
From: Alexey
Date: Tue, 18 Mar 2025 04:55:44 +0300
Subject: [PATCH 05/31] fix(algorithms): change learning in Bayesian online CPD.

---
 .../algorithms/bayesian_online_algorithm.py   | 41 ++++++++++++++-----
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
index a6e01f56..59131b7e 100644
--- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
+++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py
@@ -33,7 +33,7 @@ def __init__(
         self.__hazard = hazard
         self.__likelihood = likelihood
         self.__localizer = localizer
-        self.__window_size = learning_sample_size
+        self.__learning_sample_size = learning_sample_size
 
         self.__training_data: list[np.float64] = []
         self.__data_history: list[np.float64] = []
@@ -60,9 +60,14 @@ def clear(self) -> None:
         self.__was_change_point = False
         self.__change_point = None
 
-    def __train(self, value: np.float64) -> None:
+    def __learn(self, value: np.float64) -> None:
+        """
+        Performs a learning step for a prediction model until the given learning sample size is achieved.
+        :param value: new value of a time series.
+        :return:
+        """
         self.__training_data.append(value)
-        if len(self.__training_data) == self.__window_size:
+        if len(self.__training_data) == self.__learning_sample_size:
             self.__likelihood.clear()
             self.__detector.clear()
 
@@ -70,7 +75,12 @@ def __train(self, value: np.float64) -> None:
             self.__is_training = False
             self.__run_length_probs = np.array([1.0])
 
-    def __bayesian_update(self, value: np.float64):
+    def __bayesian_update(self, value: np.float64) -> None:
+        """
+        Performs a bayesian update of the algorithm's state.
+        :param value: new value of a time series.
+        :return:
+        """
         predictive_prob = self.__likelihood.predict(value)
         current_run_lengths = np.arange(len(self.__run_length_probs))
         hazards = self.__hazard.hazard(current_run_lengths)
@@ -82,6 +92,11 @@ def __bayesian_update(self, value: np.float64):
         self.__likelihood.update(value)
 
     def __handle_localization(self) -> None:
+        """
+        Handles localization of the change point. It includes acquiring location, updating stored data and state of the
+        algorithm, training it if possible and building corresponding run length distribution.
+        :return:
+        """
         run_length = self.__localizer.localize(self.__run_length_probs)
         change_point_location = self.__current_time - run_length
         self.__training_data = self.__data_history[-run_length:]
@@ -92,21 +107,25 @@ def __handle_localization(self) -> None:
         self.__detector.clear()
         self.__is_training = True
 
-        if len(self.__training_data) >= self.__window_size:
-            self.__training_data = self.__training_data[: self.__window_size]
-            self.__likelihood.learn(self.__training_data)
-            self.__is_training = False
-            self.__run_length_probs = np.array([1.0])
+        if len(self.__training_data) >= self.__learning_sample_size:
+            self.__training_data = self.__training_data[: self.__learning_sample_size]
+            for value in self.__training_data:
+                self.__learn(value)
 
-        for value in self.__data_history[self.__window_size + 1 :]:
+        for value in self.__data_history[self.__learning_sample_size :]:
             self.__bayesian_update(value)
 
     def __handle_detection(self) -> None:
+        """
+        Handles detection of the change point. It includes updating stored data and state of the algorithm.
+        :return:
+        """
         self.__data_history = self.__data_history[-1:]
         self.__training_data = self.__data_history[:]
         self.__likelihood.clear()
         self.__detector.clear()
         self.__is_training = True
+        self.__learn(self.__training_data[-1])
 
     def __process_point(self, value: np.float64, with_localization: bool) -> None:
         """
@@ -115,7 +134,7 @@ def __process_point(self, value: np.float64, with_localization: bool) -> None:
         :return:
         """
         self.__data_history.append(value)
         self.__current_time += 1
 
         if self.__is_training:
-            self.__train(value)
+            self.__learn(value)
         else:
             self.__bayesian_update(value)
             detected = self.__detector.detect(self.__run_length_probs)

From 3f0e3ffb3da1d1b512da76f339004284866223f1 Mon Sep 17 00:00:00 2001
From: Alexey
Date: Thu, 27 Mar 2025 00:18:57 +0300
Subject: [PATCH 06/31] refactor(bayesian algorithm): typing and naming changes

---
 ...ian_unknown_mean_and_variance.py => gaussian_conjugate.py} | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename pysatl_cpd/core/algorithms/bayesian/likelihoods/{gaussian_unknown_mean_and_variance.py => gaussian_conjugate.py} (97%)

diff --git a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_unknown_mean_and_variance.py b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py
similarity index 97%
rename from pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_unknown_mean_and_variance.py
rename to pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py
index 76ba3a4f..f2b6409d 100644
--- a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_unknown_mean_and_variance.py
+++ b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py
@@ -89,7 +89,7 @@ def update(self, observation: np.float64) -> None:
         self.__alpha_params = new_alpha_params
         self.__beta_params = new_beta_params
 
-    def predict(self, observation: np.float64) -> npt.ArrayLike:
+    def predict(self, observation: np.float64) -> npt.NDArray[np.float64]:
         """
         Returns predictive probabilities for a given observation based on posterior parameters.
         Predictive distribution is Student's t-distribution with 2 * alpha degrees of freedom.
@@ -110,7 +110,7 @@ def predict(self, observation: np.float64) -> npt.ArrayLike:
             scale=scales,
         )
 
-        return predictive_probabilities
+        return np.array(predictive_probabilities)
 
     def clear(self) -> None:
         """

From 067211fe16110b82e2305b8b3debb4133d3c3cce Mon Sep 17 00:00:00 2001
From: Alexey
Date: Thu, 27 Mar 2025 00:21:16 +0300
Subject: [PATCH 07/31] refactor(bayesian algorithm): typing and naming changes

---
 .../bayesian/abstracts/idetector.py           | 10 ++++----
 .../algorithms/bayesian/abstracts/ihazard.py  |  9 ++++---
 .../bayesian/abstracts/ilikelihood.py         | 14 ++++-------
 .../bayesian/abstracts/ilocalizer.py          |  7 +++---
 .../algorithms/bayesian/hazards/constant.py   |  2 +-
 .../bayesian/likelihoods/gaussian.py          |  8 ++++---
 .../likelihoods/gaussian_conjugate.py         |  2 +-
 .../core/algorithms/bayesian_algorithm.py     | 10 ++++----
 .../algorithms/bayesian_online_algorithm.py   | 11 +++++----
 .../test_bayesian_algorithm.py                | 24 +++++++++----------
 .../test_bayesian_online_algorithm.py         |  6 ++---
 11 files changed, 49 insertions(+), 54 deletions(-)

diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py
index ee9d7453..5e83284d 100644
--- a/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py
+++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py
@@ -7,29 +7,27 @@
 __license__ = "SPDX-License-Identifier: MIT"
 
 
-from abc import ABC, abstractmethod
+from typing import Protocol
 
 import numpy as np
 import numpy.typing as npt
 
 
-class IDetector(ABC):
+class IDetector(Protocol):
     """
     Abstract base class for detectors that detect a change point with given growth probabilities for run lengths.
     """
 
-    @abstractmethod
     def detect(self, growth_probs: npt.NDArray[np.float64]) -> bool:
         """
         Checks whether a changepoint occurred with given growth probabilities at the time.
         :param growth_probs: growth probabilities for run lengths at the time.
         :return: boolean indicating whether a changepoint occurred
         """
-        raise NotImplementedError
+        ...
 
-    @abstractmethod
     def clear(self) -> None:
         """
         Clears the detector's state.
         """
-        raise NotImplementedError
+        ...
diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py
index fc7b8d1f..cabb4807 100644
--- a/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py
+++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py
@@ -7,22 +7,21 @@
 __license__ = "SPDX-License-Identifier: MIT"
 
 
-from abc import ABC, abstractmethod
+from typing import Protocol
 
 import numpy as np
 import numpy.typing as npt
 
 
-class IHazard(ABC):
+class IHazard(Protocol):
     """
     Hazard function abstract base class.
     """
 
-    @abstractmethod
-    def hazard(self, run_lengths: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
+    def hazard(self, run_lengths: npt.NDArray[np.intp]) -> npt.NDArray[np.float64]:
         """
         Calculates the hazard function for given run lengths.
         :param run_lengths: run lengths at the time.
         :return: hazard function's values for given run lengths.
         """
-        raise NotImplementedError
+        ...
diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py
index f7ef8ad6..edcd7e69 100644
--- a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py
+++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py
@@ -7,18 +7,17 @@
 __license__ = "SPDX-License-Identifier: MIT"
 
 
-from abc import ABC, abstractmethod
+from typing import Protocol
 
 import numpy as np
 import numpy.typing as npt
 
 
-class ILikelihood(ABC):
+class ILikelihood(Protocol):
     """
     Likelihood function's abstract base class.
     """
 
-    @abstractmethod
     def learn(self, learning_sample: npt.NDArray[np.float64]) -> None:
         """
         Learns first parameters of a likelihood function on a given sample.
         """
         ...
 
-    @abstractmethod
-    def predict(self, observation: np.float64) -> npt.ArrayLike:
+    def predict(self, observation: np.float64) -> npt.NDArray[np.float64]:
         """
         Returns predictive probabilities for a given observation based on stored parameters.
         :param observation: an observation from a sample.
         """
         ...
 
-    @abstractmethod
     def update(self, observation: np.float64) -> None:
         """
         Updates parameters of a likelihood function according to the given observation.
         :param observation: an observation from a sample.
         """
-        raise NotImplementedError
+        ...
 
-    @abstractmethod
     def clear(self) -> None:
         """
         Clears likelihood function's state.
         """
-        raise NotImplementedError
+        ...
diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py
index 27223c42..7e328710 100644
--- a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py
+++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py
@@ -7,22 +7,21 @@
 __license__ = "SPDX-License-Identifier: MIT"
 
 
-from abc import ABC, abstractmethod
+from typing import Protocol
 
 import numpy as np
 import numpy.typing as npt
 
 
-class ILocalizer(ABC):
+class ILocalizer(Protocol):
     """
     Abstract base class for localizers that localize a change point with given growth probabilities for run lengths.
     """
 
-    @abstractmethod
     def localize(self, growth_probs: npt.NDArray[np.float64]) -> int:
         """
         Localizes a change point with given growth probabilities for run lengths.
         :param growth_probs: growth probabilities for run lengths at the time.
         :return: run length corresponding with a change point.
         """
-        raise NotImplementedError
+        ...
diff --git a/pysatl_cpd/core/algorithms/bayesian/hazards/constant.py b/pysatl_cpd/core/algorithms/bayesian/hazards/constant.py
index f530f2c8..b0728820 100644
--- a/pysatl_cpd/core/algorithms/bayesian/hazards/constant.py
+++ b/pysatl_cpd/core/algorithms/bayesian/hazards/constant.py
@@ -27,7 +27,7 @@ def __init__(self, rate: float):
         self._rate = np.float64(rate)
         assert self._rate >= 1.0, "Hazard rate cannot be less than 1.0"
 
-    def hazard(self, run_lengths: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
+    def hazard(self, run_lengths: npt.NDArray[np.intp]) -> npt.NDArray[np.float64]:
         """
         Calculates the constant hazard function.
         :param run_lengths: run lengths at the time.
diff --git a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py index d4464e14..860c7355 100644 --- a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py +++ b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py @@ -11,11 +11,13 @@ import numpy as np import numpy.typing as npt from scipy import stats +from sklearn.utils import deprecated from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood -class GaussianLikelihood(ILikelihood): +@deprecated("Use GaussianConjugate instead") +class Gaussian(ILikelihood): """ Likelihood for Gaussian (a.k.a. normal) distribution, parametrized by mean and standard deviation. """ @@ -76,13 +78,13 @@ def update(self, observation: np.float64) -> None: self.__update_parameters_lists() - def predict(self, observation: np.float64) -> npt.ArrayLike: + def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: """ Returns predictive probabilities for a given observation based on stored means and standard deviations. :param observation: an observation from a sample. :return: predictive probabilities for a given observation. """ - return stats.norm(self.__means, self.__standard_deviations).pdf(observation) + return np.array(stats.norm(self.__means, self.__standard_deviations).pdf(observation)) def clear(self) -> None: """ diff --git a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py index f2b6409d..e43419b3 100644 --- a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py +++ b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py @@ -15,7 +15,7 @@ from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood -class GaussianUnknownMeanAndVariance(ILikelihood): +class GaussianConjugate(ILikelihood): """ Likelihood for Gaussian (a.k.a. normal) distribution with unknown mean and variance estimated from normal-inverse gamma distribution as a conjugate prior. 
It uses 4 parameters, which priors are estimated from a learning sample and diff --git a/pysatl_cpd/core/algorithms/bayesian_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_algorithm.py index 82ef71b8..69d2c4a5 100644 --- a/pysatl_cpd/core/algorithms/bayesian_algorithm.py +++ b/pysatl_cpd/core/algorithms/bayesian_algorithm.py @@ -17,8 +17,8 @@ from pysatl_cpd.core.algorithms.bayesian.abstracts.ilocalizer import ILocalizer from pysatl_cpd.core.algorithms.bayesian.detectors.simple import SimpleDetector from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard -from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_unknown_mean_and_variance import ( - GaussianUnknownMeanAndVariance, +from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import ( + GaussianConjugate, ) from pysatl_cpd.core.algorithms.bayesian.localizers.simple import SimpleLocalizer @@ -36,7 +36,7 @@ class BayesianAlgorithm(Algorithm): def __init__( self, learning_steps: int = 50, - likelihood: ILikelihood = GaussianUnknownMeanAndVariance(), + likelihood: ILikelihood = GaussianConjugate(), hazard: IHazard = ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), detector: IDetector = SimpleDetector(threshold=0.04), localizer: ILocalizer = SimpleLocalizer(), @@ -122,7 +122,7 @@ def __bayesian_stage(self, sample: npt.NDArray[np.float64]) -> None: while self.__bayesian_condition(sample_size): assert self.__time >= 0 - observation = sample[self.__time] + observation = np.float64(sample[self.__time]) self.__shift_time(1) self.__gap_size += 1 assert self.__gap_size > 0 @@ -180,7 +180,7 @@ def __bayesian_update(self, observation: np.float64) -> None: return # 4. Evaluate the hazard function for the gap. - hazard_val = np.array(self.__hazard.hazard(np.array(range(self.__gap_size)))) + hazard_val = np.array(self.__hazard.hazard(np.arange(self.__gap_size, dtype=np.intp))) # Evaluate the changepoint probability at *this* step (NB: generally it can be found later, with some delay). 
changepoint_prob = np.sum(self.__growth_probs[0 : self.__gap_size] * predictive_probs * hazard_val) diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py index 59131b7e..d7d1bf77 100644 --- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py +++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py @@ -6,6 +6,8 @@ __copyright__ = "Copyright (c) 2025 Alexey Tatyanenko" __license__ = "SPDX-License-Identifier: MIT" +from typing import Optional + import numpy as np import numpy.typing as npt @@ -40,7 +42,7 @@ def __init__( self.__current_time = 0 self.__is_training: bool = True - self.__run_length_probs: np.ndarray = np.array([]) + self.__run_length_probs: npt.NDArray[np.float64] = np.array([]) self.__was_change_point = False self.__change_point: int | None = None @@ -71,7 +73,7 @@ def __learn(self, value: np.float64) -> None: self.__likelihood.clear() self.__detector.clear() - self.__likelihood.learn(self.__training_data) + self.__likelihood.learn(np.array(self.__training_data)) self.__is_training = False self.__run_length_probs = np.array([1.0]) @@ -82,8 +84,7 @@ def __bayesian_update(self, value: np.float64) -> None: :return: """ predictive_prob = self.__likelihood.predict(value) - current_run_lengths = np.arange(len(self.__run_length_probs)) - hazards = self.__hazard.hazard(current_run_lengths) + hazards = self.__hazard.hazard(np.arange(self.__run_length_probs.shape[0], dtype=np.intp)) growth_probs = self.__run_length_probs * (1 - hazards) * predictive_prob reset_prob = np.sum(self.__run_length_probs * hazards * predictive_prob) new_probs = np.append(reset_prob, growth_probs) @@ -166,7 +167,7 @@ def detect(self, value: np.float64 | npt.NDArray[np.float64]) -> bool: self.__was_change_point = False return result - def localize(self, value: np.float64 | npt.NDArray[np.float64]): + def localize(self, value: np.float64 | npt.NDArray[np.float64]) -> Optional[int]: """ Performs a change point localization after processing another value of a time series. :param value: new value of a time series. 
diff --git a/tests/test_core/test_algorithms/test_bayesian_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_algorithm.py
index 0694b8bf..b16598ea 100644
--- a/tests/test_core/test_algorithms/test_bayesian_algorithm.py
+++ b/tests/test_core/test_algorithms/test_bayesian_algorithm.py
@@ -3,8 +3,8 @@
 
 from pysatl_cpd.core.algorithms.bayesian.detectors.simple import SimpleDetector
 from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard
-from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_unknown_mean_and_variance import (
-    GaussianUnknownMeanAndVariance,
+from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import (
+    GaussianConjugate,
 )
 from pysatl_cpd.core.algorithms.bayesian.localizers.simple import SimpleLocalizer
 from pysatl_cpd.core.algorithms.bayesian_algorithm import BayesianAlgorithm
@@ -17,7 +17,7 @@ def set_seed():
 def construct_bayesian_algorithm():
     return BayesianAlgorithm(
         learning_steps=50,
-        likelihood=GaussianUnknownMeanAndVariance(),
+        likelihood=GaussianConjugate(),
         hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))),
         detector=SimpleDetector(threshold=0.04),
         localizer=SimpleLocalizer(),
@@ -105,7 +105,7 @@ def test_correctness_of_consecutive_localization(
 @pytest.mark.parametrize("hazard_rate,max_run_length", [(1.1, 50), (10, 100), (200, 250), (500.325251, 500)])
 def test_constant_hazard_for_constants(hazard_rate, max_run_length):
     constant_hazard = ConstantHazard(hazard_rate)
-    run_lengths = np.arange(max_run_length)
+    run_lengths = np.arange(max_run_length, dtype=np.intp)
     hazard_probs = constant_hazard.hazard(run_lengths)
     assert hazard_probs.shape[0] == max_run_length, (
         f"Expected {max_run_length} probabilities, got {hazard_probs.shape[0]}"
@@ -132,11 +132,11 @@ def test_learning_and_update():
             np.random.normal(loc=5, scale=2, size=size - change_point),
         ]
     )
-    likelihood = GaussianUnknownMeanAndVariance()
-    likelihood.learn(list(data[:learning_steps]))
+    likelihood = GaussianConjugate()
+    likelihood.learn(data[:learning_steps])
 
     for time in range(learning_steps, size):
-        observation = float(data[time])
+        observation = np.float64(data[time])
         pred_probs = np.array(likelihood.predict(observation))
 
         if time == time_after_learning:
@@ -165,15 +165,15 @@ def test_clear():
     size = 51
     data = np.random.normal(loc=0, scale=1, size=size)
 
-    likelihood = GaussianUnknownMeanAndVariance()
+    likelihood = GaussianConjugate()
 
-    likelihood.learn(list(data[:-2]))
-    first = likelihood.predict(float(data[-1]))
+    likelihood.learn(data[:-2])
+    first = likelihood.predict(np.float64(data[-1]))
 
     likelihood.clear()
 
-    likelihood.learn(list(data[:-2]))
-    second = likelihood.predict(float(data[-1]))
+    likelihood.learn(data[:-2])
+    second = likelihood.predict(np.float64(data[-1]))
 
     assert first == second, f"Results differ after clear: {first} vs {second}"
 
diff --git a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py
index 0f17c817..8d145ac3 100644
--- a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py
+++ b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py
@@ -3,8 +3,8 @@
 
 from pysatl_cpd.core.algorithms.bayesian.detectors.simple import SimpleDetector
 from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard
-from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_unknown_mean_and_variance import (
-    GaussianUnknownMeanAndVariance,
+from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import (
+    GaussianConjugate,
 )
 from pysatl_cpd.core.algorithms.bayesian.localizers.simple import SimpleLocalizer
 from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnlineCpd
@@ -17,7 +17,7 @@ def set_seed():
 def construct_bayesian_online_algorithm():
     return BayesianOnlineCpd(
         learning_sample_size=5,
-        likelihood=GaussianUnknownMeanAndVariance(),
+        likelihood=GaussianConjugate(),
         hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))),
         detector=SimpleDetector(threshold=0.04),
         localizer=SimpleLocalizer(),
     )
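
Since the abstracts are now Protocols, a custom component only has to match the
method signatures structurally; it no longer needs to inherit from a base class.
A hypothetical hazard sketch (this class and its formula are illustrative, not
part of the patch):

    import numpy as np
    import numpy.typing as npt


    class DecayingHazard:
        """Hazard decaying with run length; satisfies IHazard structurally."""

        def __init__(self, base_rate: float, decay: float):
            self._base_rate = base_rate
            self._decay = decay

        def hazard(self, run_lengths: npt.NDArray[np.intp]) -> npt.NDArray[np.float64]:
            # Longer runs are treated as progressively less likely to break.
            return np.asarray(
                np.exp(-self._decay * run_lengths) / self._base_rate,
                dtype=np.float64,
            )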
""" diff --git a/pysatl_cpd/core/algorithms/bayesian_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_algorithm.py index 69d2c4a5..19ae3c1c 100644 --- a/pysatl_cpd/core/algorithms/bayesian_algorithm.py +++ b/pysatl_cpd/core/algorithms/bayesian_algorithm.py @@ -15,12 +15,12 @@ from pysatl_cpd.core.algorithms.bayesian.abstracts.ihazard import IHazard from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood from pysatl_cpd.core.algorithms.bayesian.abstracts.ilocalizer import ILocalizer -from pysatl_cpd.core.algorithms.bayesian.detectors.simple import SimpleDetector +from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import ( GaussianConjugate, ) -from pysatl_cpd.core.algorithms.bayesian.localizers.simple import SimpleLocalizer +from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer class BayesianAlgorithm(Algorithm): @@ -38,8 +38,8 @@ def __init__( learning_steps: int = 50, likelihood: ILikelihood = GaussianConjugate(), hazard: IHazard = ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), - detector: IDetector = SimpleDetector(threshold=0.04), - localizer: ILocalizer = SimpleLocalizer(), + detector: IDetector = ThresholdDetector(threshold=0.04), + localizer: ILocalizer = ArgmaxLocalizer(), ): """ Initializes a new instance of Bayesian algorithm module with given customization. diff --git a/tests/test_core/test_algorithms/test_bayesian_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_algorithm.py index b16598ea..43001668 100644 --- a/tests/test_core/test_algorithms/test_bayesian_algorithm.py +++ b/tests/test_core/test_algorithms/test_bayesian_algorithm.py @@ -1,12 +1,12 @@ import numpy as np import pytest -from pysatl_cpd.core.algorithms.bayesian.detectors.simple import SimpleDetector +from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import ( GaussianConjugate, ) -from pysatl_cpd.core.algorithms.bayesian.localizers.simple import SimpleLocalizer +from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer from pysatl_cpd.core.algorithms.bayesian_algorithm import BayesianAlgorithm @@ -19,8 +19,8 @@ def construct_bayesian_algorithm(): learning_steps=50, likelihood=GaussianConjugate(), hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), - detector=SimpleDetector(threshold=0.04), - localizer=SimpleLocalizer(), + detector=ThresholdDetector(threshold=0.04), + localizer=ArgmaxLocalizer(), ) @@ -180,13 +180,13 @@ def test_clear(): def test_detector_detection(): run_lengths = np.full(100, 0.01) - detector = SimpleDetector(threshold=0.04) + detector = ThresholdDetector(threshold=0.04) assert detector.detect(run_lengths), "Change point should be detected" def test_detector_clear(): run_lengths = np.full(100, 0.01) - detector = SimpleDetector(threshold=0.04) + detector = ThresholdDetector(threshold=0.04) result1 = detector.detect(run_lengths) detector.clear() @@ -199,6 +199,6 @@ def test_localizer_localization(): change_point = 5 run_lengths = np.full(11, 0.05) run_lengths[change_point] = 0.5 - localizer = SimpleLocalizer() + localizer = ArgmaxLocalizer() result = localizer.localize(run_lengths) assert result == change_point, f"Expected 
change at {change_point}, got {result}" diff --git a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py index 8d145ac3..067e7c38 100644 --- a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py +++ b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py @@ -1,12 +1,12 @@ import numpy as np import pytest -from pysatl_cpd.core.algorithms.bayesian.detectors.simple import SimpleDetector +from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import ( GaussianConjugate, ) -from pysatl_cpd.core.algorithms.bayesian.localizers.simple import SimpleLocalizer +from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnlineCpd @@ -19,8 +19,8 @@ def construct_bayesian_online_algorithm(): learning_sample_size=5, likelihood=GaussianConjugate(), hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), - detector=SimpleDetector(threshold=0.04), - localizer=SimpleLocalizer(), + detector=ThresholdDetector(threshold=0.04), + localizer=ArgmaxLocalizer(), ) From ef1199080aaaf22c07b2b19eedb29a626caac286 Mon Sep 17 00:00:00 2001 From: Alexey Date: Thu, 27 Mar 2025 00:43:53 +0300 Subject: [PATCH 09/31] docs(bayesian algorithm): license change. --- pysatl_cpd/core/algorithms/bayesian_online_algorithm.py | 2 +- pysatl_cpd/core/algorithms/online_algorithm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py index d7d1bf77..12ae4973 100644 --- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py +++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py @@ -3,7 +3,7 @@ """ __author__ = "Alexey Tatyanenko" -__copyright__ = "Copyright (c) 2025 Alexey Tatyanenko" +__copyright__ = "Copyright (c) 2025 PySATL project" __license__ = "SPDX-License-Identifier: MIT" from typing import Optional diff --git a/pysatl_cpd/core/algorithms/online_algorithm.py b/pysatl_cpd/core/algorithms/online_algorithm.py index 80a483f1..6c014e87 100644 --- a/pysatl_cpd/core/algorithms/online_algorithm.py +++ b/pysatl_cpd/core/algorithms/online_algorithm.py @@ -3,7 +3,7 @@ """ __author__ = "Alexey Tatyanenko" -__copyright__ = "Copyright (c) 2025 Alexey Tatyanenko" +__copyright__ = "Copyright (c) 2025 PySATL project" __license__ = "SPDX-License-Identifier: MIT" from typing import Optional, Protocol From fdace5e9270703b7a806e227cc3928ccc7ed7278 Mon Sep 17 00:00:00 2001 From: Alexey Date: Sun, 30 Mar 2025 20:19:47 +0300 Subject: [PATCH 10/31] feat(algorithms): add online-algorithms related functionality --- .../core/algorithms/abstract_algorithm.py | 2 +- .../bayesian/abstracts/idetector.py | 2 +- .../algorithms/bayesian/abstracts/ihazard.py | 2 +- .../bayesian/abstracts/ilikelihood.py | 2 +- .../bayesian/abstracts/ilocalizer.py | 2 +- .../algorithms/bayesian_online_algorithm.py | 66 ++++---- .../core/algorithms/online_algorithm.py | 20 +-- pysatl_cpd/core/online_cpd_core.py | 38 +++++ pysatl_cpd/cpd_solver.py | 142 +--------------- pysatl_cpd/icpd_solver.py | 158 ++++++++++++++++++ pysatl_cpd/online_cpd_solver.py | 75 +++++++++ .../test_bayesian_online_algorithm.py | 4 +- 12 files changed, 326 insertions(+), 187 deletions(-) 
create mode 100644 pysatl_cpd/core/online_cpd_core.py create mode 100644 pysatl_cpd/icpd_solver.py create mode 100644 pysatl_cpd/online_cpd_solver.py diff --git a/pysatl_cpd/core/algorithms/abstract_algorithm.py b/pysatl_cpd/core/algorithms/abstract_algorithm.py index baa23462..ea5ba18a 100644 --- a/pysatl_cpd/core/algorithms/abstract_algorithm.py +++ b/pysatl_cpd/core/algorithms/abstract_algorithm.py @@ -5,7 +5,7 @@ class Algorithm(Protocol): - """Abstract class for change point detection algorithms""" + """Protocol for change point detection algorithms' interface""" def detect(self, window: npt.NDArray[np.float64]) -> int: """Function for finding change points in window diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py index 5e83284d..537547fd 100644 --- a/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py +++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py @@ -15,7 +15,7 @@ class IDetector(Protocol): """ - Abstract base class for detectors that detect a change point with given growth probabilities for run lengths. + Protocol for detectors that detect a change point with given growth probabilities for run lengths. """ def detect(self, growth_probs: npt.NDArray[np.float64]) -> bool: diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py index cabb4807..876365f7 100644 --- a/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py +++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py @@ -15,7 +15,7 @@ class IHazard(Protocol): """ - Hazard function abstract base class. + Hazard function protocol. """ def hazard(self, run_lengths: npt.NDArray[np.intp]) -> npt.NDArray[np.float64]: diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py index edcd7e69..51d5c416 100644 --- a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py +++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py @@ -15,7 +15,7 @@ class ILikelihood(Protocol): """ - Likelihood function's abstract base class. + Likelihood function's protocol. """ def learn(self, learning_sample: npt.NDArray[np.float64]) -> None: diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py index 7e328710..9ef9cb69 100644 --- a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py +++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py @@ -15,7 +15,7 @@ class ILocalizer(Protocol): """ - Abstract base class for localizers that localize a change point with given growth probabilities for run lengths. + Protocol for localizers that localize a change point with given growth probabilities for run lengths. 
""" def localize(self, growth_probs: npt.NDArray[np.float64]) -> int: diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py index 12ae4973..7be1d808 100644 --- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py +++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py @@ -15,10 +15,10 @@ from pysatl_cpd.core.algorithms.bayesian.abstracts.ihazard import IHazard from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood from pysatl_cpd.core.algorithms.bayesian.abstracts.ilocalizer import ILocalizer -from pysatl_cpd.core.algorithms.online_algorithm import OnlineCpdAlgorithm +from pysatl_cpd.core.algorithms.online_algorithm import OnlineAlgorithm -class BayesianOnlineCpd(OnlineCpdAlgorithm): +class BayesianOnline(OnlineAlgorithm): """ Class for Bayesian online change point detection algorithm. """ @@ -62,13 +62,13 @@ def clear(self) -> None: self.__was_change_point = False self.__change_point = None - def __learn(self, value: np.float64) -> None: + def __learn(self, observation: np.float64) -> None: """ Performs a learning step for a prediction model until the given learning sample size is achieved. - :param value: new value of a time series. + :param observation: new observation of a time series. :return: """ - self.__training_data.append(value) + self.__training_data.append(observation) if len(self.__training_data) == self.__learning_sample_size: self.__likelihood.clear() self.__detector.clear() @@ -77,20 +77,20 @@ def __learn(self, value: np.float64) -> None: self.__is_training = False self.__run_length_probs = np.array([1.0]) - def __bayesian_update(self, value: np.float64) -> None: + def __bayesian_update(self, observation: np.float64) -> None: """ Performs a bayesian update of the algorithm's state. - :param value: new value of a time series. + :param observation: new observation of a time series. :return: """ - predictive_prob = self.__likelihood.predict(value) + predictive_prob = self.__likelihood.predict(observation) hazards = self.__hazard.hazard(np.arange(self.__run_length_probs.shape[0], dtype=np.intp)) growth_probs = self.__run_length_probs * (1 - hazards) * predictive_prob reset_prob = np.sum(self.__run_length_probs * hazards * predictive_prob) new_probs = np.append(reset_prob, growth_probs) new_probs /= np.sum(new_probs) self.__run_length_probs = new_probs - self.__likelihood.update(value) + self.__likelihood.update(observation) def __handle_localization(self) -> None: """ @@ -110,11 +110,11 @@ def __handle_localization(self) -> None: if len(self.__training_data) >= self.__learning_sample_size: self.__training_data = self.__training_data[: self.__learning_sample_size] - for value in self.__training_data: - self.__learn(value) + for observation in self.__training_data: + self.__learn(observation) - for value in self.__data_history[self.__learning_sample_size :]: - self.__bayesian_update(value) + for observation in self.__data_history[self.__learning_sample_size :]: + self.__bayesian_update(observation) def __handle_detection(self) -> None: """ @@ -128,20 +128,20 @@ def __handle_detection(self) -> None: self.__is_training = True self.__learn(self.__training_data[-1]) - def __process_point(self, value: np.float64, with_localization: bool) -> None: + def __process_point(self, observation: np.float64, with_localization: bool) -> None: """ - Universal method for processing of another value of a time series. - :param value: new value of a time series. 
+ Universal method for processing of another observation of a time series. + :param observation: new observation of a time series. :param with_localization: whether the method was called for localization of a change point. :return: """ - self.__data_history.append(value) + self.__data_history.append(observation) self.__current_time += 1 if self.__is_training: - self.__learn(value) + self.__learn(observation) else: - self.__bayesian_update(value) + self.__bayesian_update(observation) detected = self.__detector.detect(self.__run_length_probs) if not detected: @@ -153,30 +153,30 @@ def __process_point(self, value: np.float64, with_localization: bool) -> None: else: self.__handle_detection() - def detect(self, value: np.float64 | npt.NDArray[np.float64]) -> bool: + def detect(self, observation: np.float64 | npt.NDArray[np.float64]) -> bool: """ - Performs a change point detection after processing another value of a time series. - :param value: new value of a time series. Note: multivariate time series aren't supported for now. - :return: whether a change point was detected after processing the new value. + Performs a change point detection after processing another observation of a time series. + :param observation: new observation of a time series. Note: multivariate time series aren't supported for now. + :return: whether a change point was detected after processing the new observation. """ - if value is npt.NDArray[np.float64]: - raise TypeError("Multivariate values are not supported") + if observation is npt.NDArray[np.float64]: + raise TypeError("Multivariate observations are not supported") - self.__process_point(np.float64(value), False) + self.__process_point(np.float64(observation), False) result = self.__was_change_point self.__was_change_point = False return result - def localize(self, value: np.float64 | npt.NDArray[np.float64]) -> Optional[int]: + def localize(self, observation: np.float64 | npt.NDArray[np.float64]) -> Optional[int]: """ - Performs a change point localization after processing another value of a time series. - :param value: new value of a time series. - :return: location of a change point, acquired after processing the new value, or None if there wasn't any. + Performs a change point localization after processing another observation of a time series. + :param observation: new observation of a time series. + :return: location of a change point, acquired after processing the new observation, or None if there wasn't any. """ - if value is npt.NDArray[np.float64]: - raise TypeError("Multivariate values are not supported") + if observation is npt.NDArray[np.float64]: + raise TypeError("Multivariate observations are not supported") - self.__process_point(np.float64(value), True) + self.__process_point(np.float64(observation), True) result = self.__change_point self.__was_change_point = False self.__change_point = None diff --git a/pysatl_cpd/core/algorithms/online_algorithm.py b/pysatl_cpd/core/algorithms/online_algorithm.py index 6c014e87..5161a8ed 100644 --- a/pysatl_cpd/core/algorithms/online_algorithm.py +++ b/pysatl_cpd/core/algorithms/online_algorithm.py @@ -12,23 +12,23 @@ import numpy.typing as npt -class OnlineCpdAlgorithm(Protocol): +class OnlineAlgorithm(Protocol): """ - Class for online change point detection algorithm's interface. + Protocol for online change point detection algorithm's interface. 
""" - def detect(self, value: np.float64 | npt.NDArray[np.float64]) -> bool: + def detect(self, observation: np.float64 | npt.NDArray[np.float64]) -> bool: """ - Method for detection of a change point. - :param value: new value of a time series. - :return: bool value whether a change point was detected after processing the new value. + Method for a step of detection of a change point. + :param observation: new observation of a time series. + :return: bool observation whether a change point was detected after processing the new observation. """ ... - def localize(self, value: np.float64 | npt.NDArray[np.float64]) -> Optional[int]: + def localize(self, observation: np.float64 | npt.NDArray[np.float64]) -> Optional[int]: """ - Method for localization of a change point. - :param value: new value of a time series - :return: location of a change point, acquired after processing the new value, or None if there wasn't any. + Method for a step of localization of a change point. + :param observation: new observation of a time series + :return: location of a change point, acquired after processing the new observation, or None if there wasn't any. """ ... diff --git a/pysatl_cpd/core/online_cpd_core.py b/pysatl_cpd/core/online_cpd_core.py new file mode 100644 index 00000000..ef4963b8 --- /dev/null +++ b/pysatl_cpd/core/online_cpd_core.py @@ -0,0 +1,38 @@ +""" +Module for online-CPD core, which presents access to algorithms as iterators over provdied data. +""" + +__author__ = "Alexey Tatyanenko" +__copyright__ = "Copyright (c) 2025 PySATL project" +__license__ = "SPDX-License-Identifier: MIT" + +from collections.abc import Iterator + +from pysatl_cpd.core.algorithms.online_algorithm import OnlineAlgorithm +from pysatl_cpd.core.scrubber.data_providers import DataProvider + + +class OnlineCpdCore: + """ + Class that presents online CPD-algorithm as detection or localization iterator over the provided data. + """ + + def __init__(self, algorithm: OnlineAlgorithm, data_provider: DataProvider) -> None: + self.algorithm = algorithm + self.data_provider = data_provider + + def detect(self) -> Iterator[bool]: + """ + Iteratively tries to detect a change point in the provided data. + :return: whether a change point after processed observation was detected. + """ + for observation in self.data_provider: + yield self.algorithm.detect(observation) + + def localize(self) -> Iterator[int | None]: + """ + Iteratively tries to localize a change point in the provided data. + :return: change point location, if it was successfully localized, or None, otherwise. + """ + for observation in self.data_provider: + yield self.algorithm.localize(observation) diff --git a/pysatl_cpd/cpd_solver.py b/pysatl_cpd/cpd_solver.py index ec324da4..07acf192 100644 --- a/pysatl_cpd/cpd_solver.py +++ b/pysatl_cpd/cpd_solver.py @@ -1,5 +1,5 @@ """ -Module contains classes for solving change point detection problem and representation its results. +Module contains class for solving change point detection problem. 
""" __author__ = "Aleksei Ivanov, Artem Romanyuk, Vladimir Kutuev" @@ -7,150 +7,17 @@ __license__ = "SPDX-License-Identifier: MIT" import time -from collections.abc import Iterator -from pathlib import Path -import numpy as np -import numpy.typing as npt -from matplotlib import pyplot as plt - -from .analysis.results_analyzer import CpdResultsAnalyzer from .core.algorithms.abstract_algorithm import Algorithm from .core.cpd_core import CpdCore from .core.problem import CpdProblem from .core.scrubber.abstract import Scrubber from .core.scrubber.data_providers import LabeledDataProvider +from .icpd_solver import CpdLocalizationResults, ICpdSolver from .labeled_data import LabeledCpdData -class CpdLocalizationResults: - """Container for results of CPD algorithms""" - - def __init__( - self, - data: Iterator[np.float64] | Iterator[npt.NDArray[np.float64]], - result: list[int], - expected_result: list[int] | None, - time_sec: float, - ) -> None: - """Object constructor - - :param: result: list, containing change points, that were found by CPD algos - :param: expected_result: list, containing expected change points, if it is needed - :param: time_sec: a float number, time of CPD algo execution in fractional seconds - """ - self.data = data - self.result = result - self.expected_result = expected_result - self.time_sec = time_sec - - @property - def result_diff(self) -> list[int]: - """method for calculation symmetrical diff between results and expected results (if its granted) - - :return: symmetrical difference between results and expected results - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected result, thus diff cannot be calculated.") - first, second = set(self.result), set(self.expected_result) - return sorted(list(first.symmetric_difference(second))) - - def __str__(self) -> str: - """method for printing results of CPD algo results in a convenient way - - :return: string with brief CPD algo execution results - """ - cp_results = ";".join(map(str, self.result)) - method_output = f"Located change points: ({cp_results})\n" - if self.expected_result is not None: - expected_cp_results = ";".join(map(str, self.expected_result)) - diff = ";".join(map(str, self.result_diff)) - method_output += f"Expected change point: ({expected_cp_results})\n" - method_output += f"Difference: ({diff})\n" - method_output += f"Computation time (sec): {round(self.time_sec, 2)}" - return method_output - - def count_confusion_matrix(self, window: tuple[int, int] | None = None) -> tuple[int, int, int, int]: - """method for counting confusion matrix for hypothesis of equality of CPD results and expected - results on a window - - :param: window: tuple of two indices (start, stop), determines a window for hypothesis - - :return: tuple of integers (true-positive, true-negative, false-positive, false-negative) - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected result, confusion matrix cannot be calculated") - return CpdResultsAnalyzer.count_confusion_matrix(self.result, self.expected_result, window) - - def count_accuracy(self, window: tuple[int, int] | None = None) -> float: - """method for counting accuracy metric for hypothesis of equality of CPD results and expected - results on a window - - :param: window: tuple of two indices (start, stop), determines a window for hypothesis - - :return: float, accuracy metric - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected 
result, accuracy cannot be calculated") - return CpdResultsAnalyzer.count_accuracy(self.result, self.expected_result, window) - - def count_precision(self, window: tuple[int, int] | None = None) -> float: - """method for counting precision metric for hypothesis of equality of CPD results and expected - results on a window - - :param: window: tuple of two indices (start, stop), determines a window for hypothesis - - :return: float, precision metric - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected result, precision cannot be calculated") - return CpdResultsAnalyzer.count_precision(self.result, self.expected_result, window) - - def count_recall(self, window: tuple[int, int] | None = None) -> float: - """method for counting recall metric for hypothesis of equality of CPD results and expected results on a window - - :param: window: tuple of two indices (start, stop), determines a window for hypothesis - - :return: float, recall metric - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected result, recall cannot be calculated") - return CpdResultsAnalyzer.count_recall(self.result, self.expected_result, window) - - def visualize(self, to_show: bool = True, output_directory: Path | None = None, name: str = "plot") -> None: - """method for building and analyzing graph - - :param to_show: is it necessary to show a graph - :param output_directory: If necessary, the path to the directory to save the graph - :param name: A name for the output image with plot - """ - - data: npt.NDArray[np.float64] = np.array(list(self.data)) - plt.plot(data) - if self.expected_result is None: - plt.vlines(x=self.result, ymin=data.min(), ymax=data.max(), colors="orange", ls="--") - plt.gca().legend(("data", "detected")) - else: - correct, incorrect, undetected = set(), set(), set(self.expected_result) - for point in self.result: - if point in self.expected_result: - correct.add(point) - undetected.remove(point) - elif point not in undetected: - incorrect.add(point) - plt.vlines(x=list(correct), ymin=data.min(), ymax=data.max(), colors="green", ls="--") - plt.vlines(x=list(incorrect), ymin=data.min(), ymax=data.max(), colors="red", ls="--") - plt.vlines(x=list(undetected), ymin=data.min(), ymax=data.max(), colors="orange", ls="--") - plt.gca().legend(("data", "correct detected", "incorrect detected", "undetected")) - if output_directory: - if not output_directory.exists(): - output_directory.mkdir() - plt.savefig(output_directory.joinpath(Path(name))) - if to_show: - plt.show() - - -class CpdSolver: +class CpdSolver(ICpdSolver): """Class, that grants a convenient interface to work with CPD algorithms""" @@ -180,7 +47,8 @@ def __init__( def run(self) -> CpdLocalizationResults | int: """Execute CPD algorithm and return container with its results - :return: CPContainer object, containing algo result CP and expected CP if needed + :return: CpdLocalizationResults object, containing algo result CP and expected CP if needed, + or number of detected change points. """ time_start = time.perf_counter() if not self._scenario.to_localize: diff --git a/pysatl_cpd/icpd_solver.py b/pysatl_cpd/icpd_solver.py new file mode 100644 index 00000000..f90b14a3 --- /dev/null +++ b/pysatl_cpd/icpd_solver.py @@ -0,0 +1,158 @@ +""" +Module contains protocol for solution of change point detection problem and class for representation of its results. 
+""" + +__author__ = "Aleksei Ivanov, Alexey Tatyanenko, Artem Romanyuk, Vladimir Kutuev" +__copyright__ = "Copyright (c) 2025 PySATL project" +__license__ = "SPDX-License-Identifier: MIT" + +from collections.abc import Iterator +from pathlib import Path +from typing import Protocol + +import numpy as np +import numpy.typing as npt +from matplotlib import pyplot as plt + +from pysatl_cpd.analysis.results_analyzer import CpdResultsAnalyzer + + +class CpdLocalizationResults: + """Container for results of CPD algorithms""" + + def __init__( + self, + data: Iterator[np.float64] | Iterator[npt.NDArray[np.float64]], + result: list[int], + expected_result: list[int] | None, + time_sec: float, + ) -> None: + """Object constructor + + :param: result: list, containing change points, that were found by CPD algos + :param: expected_result: list, containing expected change points, if it is needed + :param: time_sec: a float number, time of CPD algo execution in fractional seconds + """ + self.data = data + self.result = result + self.expected_result = expected_result + self.time_sec = time_sec + + @property + def result_diff(self) -> list[int]: + """method for calculation symmetrical diff between results and expected results (if its granted) + + :return: symmetrical difference between results and expected results + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, thus diff cannot be calculated.") + first, second = set(self.result), set(self.expected_result) + return sorted(list(first.symmetric_difference(second))) + + def __str__(self) -> str: + """method for printing results of CPD algo results in a convenient way + + :return: string with brief CPD algo execution results + """ + cp_results = ";".join(map(str, self.result)) + method_output = f"Located change points: ({cp_results})\n" + if self.expected_result is not None: + expected_cp_results = ";".join(map(str, self.expected_result)) + diff = ";".join(map(str, self.result_diff)) + method_output += f"Expected change point: ({expected_cp_results})\n" + method_output += f"Difference: ({diff})\n" + method_output += f"Computation time (sec): {round(self.time_sec, 2)}" + return method_output + + def count_confusion_matrix(self, window: tuple[int, int] | None = None) -> tuple[int, int, int, int]: + """method for counting confusion matrix for hypothesis of equality of CPD results and expected + results on a window + + :param: window: tuple of two indices (start, stop), determines a window for hypothesis + + :return: tuple of integers (true-positive, true-negative, false-positive, false-negative) + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, confusion matrix cannot be calculated") + return CpdResultsAnalyzer.count_confusion_matrix(self.result, self.expected_result, window) + + def count_accuracy(self, window: tuple[int, int] | None = None) -> float: + """method for counting accuracy metric for hypothesis of equality of CPD results and expected + results on a window + + :param: window: tuple of two indices (start, stop), determines a window for hypothesis + + :return: float, accuracy metric + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, accuracy cannot be calculated") + return CpdResultsAnalyzer.count_accuracy(self.result, self.expected_result, window) + + def count_precision(self, window: tuple[int, int] | None = None) -> float: + """method for counting precision metric for 
hypothesis of equality of CPD results and expected + results on a window + + :param: window: tuple of two indices (start, stop), determines a window for hypothesis + + :return: float, precision metric + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, precision cannot be calculated") + return CpdResultsAnalyzer.count_precision(self.result, self.expected_result, window) + + def count_recall(self, window: tuple[int, int] | None = None) -> float: + """method for counting recall metric for hypothesis of equality of CPD results and expected results on a window + + :param: window: tuple of two indices (start, stop), determines a window for hypothesis + + :return: float, recall metric + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, recall cannot be calculated") + return CpdResultsAnalyzer.count_recall(self.result, self.expected_result, window) + + def visualize(self, to_show: bool = True, output_directory: Path | None = None, name: str = "plot") -> None: + """method for building and analyzing graph + + :param to_show: is it necessary to show a graph + :param output_directory: If necessary, the path to the directory to save the graph + :param name: A name for the output image with plot + """ + + data: npt.NDArray[np.float64] = np.array(list(self.data)) + plt.plot(data) + if self.expected_result is None: + plt.vlines(x=self.result, ymin=data.min(), ymax=data.max(), colors="orange", ls="--") + plt.gca().legend(("data", "detected")) + else: + correct, incorrect, undetected = set(), set(), set(self.expected_result) + for point in self.result: + if point in self.expected_result: + correct.add(point) + undetected.remove(point) + elif point not in undetected: + incorrect.add(point) + plt.vlines(x=list(correct), ymin=data.min(), ymax=data.max(), colors="green", ls="--") + plt.vlines(x=list(incorrect), ymin=data.min(), ymax=data.max(), colors="red", ls="--") + plt.vlines(x=list(undetected), ymin=data.min(), ymax=data.max(), colors="orange", ls="--") + plt.gca().legend(("data", "correct detected", "incorrect detected", "undetected")) + if output_directory: + if not output_directory.exists(): + output_directory.mkdir() + plt.savefig(output_directory.joinpath(Path(name))) + if to_show: + plt.show() + + +class ICpdSolver(Protocol): + """ + Protocol, describing interface of CPD problem's solution. + """ + + def run(self) -> CpdLocalizationResults | int: + """ + Method for evaluation of CPD problem's solution, executing some CPD-algorithm. + :return: CpdLocalizationResults object, containing algo result CP and expected CP if needed, + or number of detected change points. + """ + ... diff --git a/pysatl_cpd/online_cpd_solver.py b/pysatl_cpd/online_cpd_solver.py new file mode 100644 index 00000000..c7fa3eec --- /dev/null +++ b/pysatl_cpd/online_cpd_solver.py @@ -0,0 +1,75 @@ +""" +Module contains class for solving change point detection problem with an online CPD algorithm. 
+""" + +__author__ = "Alexey Tatyanenko" +__copyright__ = "Copyright (c) 2025 PySATL project" +__license__ = "SPDX-License-Identifier: MIT" + +import time + +from pysatl_cpd.core.algorithms.online_algorithm import OnlineAlgorithm +from pysatl_cpd.core.online_cpd_core import OnlineCpdCore +from pysatl_cpd.core.problem import CpdProblem +from pysatl_cpd.core.scrubber.data_providers import DataProvider, LabeledDataProvider +from pysatl_cpd.icpd_solver import CpdLocalizationResults, ICpdSolver +from pysatl_cpd.labeled_data import LabeledCpdData + + +class OnlineCpdSolver(ICpdSolver): + """Class, that grants a convenient interface to + work with online-CPD algorithms""" + + def __init__( + self, + scenario: CpdProblem, + algorithm: OnlineAlgorithm, + algorithm_input: DataProvider | LabeledCpdData, + ) -> None: + """pysatl_cpd object constructor + + :param: scenario: scenario specify + :param: algorithm: online-CPD algorithm, that will search for change points + :param: algorithm_input: data provider or labeled data to construct corresponding data provider. + """ + self._labeled_data: LabeledCpdData | None = None + self._cpd_core: OnlineCpdCore + match algorithm_input: + case DataProvider() as data_provider: + self._cpd_core = OnlineCpdCore( + data_provider=data_provider, + algorithm=algorithm, + ) + case LabeledCpdData() as data: + self._labeled_data = data + self._cpd_core = OnlineCpdCore( + data_provider=LabeledDataProvider(data), + algorithm=algorithm, + ) + + self._scenario = scenario + + def run(self) -> CpdLocalizationResults | int: + """Execute online-CPD algorithm and return container with its results + + :return: CpdLocalizationResults object, containing algo result CP and expected CP if needed + """ + time_start = time.perf_counter() + if not self._scenario.to_localize: + change_points_count = 0 + for was_changepoint in self._cpd_core.detect(): + if was_changepoint: + change_points_count += 1 + return change_points_count + + algo_results = [] + for change_point in self._cpd_core.localize(): + if change_point is not None: + algo_results.append(change_point) + + time_end = time.perf_counter() + expected_change_points: list[int] | None = None + if isinstance(self._labeled_data, LabeledCpdData): + expected_change_points = self._labeled_data.change_points + data = iter(self._cpd_core.data_provider) + return CpdLocalizationResults(data, algo_results, expected_change_points, time_end - time_start) diff --git a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py index 067e7c38..5f3b7dc2 100644 --- a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py +++ b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py @@ -7,7 +7,7 @@ GaussianConjugate, ) from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer -from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnlineCpd +from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnline def set_seed(): @@ -15,7 +15,7 @@ def set_seed(): def construct_bayesian_online_algorithm(): - return BayesianOnlineCpd( + return BayesianOnline( learning_sample_size=5, likelihood=GaussianConjugate(), hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), From b0f752f5e449f20cb6e1688ab2db7f9a691c3e97 Mon Sep 17 00:00:00 2001 From: Alexey Date: Sun, 30 Mar 2025 20:26:19 +0300 Subject: [PATCH 11/31] fix(tests): fix import in cpd_solver tests --- tests/test_solver.py | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/tests/test_solver.py b/tests/test_solver.py index ef6d70f4..c2502e98 100644 --- a/tests/test_solver.py +++ b/tests/test_solver.py @@ -5,11 +5,12 @@ import numpy as np import pytest +from pysatl_cpd.analysis.results_analyzer import CpdResultsAnalyzer from pysatl_cpd.core.algorithms.graph_algorithm import GraphAlgorithm from pysatl_cpd.core.problem import CpdProblem from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider from pysatl_cpd.core.scrubber.linear import LinearScrubber -from pysatl_cpd.cpd_solver import CpdLocalizationResults, CpdResultsAnalyzer, CpdSolver, LabeledCpdData +from pysatl_cpd.cpd_solver import CpdLocalizationResults, CpdSolver, LabeledCpdData def custom_comparison(node1, node2): # TODO: Remove it everywhere From f7e47fb48c3968c058a7aa242fe953cc58299a7b Mon Sep 17 00:00:00 2001 From: Alexey Date: Tue, 1 Apr 2025 02:04:18 +0300 Subject: [PATCH 12/31] test(online-cpd): add tests to online-algorithms related functionality --- pysatl_cpd/core/scrubber/data_providers.py | 3 +- pysatl_cpd/online_cpd_solver.py | 10 +- .../test_bayesian_online_algorithm.py | 162 +++++++++--------- tests/test_core/test_online_cpd_core.py | 60 +++++++ tests/test_online_solver.py | 117 +++++++++++++ 5 files changed, 263 insertions(+), 89 deletions(-) create mode 100644 tests/test_core/test_online_cpd_core.py create mode 100644 tests/test_online_solver.py diff --git a/pysatl_cpd/core/scrubber/data_providers.py b/pysatl_cpd/core/scrubber/data_providers.py index 8f5d9cfc..9b12b25a 100644 --- a/pysatl_cpd/core/scrubber/data_providers.py +++ b/pysatl_cpd/core/scrubber/data_providers.py @@ -7,7 +7,7 @@ __license__ = "SPDX-License-Identifier: MIT" from collections.abc import Iterator -from typing import Protocol +from typing import Protocol, runtime_checkable import numpy as np import numpy.typing as npt @@ -15,6 +15,7 @@ from pysatl_cpd.labeled_data import LabeledCpdData +@runtime_checkable class DataProvider(Protocol): """Interface for abstracting the scrubber from the data source and its format""" diff --git a/pysatl_cpd/online_cpd_solver.py b/pysatl_cpd/online_cpd_solver.py index c7fa3eec..a49949e5 100644 --- a/pysatl_cpd/online_cpd_solver.py +++ b/pysatl_cpd/online_cpd_solver.py @@ -35,17 +35,17 @@ def __init__( self._labeled_data: LabeledCpdData | None = None self._cpd_core: OnlineCpdCore match algorithm_input: - case DataProvider() as data_provider: - self._cpd_core = OnlineCpdCore( - data_provider=data_provider, - algorithm=algorithm, - ) case LabeledCpdData() as data: self._labeled_data = data self._cpd_core = OnlineCpdCore( data_provider=LabeledDataProvider(data), algorithm=algorithm, ) + case DataProvider() as data_provider: + self._cpd_core = OnlineCpdCore( + data_provider=data_provider, + algorithm=algorithm, + ) self._scenario = scenario diff --git a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py index 5f3b7dc2..07e0d973 100644 --- a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py +++ b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py @@ -61,88 +61,84 @@ def _factory(): return _factory -def test_consecutive_detection(outer_bayesian_algorithm, generate_data, data_params): - for _ in range(data_params["num_of_tests"]): - data = generate_data() - was_change_point = False - for value in data: - result = outer_bayesian_algorithm.detect(value) - if result: - was_change_point = True - - assert was_change_point, "There 
was undetected change point in data" - - -def test_correctness_of_consecutive_detection( - outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params -): - for _ in range(data_params["num_of_tests"]): - data = generate_data() - inner_algorithm = inner_algorithm_factory() - outer_result = [] - inner_result = [] - - for value in data: - outer_result.append(outer_bayesian_algorithm.detect(value)) - inner_result.append(inner_algorithm.detect(value)) - - outer_bayesian_algorithm.clear() - assert outer_result == inner_result, "Consecutive and independent detection should give same results" - - -def test_consecutive_localization(outer_bayesian_algorithm, generate_data, data_params): - for _ in range(data_params["num_of_tests"]): - data = generate_data() - was_change_point = False - - for value in data: - result = outer_bayesian_algorithm.localize(value) - if result is None: - continue - - was_change_point = True - - assert ( - data_params["change_point"] - data_params["tolerable_deviation"] - <= result - <= data_params["change_point"] + data_params["tolerable_deviation"] - ), "Incorrect change point localization" - - outer_bayesian_algorithm.clear() - assert was_change_point, "Actual change point was not detected at all" - - -def test_correctness_of_consecutive_localization( - outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params -): - for _ in range(data_params["num_of_tests"]): - data = generate_data() - inner_algorithm = inner_algorithm_factory() - - for value in data: - outer_result = outer_bayesian_algorithm.localize(value) - inner_result = inner_algorithm.localize(value) - assert outer_result == inner_result, "Consecutive and independent localization should give same results" - - outer_bayesian_algorithm.clear() - - -def test_online_detection_correctness(inner_algorithm_factory, generate_data, data_params): - for _ in range(data_params["num_of_tests"]): - algorithm = inner_algorithm_factory() - data = generate_data() - for time, value in np.ndenumerate(data): - result = algorithm.detect(value) - if result: - assert data_params["change_point"] <= time[0], "Change point cannot be detected beforehand" +class TestBayesianOnlineAlgorithm: + def test_consecutive_detection(self, outer_bayesian_algorithm, generate_data, data_params): + for _ in range(data_params["num_of_tests"]): + data = generate_data() + was_change_point = False + for value in data: + result = outer_bayesian_algorithm.detect(value) + if result: + was_change_point = True + + assert was_change_point, "There was undetected change point in data" + + def test_correctness_of_consecutive_detection( + self, outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params + ): + for _ in range(data_params["num_of_tests"]): + data = generate_data() + inner_algorithm = inner_algorithm_factory() + outer_result = [] + inner_result = [] + + for value in data: + outer_result.append(outer_bayesian_algorithm.detect(value)) + inner_result.append(inner_algorithm.detect(value)) + + outer_bayesian_algorithm.clear() + assert outer_result == inner_result, "Consecutive and independent detection should give same results" + + def test_consecutive_localization(self, outer_bayesian_algorithm, generate_data, data_params): + for _ in range(data_params["num_of_tests"]): + data = generate_data() + was_change_point = False + + for value in data: + result = outer_bayesian_algorithm.localize(value) + if result is None: + continue + was_change_point = True -def 
test_online_localization_correctness(inner_algorithm_factory, generate_data, data_params): - for _ in range(data_params["num_of_tests"]): - algorithm = inner_algorithm_factory() - data = generate_data() - for time, value in np.ndenumerate(data): - result = algorithm.localize(value) - if result: - assert result <= time[0], "Change point cannot be in future" - assert data_params["change_point"] <= time[0], "Change point cannot be detected beforehand" + assert ( + data_params["change_point"] - data_params["tolerable_deviation"] + <= result + <= data_params["change_point"] + data_params["tolerable_deviation"] + ), "Incorrect change point localization" + + outer_bayesian_algorithm.clear() + assert was_change_point, "Actual change point was not detected at all" + + def test_correctness_of_consecutive_localization( + self, outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params + ): + for _ in range(data_params["num_of_tests"]): + data = generate_data() + inner_algorithm = inner_algorithm_factory() + + for value in data: + outer_result = outer_bayesian_algorithm.localize(value) + inner_result = inner_algorithm.localize(value) + assert outer_result == inner_result, "Consecutive and independent localization should give same results" + + outer_bayesian_algorithm.clear() + + def test_online_detection_correctness(self, inner_algorithm_factory, generate_data, data_params): + for _ in range(data_params["num_of_tests"]): + algorithm = inner_algorithm_factory() + data = generate_data() + for time, value in np.ndenumerate(data): + result = algorithm.detect(value) + if result: + assert data_params["change_point"] <= time[0], "Change point cannot be detected beforehand" + + def test_online_localization_correctness(self, inner_algorithm_factory, generate_data, data_params): + for _ in range(data_params["num_of_tests"]): + algorithm = inner_algorithm_factory() + data = generate_data() + for time, value in np.ndenumerate(data): + result = algorithm.localize(value) + if result: + assert result <= time[0], "Change point cannot be in future" + assert data_params["change_point"] <= time[0], "Change point cannot be detected beforehand" diff --git a/tests/test_core/test_online_cpd_core.py b/tests/test_core/test_online_cpd_core.py new file mode 100644 index 00000000..4be79521 --- /dev/null +++ b/tests/test_core/test_online_cpd_core.py @@ -0,0 +1,60 @@ +import numpy as np +import pytest + +from pysatl_cpd.core.online_cpd_core import OnlineCpdCore +from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider +from tests.test_core.test_algorithms.test_bayesian_online_algorithm import construct_bayesian_online_algorithm + +DATA_PARAMS = { + "num_of_tests": 10, + "size": 500, + "change_point": 250, + "tolerable_deviation": 25, +} + + +@pytest.fixture(scope="session") +def data_params(): + return DATA_PARAMS + + +@pytest.fixture +def algorithm(): + return construct_bayesian_online_algorithm() + + +@pytest.fixture(params=[True, False], ids=["with_cp", "without_cp"]) +def dataset(request, data_params): + np.random.seed(42 + request.param_index) + if request.param: # With change point + return np.concatenate( + [ + np.random.normal(0, 1, data_params["change_point"]), + np.random.normal(5, 2, data_params["size"] - data_params["change_point"]), + ] + ) + return np.random.normal(0, 1, data_params["size"]) # Without change point + + +@pytest.fixture +def online_core(dataset): + return OnlineCpdCore( + algorithm=construct_bayesian_online_algorithm(), 
data_provider=ListUnivariateProvider(list(dataset)) + ) + + +class TestOnlineCpdCore: + @pytest.mark.parametrize("test_iteration", range(DATA_PARAMS["num_of_tests"])) + @pytest.mark.parametrize("mode", ["detect", "localize"]) + def test_core_functionality(self, algorithm, online_core, dataset, data_params, mode, test_iteration): + core_iterator = getattr(online_core, mode)() + algo_method = getattr(algorithm, mode) + + for time_point in range(data_params["size"]): + observation = dataset[time_point] + algo_result = algo_method(observation) + core_result = next(core_iterator) + + assert algo_result == core_result, ( + f"Different results at {time_point} between manual {mode} and core {mode} iteration" + ) diff --git a/tests/test_online_solver.py b/tests/test_online_solver.py new file mode 100644 index 00000000..8b87c0e1 --- /dev/null +++ b/tests/test_online_solver.py @@ -0,0 +1,117 @@ +import numpy as np +import pytest + +from pysatl_cpd.core.problem import CpdProblem +from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider +from pysatl_cpd.icpd_solver import CpdLocalizationResults +from pysatl_cpd.labeled_data import LabeledCpdData +from pysatl_cpd.online_cpd_solver import OnlineCpdSolver +from tests.test_core.test_algorithms.test_bayesian_online_algorithm import construct_bayesian_online_algorithm + +DATA_PARAMS = { + "num_tests": 10, + "size": 500, + "change_point": 250, + "tolerable_deviation": 25, +} + + +@pytest.fixture(scope="session") +def data_params(): + return DATA_PARAMS + + +@pytest.fixture +def data_generator(data_params): + def _generate(has_cp, test_iteration): + seed = 42 + test_iteration + np.random.seed(seed) + if has_cp: + return np.concatenate( + [ + np.random.normal(0, 1, data_params["change_point"]), + np.random.normal(5, 2, data_params["size"] - data_params["change_point"]), + ] + ) + return np.random.normal(0, 1, data_params["size"]) + + return _generate + + +@pytest.fixture +def labeled_data_factory(data_params): + def _factory(data, has_cp): + return LabeledCpdData(raw_data=data, change_points=[data_params["change_point"]] if has_cp else None) + + return _factory + + +@pytest.fixture +def solver_factory(): + def _factory(data_input, with_localization): + return OnlineCpdSolver( + algorithm=construct_bayesian_online_algorithm(), + algorithm_input=data_input, + scenario=CpdProblem(with_localization), + ) + + return _factory + + +def pytest_generate_tests(metafunc): + if "test_iteration" in metafunc.fixturenames: + metafunc.parametrize("test_iteration", range(DATA_PARAMS["num_tests"])) + + +class TestOnlineCpdSolver: + @pytest.mark.parametrize( + "has_cp,with_localization,is_labeled", + [ + (False, True, True), + (True, True, True), + (False, True, False), + (True, True, False), + (False, False, True), + (True, False, True), + (False, False, False), + (True, False, False), + ], + ) + def test_all_scenarios( + self, + data_generator, + labeled_data_factory, + solver_factory, + has_cp, + with_localization, + is_labeled, + test_iteration, + data_params, + ): + raw_data = data_generator(has_cp, test_iteration) + + data_input = labeled_data_factory(raw_data, has_cp) if is_labeled else ListUnivariateProvider(raw_data.tolist()) + + solver = solver_factory(data_input, with_localization) + result = solver.run() + + if with_localization: + assert isinstance(result, CpdLocalizationResults), "Localization result must be CpdLocalizationResults" + if has_cp: + assert len(result.result) == 1, "There must be only one change point" + assert abs(result.result[0] - 
data_params["change_point"]) <= data_params["tolerable_deviation"], ( + "Change point must lie in tolerable interval" + ) + if is_labeled: + assert result.expected_result == [data_params["change_point"]], ( + "Labeled change point must be equal to generated one" + ) + else: + assert result.expected_result is None, "Expected result must be None for not labeled data" + else: + assert result.result == [], "There must be no change points" + else: + assert isinstance(result, int), "Detection result must be a number of detected change points" + assert result == (1 if has_cp else 0), ( + "Number of change points must be equal to expected in the generated data" + ) From f70d1cbec7e9580395310163f2b5af7659bdfe80 Mon Sep 17 00:00:00 2001 From: Alexey Date: Tue, 1 Apr 2025 03:46:20 +0300 Subject: [PATCH 13/31] refactor(bayesian online algorithm): add asserts --- .../algorithms/bayesian_online_algorithm.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py index 7be1d808..9325a57a 100644 --- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py +++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py @@ -77,6 +77,10 @@ def __learn(self, observation: np.float64) -> None: self.__is_training = False self.__run_length_probs = np.array([1.0]) + assert len(self.__training_data) <= self.__learning_sample_size, ( + "Training data should not be longer than learning sample size" + ) + def __bayesian_update(self, observation: np.float64) -> None: """ Performs a bayesian update of the algorithm's state. @@ -85,10 +89,16 @@ def __bayesian_update(self, observation: np.float64) -> None: """ predictive_prob = self.__likelihood.predict(observation) hazards = self.__hazard.hazard(np.arange(self.__run_length_probs.shape[0], dtype=np.intp)) + growth_probs = self.__run_length_probs * (1 - hazards) * predictive_prob - reset_prob = np.sum(self.__run_length_probs * hazards * predictive_prob) - new_probs = np.append(reset_prob, growth_probs) - new_probs /= np.sum(new_probs) + change_point_prob = np.sum(self.__run_length_probs * hazards * predictive_prob) + new_probs = np.append(change_point_prob, growth_probs) + + evidence = np.sum(new_probs) + assert evidence > 0.0, "Evidence must be > 0.0" + new_probs /= evidence + assert np.all(np.logical_and(new_probs >= 0.0, new_probs <= 1.0)) + self.__run_length_probs = new_probs self.__likelihood.update(observation) @@ -100,6 +110,11 @@ def __handle_localization(self) -> None: """ run_length = self.__localizer.localize(self.__run_length_probs) change_point_location = self.__current_time - run_length + assert 0 <= change_point_location <= self.__current_time, ( + "Change point shouldn't be outside the available scope" + ) + + assert len(self.__data_history) >= run_length, "Run length shouldn't exceed available data length" self.__training_data = self.__data_history[-run_length:] self.__data_history = self.__data_history[-run_length:] self.__change_point = change_point_location From 033c0a5f0d7e7e22c88f982dd24b2f0925c228ed Mon Sep 17 00:00:00 2001 From: Alexey Date: Tue, 1 Apr 2025 19:46:17 +0300 Subject: [PATCH 14/31] fix(bayesian online algorithm): fix learning behavior after change points --- .../algorithms/bayesian_online_algorithm.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py 
b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py index 9325a57a..aa4f4307 100644 --- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py +++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py @@ -52,7 +52,7 @@ def clear(self) -> None: Clears the state of the algorithm's instance. :return: """ - self.__training_data = [] + self.__clear_training_data() self.__data_history = [] self.__current_time = 0 @@ -62,6 +62,13 @@ def clear(self) -> None: self.__was_change_point = False self.__change_point = None + def __clear_training_data(self) -> None: + """ + Clears list of training data. + :return: + """ + self.__training_data = [] + def __learn(self, observation: np.float64) -> None: """ Performs a learning step for a prediction model until the given learning sample size is achieved. @@ -115,19 +122,23 @@ def __handle_localization(self) -> None: ) assert len(self.__data_history) >= run_length, "Run length shouldn't exceed available data length" - self.__training_data = self.__data_history[-run_length:] self.__data_history = self.__data_history[-run_length:] + self.__clear_training_data() self.__change_point = change_point_location self.__likelihood.clear() self.__detector.clear() self.__is_training = True - if len(self.__training_data) >= self.__learning_sample_size: - self.__training_data = self.__training_data[: self.__learning_sample_size] - for observation in self.__training_data: - self.__learn(observation) + observations_to_learn = min(len(self.__data_history), self.__learning_sample_size) + data_to_train = self.__data_history[:observations_to_learn] + + # Learning as much as we can until we reach learning sample size limit + for observation in data_to_train: + self.__learn(observation) + # Modeling run length probabilities on the rest data + if len(self.__data_history) >= self.__learning_sample_size: for observation in self.__data_history[self.__learning_sample_size :]: self.__bayesian_update(observation) @@ -137,11 +148,11 @@ def __handle_detection(self) -> None: :return: """ self.__data_history = self.__data_history[-1:] - self.__training_data = self.__data_history[:] + self.__clear_training_data() self.__likelihood.clear() self.__detector.clear() self.__is_training = True - self.__learn(self.__training_data[-1]) + self.__learn(self.__data_history[-1]) def __process_point(self, observation: np.float64, with_localization: bool) -> None: """ From cc05b1612f91a3306be5c6ff6066cb3dcc165fe7 Mon Sep 17 00:00:00 2001 From: Alexey Date: Tue, 1 Apr 2025 19:47:28 +0300 Subject: [PATCH 15/31] docs(readme): update info about bayesian algorithm --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cb081174..b31efb62 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ **Change point detection** module (*abbreviated CPD module*) is a module, designed for detecting anomalies in time series data, which refer to significant deviations from expected patterns or trends. Anomalies can indicate unusual events or changes in a system, making them crucial for monitoring and analysis in various fields such as finance, healthcare, and network security. 
At the moment, the module implements the following CPD algorithms: -* Bayesian algorithm +* Bayesian algorithm (scrubbing and online versions) * Density based algorithms: * KLIEP * RuLSIF From df400b4c327a73e61299ffdc2c3ca82c8245409b Mon Sep 17 00:00:00 2001 From: Alexey Tatyanenko <57017816+alexdtat@users.noreply.github.com> Date: Sat, 5 Apr 2025 20:26:51 +0300 Subject: [PATCH 16/31] refactor(online cpd solver): add suggested cp processing optimization Co-authored-by: Vladimir Kutuev --- pysatl_cpd/online_cpd_solver.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pysatl_cpd/online_cpd_solver.py b/pysatl_cpd/online_cpd_solver.py index a49949e5..86e5c9be 100644 --- a/pysatl_cpd/online_cpd_solver.py +++ b/pysatl_cpd/online_cpd_solver.py @@ -56,16 +56,9 @@ def run(self) -> CpdLocalizationResults | int: """ time_start = time.perf_counter() if not self._scenario.to_localize: - change_points_count = 0 - for was_changepoint in self._cpd_core.detect(): - if was_changepoint: - change_points_count += 1 - return change_points_count + return sum(self._cpd_core.detect()) - algo_results = [] - for change_point in self._cpd_core.localize(): - if change_point is not None: - algo_results.append(change_point) + algo_results = [cp for cp in self._cpd_core.localize() if cp is not None] time_end = time.perf_counter() expected_change_points: list[int] | None = None From d98209319564675e0f13ce4541d68aeb44f56719 Mon Sep 17 00:00:00 2001 From: Alexey Date: Sat, 5 Apr 2025 20:52:51 +0300 Subject: [PATCH 17/31] fix(gaussian likelihood): change deprecated decorator --- pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py index 860c7355..6936c3b0 100644 --- a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py +++ b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py @@ -7,11 +7,10 @@ __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" __license__ = "SPDX-License-Identifier: MIT" - import numpy as np import numpy.typing as npt from scipy import stats -from sklearn.utils import deprecated +from typing_extensions import deprecated from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood From 1145426866521d465089294d328bdf9ad6cd5f3f Mon Sep 17 00:00:00 2001 From: Alexey Date: Sat, 5 Apr 2025 20:58:05 +0300 Subject: [PATCH 18/31] docs(online algorithms): clarify localization method docstring --- pysatl_cpd/core/algorithms/bayesian_online_algorithm.py | 3 ++- pysatl_cpd/core/algorithms/online_algorithm.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py index aa4f4307..242b2d5b 100644 --- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py +++ b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py @@ -197,7 +197,8 @@ def localize(self, observation: np.float64 | npt.NDArray[np.float64]) -> Optiona """ Performs a change point localization after processing another observation of a time series. :param observation: new observation of a time series. - :return: location of a change point, acquired after processing the new observation, or None if there wasn't any. + :return: absolute location of a change point, acquired after processing the new observation, + or None if there wasn't any. 
""" if observation is npt.NDArray[np.float64]: raise TypeError("Multivariate observations are not supported") diff --git a/pysatl_cpd/core/algorithms/online_algorithm.py b/pysatl_cpd/core/algorithms/online_algorithm.py index 5161a8ed..05a29b59 100644 --- a/pysatl_cpd/core/algorithms/online_algorithm.py +++ b/pysatl_cpd/core/algorithms/online_algorithm.py @@ -29,6 +29,7 @@ def localize(self, observation: np.float64 | npt.NDArray[np.float64]) -> Optiona """ Method for a step of localization of a change point. :param observation: new observation of a time series - :return: location of a change point, acquired after processing the new observation, or None if there wasn't any. + :return: absolute location of a change point, acquired after processing the new observation, + or None if there wasn't any. """ ... From 2f19c50e01834367c7f91b8504b02056ce291853 Mon Sep 17 00:00:00 2001 From: Alexey Date: Sat, 5 Apr 2025 21:09:26 +0300 Subject: [PATCH 19/31] chore(gitignore): add ./idea to gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8b932ca9..29277951 100644 --- a/.gitignore +++ b/.gitignore @@ -159,5 +159,5 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ /poetry.lock From 714fe9034504e771a1642ea45b1a19a4090e6dad Mon Sep 17 00:00:00 2001 From: Aleksandra Listkova <115529517+alistkova@users.noreply.github.com> Date: Tue, 8 Apr 2025 13:38:20 +0300 Subject: [PATCH 20/31] fix(kliep): typing fix --- pysatl_cpd/core/algorithms/kliep_algorithm.py | 44 +++++++++++-------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py index beffca43..4b3a1cd0 100644 --- a/pysatl_cpd/core/algorithms/kliep_algorithm.py +++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py @@ -9,6 +9,8 @@ import numpy as np import numpy.typing as npt from scipy.optimize import minimize +from typing import List, Any +from typing import cast from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm @@ -22,7 +24,7 @@ def __init__( threshold: float = 1.1, max_iter: int = 100, min_window_size: int = 10 - ): + ) -> None: """ Initializes a new instance of KLIEP based change point detection algorithm. @@ -32,10 +34,10 @@ def __init__( :param max_iter: maximum number of iterations for the L-BFGS-B optimizer. :param min_window_size: minimum size of data segments to consider. """ - self.bandwidth = bandwidth, - self.regularisation = regularization, - self.threshold = threshold, - self.max_iter = max_iter, + self.bandwidth = bandwidth + self.regularisation = regularization + self.threshold = threshold + self.max_iter = max_iter self.min_window_size = min_window_size def detect(self, window: npt.NDArray[np.float64]) -> int: @@ -61,27 +63,27 @@ def localize(self, window: npt.NDArray[np.float64]) -> list[int]: scores = self._compute_kliep_scores(window) return self._find_change_points(scores) - def _validate_window(self, window: np.ndarray) -> np.ndarray: + def _validate_window(self, window: npt.NDArray[Any]) -> npt.NDArray[np.float64]: """ Validates and prepares the input window for processing. 
:param window: input data window. :return: validated window in 2D format. """ - window = np.asarray(window) + window = np.asarray(window, dtype=np.float64) if window.ndim == 1: window = window.reshape(-1, 1) return window - def _compute_kliep_scores(self, window: np.ndarray) -> np.ndarray: + def _compute_kliep_scores(self, window: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: """ Computes KLIEP anomaly scores for each point in the window. :param window: validated input data window. :return: array of KLIEP scores for each point. """ - n_points = window.shape(0) - scores = np.zeros(n_points) + n_points = window.shape[0] + scores = np.zeros(n_points, dtype=np.float64) for i in range(self.min_window_size, n_points - self.min_window_size): before = window[:i] @@ -101,9 +103,9 @@ def _compute_kliep_scores(self, window: np.ndarray) -> np.ndarray: def _optimize_alpha( self, - test_density: np.ndarray, - ref_density: np.ndarray - ) -> np.ndarray: + test_density: npt.NDArray[np.float64], + ref_density: npt.NDArray[np.float64] + ) -> npt.NDArray[np.float64]: """ Optimizes the alpha parameters for KLIEP density ratio estimation. @@ -111,21 +113,25 @@ def _optimize_alpha( :param ref_density: density estimates for the reference window. :return: optimized alpha parameters. """ - def loss(alpha: np.ndarray) -> float: + def loss(alpha: npt.NDArray[np.float64]) -> float: """Objective function for KLIEP optimization.""" ratio = np.exp(test_density - ref_density - alpha) - return -np.mean(np.log(ratio)) + self.regularisation * np.sum(alpha**2) + loss_val = -np.mean(np.log(ratio)) + self.regularisation * np.sum(alpha**2) + return float(loss_val) + + initial_alpha = np.zeros_like(test_density, dtype=np.float64) + bounds = [(0, None)] * len(test_density) res = minimize( loss, - np.zeros_like(test_density), + initial_alpha, method='L-BFGS-B', options={'maxiter': self.max_iter}, - bounds=[(0, None)] * len(test_density) + bounds=bounds ) - return res.x + return cast(npt.NDArray[np.float64], res.x) - def _find_change_points(self, scores: np.ndarray) -> list[int]: + def _find_change_points(self, scores: npt.NDArray[np.float64]) -> List[int]: """ Identifies change points from computed KLIEP scores. From 0b80f9e66e830494b06d384440b398bc4a6f8460 Mon Sep 17 00:00:00 2001 From: alistkova Date: Tue, 8 Apr 2025 13:52:39 +0300 Subject: [PATCH 21/31] revert merge --- .gitignore | 2 +- README.md | 2 +- .../core/algorithms/abstract_algorithm.py | 2 +- .../bayesian/abstracts/idetector.py | 12 +- .../algorithms/bayesian/abstracts/ihazard.py | 11 +- .../bayesian/abstracts/ilikelihood.py | 16 +- .../bayesian/abstracts/ilocalizer.py | 9 +- .../detectors/{threshold.py => simple.py} | 2 +- .../algorithms/bayesian/hazards/constant.py | 2 +- .../bayesian/likelihoods/gaussian.py | 9 +- ... 
=> gaussian_unknown_mean_and_variance.py} | 6 +- .../localizers/{argmax.py => simple.py} | 2 +- .../core/algorithms/bayesian_algorithm.py | 18 +- .../algorithms/bayesian_online_algorithm.py | 210 ------------------ .../core/algorithms/online_algorithm.py | 35 --- pysatl_cpd/core/online_cpd_core.py | 38 ---- pysatl_cpd/core/scrubber/data_providers.py | 3 +- pysatl_cpd/cpd_solver.py | 142 +++++++++++- pysatl_cpd/icpd_solver.py | 158 ------------- pysatl_cpd/online_cpd_solver.py | 68 ------ .../test_bayesian_algorithm.py | 38 ++-- .../test_bayesian_online_algorithm.py | 144 ------------ tests/test_core/test_online_cpd_core.py | 60 ----- tests/test_online_solver.py | 117 ---------- tests/test_solver.py | 3 +- 25 files changed, 208 insertions(+), 901 deletions(-) rename pysatl_cpd/core/algorithms/bayesian/detectors/{threshold.py => simple.py} (97%) rename pysatl_cpd/core/algorithms/bayesian/likelihoods/{gaussian_conjugate.py => gaussian_unknown_mean_and_variance.py} (96%) rename pysatl_cpd/core/algorithms/bayesian/localizers/{argmax.py => simple.py} (96%) delete mode 100644 pysatl_cpd/core/algorithms/bayesian_online_algorithm.py delete mode 100644 pysatl_cpd/core/algorithms/online_algorithm.py delete mode 100644 pysatl_cpd/core/online_cpd_core.py delete mode 100644 pysatl_cpd/icpd_solver.py delete mode 100644 pysatl_cpd/online_cpd_solver.py delete mode 100644 tests/test_core/test_algorithms/test_bayesian_online_algorithm.py delete mode 100644 tests/test_core/test_online_cpd_core.py delete mode 100644 tests/test_online_solver.py diff --git a/.gitignore b/.gitignore index 29277951..8b932ca9 100644 --- a/.gitignore +++ b/.gitignore @@ -159,5 +159,5 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ +#.idea/ /poetry.lock diff --git a/README.md b/README.md index b31efb62..cb081174 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ **Change point detection** module (*abbreviated CPD module*) is a module, designed for detecting anomalies in time series data, which refer to significant deviations from expected patterns or trends. Anomalies can indicate unusual events or changes in a system, making them crucial for monitoring and analysis in various fields such as finance, healthcare, and network security. 
At the moment, the module implements the following CPD algorithms: -* Bayesian algorithm (scrubbing and online versions) +* Bayesian algorithm * Density based algorithms: * KLIEP * RuLSIF diff --git a/pysatl_cpd/core/algorithms/abstract_algorithm.py b/pysatl_cpd/core/algorithms/abstract_algorithm.py index ea5ba18a..baa23462 100644 --- a/pysatl_cpd/core/algorithms/abstract_algorithm.py +++ b/pysatl_cpd/core/algorithms/abstract_algorithm.py @@ -5,7 +5,7 @@ class Algorithm(Protocol): - """Protocol for change point detection algorithms' interface""" + """Abstract class for change point detection algorithms""" def detect(self, window: npt.NDArray[np.float64]) -> int: """Function for finding change points in window diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py index 537547fd..ee9d7453 100644 --- a/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py +++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py @@ -7,27 +7,29 @@ __license__ = "SPDX-License-Identifier: MIT" -from typing import Protocol +from abc import ABC, abstractmethod import numpy as np import numpy.typing as npt -class IDetector(Protocol): +class IDetector(ABC): """ - Protocol for detectors that detect a change point with given growth probabilities for run lengths. + Abstract base class for detectors that detect a change point with given growth probabilities for run lengths. """ + @abstractmethod def detect(self, growth_probs: npt.NDArray[np.float64]) -> bool: """ Checks whether a changepoint occurred with given growth probabilities at the time. :param growth_probs: growth probabilities for run lengths at the time. :return: boolean indicating whether a changepoint occurred """ - ... + raise NotImplementedError + @abstractmethod def clear(self) -> None: """ Clears the detector's state. """ - ... + raise NotImplementedError diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py index 876365f7..fc7b8d1f 100644 --- a/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py +++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py @@ -7,21 +7,22 @@ __license__ = "SPDX-License-Identifier: MIT" -from typing import Protocol +from abc import ABC, abstractmethod import numpy as np import numpy.typing as npt -class IHazard(Protocol): +class IHazard(ABC): """ - Hazard function protocol. + Hazard function abstract base class. """ - def hazard(self, run_lengths: npt.NDArray[np.intp]) -> npt.NDArray[np.float64]: + @abstractmethod + def hazard(self, run_lengths: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: """ Calculates the hazard function for given run lengths. :param run_lengths: run lengths at the time. :return: hazard function's values for given run lengths. """ - ... + raise NotImplementedError diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py index 51d5c416..f7ef8ad6 100644 --- a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py +++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py @@ -7,17 +7,18 @@ __license__ = "SPDX-License-Identifier: MIT" -from typing import Protocol +from abc import ABC, abstractmethod import numpy as np import numpy.typing as npt -class ILikelihood(Protocol): +class ILikelihood(ABC): """ - Likelihood function's protocol. + Likelihood function's abstract base class. 
""" + @abstractmethod def learn(self, learning_sample: npt.NDArray[np.float64]) -> None: """ Learns first parameters of a likelihood function on a given sample. @@ -25,7 +26,8 @@ def learn(self, learning_sample: npt.NDArray[np.float64]) -> None: """ ... - def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: + @abstractmethod + def predict(self, observation: np.float64) -> npt.ArrayLike: """ Returns predictive probabilities for a given observation based on stored parameters. :param observation: an observation from a sample. @@ -33,15 +35,17 @@ def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: """ ... + @abstractmethod def update(self, observation: np.float64) -> None: """ Updates parameters of a likelihood function according to the given observation. :param observation: an observation from a sample. """ - ... + raise NotImplementedError + @abstractmethod def clear(self) -> None: """ Clears likelihood function's state. """ - ... + raise NotImplementedError diff --git a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py index 9ef9cb69..27223c42 100644 --- a/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py +++ b/pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py @@ -7,21 +7,22 @@ __license__ = "SPDX-License-Identifier: MIT" -from typing import Protocol +from abc import ABC, abstractmethod import numpy as np import numpy.typing as npt -class ILocalizer(Protocol): +class ILocalizer(ABC): """ - Protocol for localizers that localize a change point with given growth probabilities for run lengths. + Abstract base class for localizers that localize a change point with given growth probabilities for run lengths. """ + @abstractmethod def localize(self, growth_probs: npt.NDArray[np.float64]) -> int: """ Localizes a change point with given growth probabilities for run lengths. :param growth_probs: growth probabilities for run lengths at the time. :return: run length corresponding with a change point. """ - ... + raise NotImplementedError diff --git a/pysatl_cpd/core/algorithms/bayesian/detectors/threshold.py b/pysatl_cpd/core/algorithms/bayesian/detectors/simple.py similarity index 97% rename from pysatl_cpd/core/algorithms/bayesian/detectors/threshold.py rename to pysatl_cpd/core/algorithms/bayesian/detectors/simple.py index 5d66b002..a0815360 100644 --- a/pysatl_cpd/core/algorithms/bayesian/detectors/threshold.py +++ b/pysatl_cpd/core/algorithms/bayesian/detectors/simple.py @@ -14,7 +14,7 @@ from pysatl_cpd.core.algorithms.bayesian.abstracts.idetector import IDetector -class ThresholdDetector(IDetector): +class SimpleDetector(IDetector): """ A detector that detects a change point if the probability of the maximum run length drops below the threshold. """ diff --git a/pysatl_cpd/core/algorithms/bayesian/hazards/constant.py b/pysatl_cpd/core/algorithms/bayesian/hazards/constant.py index b0728820..f530f2c8 100644 --- a/pysatl_cpd/core/algorithms/bayesian/hazards/constant.py +++ b/pysatl_cpd/core/algorithms/bayesian/hazards/constant.py @@ -27,7 +27,7 @@ def __init__(self, rate: float): self._rate = np.float64(rate) assert self._rate >= 1.0, "Hazard rate cannot be less than 1.0" - def hazard(self, run_lengths: npt.NDArray[np.intp]) -> npt.NDArray[np.float64]: + def hazard(self, run_lengths: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: """ Calculates the constant hazard function. :param run_lengths: run lengths at the time. 
diff --git a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py index 6936c3b0..d4464e14 100644 --- a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py +++ b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py @@ -7,16 +7,15 @@ __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" __license__ = "SPDX-License-Identifier: MIT" + import numpy as np import numpy.typing as npt from scipy import stats -from typing_extensions import deprecated from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood -@deprecated("Use GaussianConjugate instead") -class Gaussian(ILikelihood): +class GaussianLikelihood(ILikelihood): """ Likelihood for Gaussian (a.k.a. normal) distribution, parametrized by mean and standard deviation. """ @@ -77,13 +76,13 @@ def update(self, observation: np.float64) -> None: self.__update_parameters_lists() - def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: + def predict(self, observation: np.float64) -> npt.ArrayLike: """ Returns predictive probabilities for a given observation based on stored means and standard deviations. :param observation: an observation from a sample. :return: predictive probabilities for a given observation. """ - return np.array(stats.norm(self.__means, self.__standard_deviations).pdf(observation)) + return stats.norm(self.__means, self.__standard_deviations).pdf(observation) def clear(self) -> None: """ diff --git a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_unknown_mean_and_variance.py similarity index 96% rename from pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py rename to pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_unknown_mean_and_variance.py index e43419b3..76ba3a4f 100644 --- a/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_conjugate.py +++ b/pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian_unknown_mean_and_variance.py @@ -15,7 +15,7 @@ from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood -class GaussianConjugate(ILikelihood): +class GaussianUnknownMeanAndVariance(ILikelihood): """ Likelihood for Gaussian (a.k.a. normal) distribution with unknown mean and variance estimated from normal-inverse gamma distribution as a conjugate prior. It uses 4 parameters, which priors are estimated from a learning sample and @@ -89,7 +89,7 @@ def update(self, observation: np.float64) -> None: self.__alpha_params = new_alpha_params self.__beta_params = new_beta_params - def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: + def predict(self, observation: np.float64) -> npt.ArrayLike: """ Returns predictive probabilities for a given observation based on posterior parameters. Predictive distribution is Student's t-distribution with 2 * alpha degrees of freedom. 
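
For reference, the predictive form named in this docstring follows from standard conjugate analysis (not shown in the patch itself): with a normal-inverse-gamma prior with parameters $(\mu, \kappa, \alpha, \beta)$, the posterior predictive for a new observation $x$ is a Student's t-distribution,

$$p(x \mid \mathcal{D}) = t_{2\alpha}\!\left(x \,\middle|\, \mu,\ \frac{\beta(\kappa + 1)}{\alpha \kappa}\right),$$

with $2\alpha$ degrees of freedom, location $\mu$, and squared scale $\beta(\kappa + 1)/(\alpha \kappa)$; that is where the "2 * alpha degrees of freedom" wording above comes from.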
@@ -110,7 +110,7 @@ def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: scale=scales, ) - return np.array(predictive_probabilities) + return predictive_probabilities def clear(self) -> None: """ diff --git a/pysatl_cpd/core/algorithms/bayesian/localizers/argmax.py b/pysatl_cpd/core/algorithms/bayesian/localizers/simple.py similarity index 96% rename from pysatl_cpd/core/algorithms/bayesian/localizers/argmax.py rename to pysatl_cpd/core/algorithms/bayesian/localizers/simple.py index 574a01ca..a8e552c9 100644 --- a/pysatl_cpd/core/algorithms/bayesian/localizers/argmax.py +++ b/pysatl_cpd/core/algorithms/bayesian/localizers/simple.py @@ -12,7 +12,7 @@ from pysatl_cpd.core.algorithms.bayesian.abstracts.ilocalizer import ILocalizer -class ArgmaxLocalizer(ILocalizer): +class SimpleLocalizer(ILocalizer): """ A localizer that localizes a change point corresponding with the most probable non-max run length. """ diff --git a/pysatl_cpd/core/algorithms/bayesian_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_algorithm.py index 19ae3c1c..82ef71b8 100644 --- a/pysatl_cpd/core/algorithms/bayesian_algorithm.py +++ b/pysatl_cpd/core/algorithms/bayesian_algorithm.py @@ -15,12 +15,12 @@ from pysatl_cpd.core.algorithms.bayesian.abstracts.ihazard import IHazard from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood from pysatl_cpd.core.algorithms.bayesian.abstracts.ilocalizer import ILocalizer -from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector +from pysatl_cpd.core.algorithms.bayesian.detectors.simple import SimpleDetector from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard -from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import ( - GaussianConjugate, +from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_unknown_mean_and_variance import ( + GaussianUnknownMeanAndVariance, ) -from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer +from pysatl_cpd.core.algorithms.bayesian.localizers.simple import SimpleLocalizer class BayesianAlgorithm(Algorithm): @@ -36,10 +36,10 @@ class BayesianAlgorithm(Algorithm): def __init__( self, learning_steps: int = 50, - likelihood: ILikelihood = GaussianConjugate(), + likelihood: ILikelihood = GaussianUnknownMeanAndVariance(), hazard: IHazard = ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), - detector: IDetector = ThresholdDetector(threshold=0.04), - localizer: ILocalizer = ArgmaxLocalizer(), + detector: IDetector = SimpleDetector(threshold=0.04), + localizer: ILocalizer = SimpleLocalizer(), ): """ Initializes a new instance of Bayesian algorithm module with given customization. @@ -122,7 +122,7 @@ def __bayesian_stage(self, sample: npt.NDArray[np.float64]) -> None: while self.__bayesian_condition(sample_size): assert self.__time >= 0 - observation = np.float64(sample[self.__time]) + observation = sample[self.__time] self.__shift_time(1) self.__gap_size += 1 assert self.__gap_size > 0 @@ -180,7 +180,7 @@ def __bayesian_update(self, observation: np.float64) -> None: return # 4. Evaluate the hazard function for the gap. - hazard_val = np.array(self.__hazard.hazard(np.arange(self.__gap_size, dtype=np.intp))) + hazard_val = np.array(self.__hazard.hazard(np.array(range(self.__gap_size)))) # Evaluate the changepoint probability at *this* step (NB: generally it can be found later, with some delay). 
changepoint_prob = np.sum(self.__growth_probs[0 : self.__gap_size] * predictive_probs * hazard_val) diff --git a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py b/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py deleted file mode 100644 index 242b2d5b..00000000 --- a/pysatl_cpd/core/algorithms/bayesian_online_algorithm.py +++ /dev/null @@ -1,210 +0,0 @@ -""" -Module for Bayesian online change point detection algorithm. -""" - -__author__ = "Alexey Tatyanenko" -__copyright__ = "Copyright (c) 2025 PySATL project" -__license__ = "SPDX-License-Identifier: MIT" - -from typing import Optional - -import numpy as np -import numpy.typing as npt - -from pysatl_cpd.core.algorithms.bayesian.abstracts.idetector import IDetector -from pysatl_cpd.core.algorithms.bayesian.abstracts.ihazard import IHazard -from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood -from pysatl_cpd.core.algorithms.bayesian.abstracts.ilocalizer import ILocalizer -from pysatl_cpd.core.algorithms.online_algorithm import OnlineAlgorithm - - -class BayesianOnline(OnlineAlgorithm): - """ - Class for Bayesian online change point detection algorithm. - """ - - def __init__( - self, - hazard: IHazard, - likelihood: ILikelihood, - learning_sample_size: int, - detector: IDetector, - localizer: ILocalizer, - ) -> None: - self.__detector = detector - self.__hazard = hazard - self.__likelihood = likelihood - self.__localizer = localizer - self.__learning_sample_size = learning_sample_size - - self.__training_data: list[np.float64] = [] - self.__data_history: list[np.float64] = [] - self.__current_time = 0 - - self.__is_training: bool = True - self.__run_length_probs: npt.NDArray[np.float64] = np.array([]) - - self.__was_change_point = False - self.__change_point: int | None = None - - def clear(self) -> None: - """ - Clears the state of the algorithm's instance. - :return: - """ - self.__clear_training_data() - self.__data_history = [] - self.__current_time = 0 - - self.__is_training = True - self.__run_length_probs = np.array([]) - - self.__was_change_point = False - self.__change_point = None - - def __clear_training_data(self) -> None: - """ - Clears list of training data. - :return: - """ - self.__training_data = [] - - def __learn(self, observation: np.float64) -> None: - """ - Performs a learning step for a prediction model until the given learning sample size is achieved. - :param observation: new observation of a time series. - :return: - """ - self.__training_data.append(observation) - if len(self.__training_data) == self.__learning_sample_size: - self.__likelihood.clear() - self.__detector.clear() - - self.__likelihood.learn(np.array(self.__training_data)) - self.__is_training = False - self.__run_length_probs = np.array([1.0]) - - assert len(self.__training_data) <= self.__learning_sample_size, ( - "Training data should not be longer than learning sample size" - ) - - def __bayesian_update(self, observation: np.float64) -> None: - """ - Performs a bayesian update of the algorithm's state. - :param observation: new observation of a time series. 
- :return: - """ - predictive_prob = self.__likelihood.predict(observation) - hazards = self.__hazard.hazard(np.arange(self.__run_length_probs.shape[0], dtype=np.intp)) - - growth_probs = self.__run_length_probs * (1 - hazards) * predictive_prob - change_point_prob = np.sum(self.__run_length_probs * hazards * predictive_prob) - new_probs = np.append(change_point_prob, growth_probs) - - evidence = np.sum(new_probs) - assert evidence > 0.0, "Evidence must be > 0.0" - new_probs /= evidence - assert np.all(np.logical_and(new_probs >= 0.0, new_probs <= 1.0)) - - self.__run_length_probs = new_probs - self.__likelihood.update(observation) - - def __handle_localization(self) -> None: - """ - Handles localization of the change point. It includes acquiring location, updating stored data and state of the - algorithm, training it if possible and building corresponding run length distribution. - :return: - """ - run_length = self.__localizer.localize(self.__run_length_probs) - change_point_location = self.__current_time - run_length - assert 0 <= change_point_location <= self.__current_time, ( - "Change point shouldn't be outside the available scope" - ) - - assert len(self.__data_history) >= run_length, "Run length shouldn't exceed available data length" - self.__data_history = self.__data_history[-run_length:] - self.__clear_training_data() - self.__change_point = change_point_location - - self.__likelihood.clear() - self.__detector.clear() - self.__is_training = True - - observations_to_learn = min(len(self.__data_history), self.__learning_sample_size) - data_to_train = self.__data_history[:observations_to_learn] - - # Learning as much as we can until we reach learning sample size limit - for observation in data_to_train: - self.__learn(observation) - - # Modeling run length probabilities on the rest data - if len(self.__data_history) >= self.__learning_sample_size: - for observation in self.__data_history[self.__learning_sample_size :]: - self.__bayesian_update(observation) - - def __handle_detection(self) -> None: - """ - Handles detection of the change point. It includes updating stored data and state of the algorithm. - :return: - """ - self.__data_history = self.__data_history[-1:] - self.__clear_training_data() - self.__likelihood.clear() - self.__detector.clear() - self.__is_training = True - self.__learn(self.__data_history[-1]) - - def __process_point(self, observation: np.float64, with_localization: bool) -> None: - """ - Universal method for processing of another observation of a time series. - :param observation: new observation of a time series. - :param with_localization: whether the method was called for localization of a change point. - :return: - """ - self.__data_history.append(observation) - self.__current_time += 1 - - if self.__is_training: - self.__learn(observation) - else: - self.__bayesian_update(observation) - detected = self.__detector.detect(self.__run_length_probs) - - if not detected: - return - - self.__was_change_point = True - if with_localization: - self.__handle_localization() - else: - self.__handle_detection() - - def detect(self, observation: np.float64 | npt.NDArray[np.float64]) -> bool: - """ - Performs a change point detection after processing another observation of a time series. - :param observation: new observation of a time series. Note: multivariate time series aren't supported for now. - :return: whether a change point was detected after processing the new observation. 
- """ - if observation is npt.NDArray[np.float64]: - raise TypeError("Multivariate observations are not supported") - - self.__process_point(np.float64(observation), False) - result = self.__was_change_point - self.__was_change_point = False - return result - - def localize(self, observation: np.float64 | npt.NDArray[np.float64]) -> Optional[int]: - """ - Performs a change point localization after processing another observation of a time series. - :param observation: new observation of a time series. - :return: absolute location of a change point, acquired after processing the new observation, - or None if there wasn't any. - """ - if observation is npt.NDArray[np.float64]: - raise TypeError("Multivariate observations are not supported") - - self.__process_point(np.float64(observation), True) - result = self.__change_point - self.__was_change_point = False - self.__change_point = None - return result diff --git a/pysatl_cpd/core/algorithms/online_algorithm.py b/pysatl_cpd/core/algorithms/online_algorithm.py deleted file mode 100644 index 05a29b59..00000000 --- a/pysatl_cpd/core/algorithms/online_algorithm.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Module for online change point detection algorithm's interface. -""" - -__author__ = "Alexey Tatyanenko" -__copyright__ = "Copyright (c) 2025 PySATL project" -__license__ = "SPDX-License-Identifier: MIT" - -from typing import Optional, Protocol - -import numpy as np -import numpy.typing as npt - - -class OnlineAlgorithm(Protocol): - """ - Protocol for online change point detection algorithm's interface. - """ - - def detect(self, observation: np.float64 | npt.NDArray[np.float64]) -> bool: - """ - Method for a step of detection of a change point. - :param observation: new observation of a time series. - :return: bool observation whether a change point was detected after processing the new observation. - """ - ... - - def localize(self, observation: np.float64 | npt.NDArray[np.float64]) -> Optional[int]: - """ - Method for a step of localization of a change point. - :param observation: new observation of a time series - :return: absolute location of a change point, acquired after processing the new observation, - or None if there wasn't any. - """ - ... diff --git a/pysatl_cpd/core/online_cpd_core.py b/pysatl_cpd/core/online_cpd_core.py deleted file mode 100644 index ef4963b8..00000000 --- a/pysatl_cpd/core/online_cpd_core.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -Module for online-CPD core, which presents access to algorithms as iterators over provdied data. -""" - -__author__ = "Alexey Tatyanenko" -__copyright__ = "Copyright (c) 2025 PySATL project" -__license__ = "SPDX-License-Identifier: MIT" - -from collections.abc import Iterator - -from pysatl_cpd.core.algorithms.online_algorithm import OnlineAlgorithm -from pysatl_cpd.core.scrubber.data_providers import DataProvider - - -class OnlineCpdCore: - """ - Class that presents online CPD-algorithm as detection or localization iterator over the provided data. - """ - - def __init__(self, algorithm: OnlineAlgorithm, data_provider: DataProvider) -> None: - self.algorithm = algorithm - self.data_provider = data_provider - - def detect(self) -> Iterator[bool]: - """ - Iteratively tries to detect a change point in the provided data. - :return: whether a change point after processed observation was detected. 
- """ - for observation in self.data_provider: - yield self.algorithm.detect(observation) - - def localize(self) -> Iterator[int | None]: - """ - Iteratively tries to localize a change point in the provided data. - :return: change point location, if it was successfully localized, or None, otherwise. - """ - for observation in self.data_provider: - yield self.algorithm.localize(observation) diff --git a/pysatl_cpd/core/scrubber/data_providers.py b/pysatl_cpd/core/scrubber/data_providers.py index 9b12b25a..8f5d9cfc 100644 --- a/pysatl_cpd/core/scrubber/data_providers.py +++ b/pysatl_cpd/core/scrubber/data_providers.py @@ -7,7 +7,7 @@ __license__ = "SPDX-License-Identifier: MIT" from collections.abc import Iterator -from typing import Protocol, runtime_checkable +from typing import Protocol import numpy as np import numpy.typing as npt @@ -15,7 +15,6 @@ from pysatl_cpd.labeled_data import LabeledCpdData -@runtime_checkable class DataProvider(Protocol): """Interface for abstracting the scrubber from the data source and its format""" diff --git a/pysatl_cpd/cpd_solver.py b/pysatl_cpd/cpd_solver.py index 07acf192..ec324da4 100644 --- a/pysatl_cpd/cpd_solver.py +++ b/pysatl_cpd/cpd_solver.py @@ -1,5 +1,5 @@ """ -Module contains class for solving change point detection problem. +Module contains classes for solving change point detection problem and representation its results. """ __author__ = "Aleksei Ivanov, Artem Romanyuk, Vladimir Kutuev" @@ -7,17 +7,150 @@ __license__ = "SPDX-License-Identifier: MIT" import time +from collections.abc import Iterator +from pathlib import Path +import numpy as np +import numpy.typing as npt +from matplotlib import pyplot as plt + +from .analysis.results_analyzer import CpdResultsAnalyzer from .core.algorithms.abstract_algorithm import Algorithm from .core.cpd_core import CpdCore from .core.problem import CpdProblem from .core.scrubber.abstract import Scrubber from .core.scrubber.data_providers import LabeledDataProvider -from .icpd_solver import CpdLocalizationResults, ICpdSolver from .labeled_data import LabeledCpdData -class CpdSolver(ICpdSolver): +class CpdLocalizationResults: + """Container for results of CPD algorithms""" + + def __init__( + self, + data: Iterator[np.float64] | Iterator[npt.NDArray[np.float64]], + result: list[int], + expected_result: list[int] | None, + time_sec: float, + ) -> None: + """Object constructor + + :param: result: list, containing change points, that were found by CPD algos + :param: expected_result: list, containing expected change points, if it is needed + :param: time_sec: a float number, time of CPD algo execution in fractional seconds + """ + self.data = data + self.result = result + self.expected_result = expected_result + self.time_sec = time_sec + + @property + def result_diff(self) -> list[int]: + """method for calculation symmetrical diff between results and expected results (if its granted) + + :return: symmetrical difference between results and expected results + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, thus diff cannot be calculated.") + first, second = set(self.result), set(self.expected_result) + return sorted(list(first.symmetric_difference(second))) + + def __str__(self) -> str: + """method for printing results of CPD algo results in a convenient way + + :return: string with brief CPD algo execution results + """ + cp_results = ";".join(map(str, self.result)) + method_output = f"Located change points: ({cp_results})\n" + if self.expected_result 
is not None: + expected_cp_results = ";".join(map(str, self.expected_result)) + diff = ";".join(map(str, self.result_diff)) + method_output += f"Expected change point: ({expected_cp_results})\n" + method_output += f"Difference: ({diff})\n" + method_output += f"Computation time (sec): {round(self.time_sec, 2)}" + return method_output + + def count_confusion_matrix(self, window: tuple[int, int] | None = None) -> tuple[int, int, int, int]: + """method for counting confusion matrix for hypothesis of equality of CPD results and expected + results on a window + + :param: window: tuple of two indices (start, stop), determines a window for hypothesis + + :return: tuple of integers (true-positive, true-negative, false-positive, false-negative) + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, confusion matrix cannot be calculated") + return CpdResultsAnalyzer.count_confusion_matrix(self.result, self.expected_result, window) + + def count_accuracy(self, window: tuple[int, int] | None = None) -> float: + """method for counting accuracy metric for hypothesis of equality of CPD results and expected + results on a window + + :param: window: tuple of two indices (start, stop), determines a window for hypothesis + + :return: float, accuracy metric + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, accuracy cannot be calculated") + return CpdResultsAnalyzer.count_accuracy(self.result, self.expected_result, window) + + def count_precision(self, window: tuple[int, int] | None = None) -> float: + """method for counting precision metric for hypothesis of equality of CPD results and expected + results on a window + + :param: window: tuple of two indices (start, stop), determines a window for hypothesis + + :return: float, precision metric + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, precision cannot be calculated") + return CpdResultsAnalyzer.count_precision(self.result, self.expected_result, window) + + def count_recall(self, window: tuple[int, int] | None = None) -> float: + """method for counting recall metric for hypothesis of equality of CPD results and expected results on a window + + :param: window: tuple of two indices (start, stop), determines a window for hypothesis + + :return: float, recall metric + """ + if self.expected_result is None: + raise ValueError("this object is not provided with expected result, recall cannot be calculated") + return CpdResultsAnalyzer.count_recall(self.result, self.expected_result, window) + + def visualize(self, to_show: bool = True, output_directory: Path | None = None, name: str = "plot") -> None: + """method for building and analyzing graph + + :param to_show: is it necessary to show a graph + :param output_directory: If necessary, the path to the directory to save the graph + :param name: A name for the output image with plot + """ + + data: npt.NDArray[np.float64] = np.array(list(self.data)) + plt.plot(data) + if self.expected_result is None: + plt.vlines(x=self.result, ymin=data.min(), ymax=data.max(), colors="orange", ls="--") + plt.gca().legend(("data", "detected")) + else: + correct, incorrect, undetected = set(), set(), set(self.expected_result) + for point in self.result: + if point in self.expected_result: + correct.add(point) + undetected.remove(point) + elif point not in undetected: + incorrect.add(point) + plt.vlines(x=list(correct), ymin=data.min(), ymax=data.max(), 
colors="green", ls="--") + plt.vlines(x=list(incorrect), ymin=data.min(), ymax=data.max(), colors="red", ls="--") + plt.vlines(x=list(undetected), ymin=data.min(), ymax=data.max(), colors="orange", ls="--") + plt.gca().legend(("data", "correct detected", "incorrect detected", "undetected")) + if output_directory: + if not output_directory.exists(): + output_directory.mkdir() + plt.savefig(output_directory.joinpath(Path(name))) + if to_show: + plt.show() + + +class CpdSolver: """Class, that grants a convenient interface to work with CPD algorithms""" @@ -47,8 +180,7 @@ def __init__( def run(self) -> CpdLocalizationResults | int: """Execute CPD algorithm and return container with its results - :return: CpdLocalizationResults object, containing algo result CP and expected CP if needed, - or number of detected change points. + :return: CPContainer object, containing algo result CP and expected CP if needed """ time_start = time.perf_counter() if not self._scenario.to_localize: diff --git a/pysatl_cpd/icpd_solver.py b/pysatl_cpd/icpd_solver.py deleted file mode 100644 index f90b14a3..00000000 --- a/pysatl_cpd/icpd_solver.py +++ /dev/null @@ -1,158 +0,0 @@ -""" -Module contains protocol for solution of change point detection problem and class for representation of its results. -""" - -__author__ = "Aleksei Ivanov, Alexey Tatyanenko, Artem Romanyuk, Vladimir Kutuev" -__copyright__ = "Copyright (c) 2025 PySATL project" -__license__ = "SPDX-License-Identifier: MIT" - -from collections.abc import Iterator -from pathlib import Path -from typing import Protocol - -import numpy as np -import numpy.typing as npt -from matplotlib import pyplot as plt - -from pysatl_cpd.analysis.results_analyzer import CpdResultsAnalyzer - - -class CpdLocalizationResults: - """Container for results of CPD algorithms""" - - def __init__( - self, - data: Iterator[np.float64] | Iterator[npt.NDArray[np.float64]], - result: list[int], - expected_result: list[int] | None, - time_sec: float, - ) -> None: - """Object constructor - - :param: result: list, containing change points, that were found by CPD algos - :param: expected_result: list, containing expected change points, if it is needed - :param: time_sec: a float number, time of CPD algo execution in fractional seconds - """ - self.data = data - self.result = result - self.expected_result = expected_result - self.time_sec = time_sec - - @property - def result_diff(self) -> list[int]: - """method for calculation symmetrical diff between results and expected results (if its granted) - - :return: symmetrical difference between results and expected results - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected result, thus diff cannot be calculated.") - first, second = set(self.result), set(self.expected_result) - return sorted(list(first.symmetric_difference(second))) - - def __str__(self) -> str: - """method for printing results of CPD algo results in a convenient way - - :return: string with brief CPD algo execution results - """ - cp_results = ";".join(map(str, self.result)) - method_output = f"Located change points: ({cp_results})\n" - if self.expected_result is not None: - expected_cp_results = ";".join(map(str, self.expected_result)) - diff = ";".join(map(str, self.result_diff)) - method_output += f"Expected change point: ({expected_cp_results})\n" - method_output += f"Difference: ({diff})\n" - method_output += f"Computation time (sec): {round(self.time_sec, 2)}" - return method_output - - def count_confusion_matrix(self, 
window: tuple[int, int] | None = None) -> tuple[int, int, int, int]: - """method for counting confusion matrix for hypothesis of equality of CPD results and expected - results on a window - - :param: window: tuple of two indices (start, stop), determines a window for hypothesis - - :return: tuple of integers (true-positive, true-negative, false-positive, false-negative) - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected result, confusion matrix cannot be calculated") - return CpdResultsAnalyzer.count_confusion_matrix(self.result, self.expected_result, window) - - def count_accuracy(self, window: tuple[int, int] | None = None) -> float: - """method for counting accuracy metric for hypothesis of equality of CPD results and expected - results on a window - - :param: window: tuple of two indices (start, stop), determines a window for hypothesis - - :return: float, accuracy metric - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected result, accuracy cannot be calculated") - return CpdResultsAnalyzer.count_accuracy(self.result, self.expected_result, window) - - def count_precision(self, window: tuple[int, int] | None = None) -> float: - """method for counting precision metric for hypothesis of equality of CPD results and expected - results on a window - - :param: window: tuple of two indices (start, stop), determines a window for hypothesis - - :return: float, precision metric - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected result, precision cannot be calculated") - return CpdResultsAnalyzer.count_precision(self.result, self.expected_result, window) - - def count_recall(self, window: tuple[int, int] | None = None) -> float: - """method for counting recall metric for hypothesis of equality of CPD results and expected results on a window - - :param: window: tuple of two indices (start, stop), determines a window for hypothesis - - :return: float, recall metric - """ - if self.expected_result is None: - raise ValueError("this object is not provided with expected result, recall cannot be calculated") - return CpdResultsAnalyzer.count_recall(self.result, self.expected_result, window) - - def visualize(self, to_show: bool = True, output_directory: Path | None = None, name: str = "plot") -> None: - """method for building and analyzing graph - - :param to_show: is it necessary to show a graph - :param output_directory: If necessary, the path to the directory to save the graph - :param name: A name for the output image with plot - """ - - data: npt.NDArray[np.float64] = np.array(list(self.data)) - plt.plot(data) - if self.expected_result is None: - plt.vlines(x=self.result, ymin=data.min(), ymax=data.max(), colors="orange", ls="--") - plt.gca().legend(("data", "detected")) - else: - correct, incorrect, undetected = set(), set(), set(self.expected_result) - for point in self.result: - if point in self.expected_result: - correct.add(point) - undetected.remove(point) - elif point not in undetected: - incorrect.add(point) - plt.vlines(x=list(correct), ymin=data.min(), ymax=data.max(), colors="green", ls="--") - plt.vlines(x=list(incorrect), ymin=data.min(), ymax=data.max(), colors="red", ls="--") - plt.vlines(x=list(undetected), ymin=data.min(), ymax=data.max(), colors="orange", ls="--") - plt.gca().legend(("data", "correct detected", "incorrect detected", "undetected")) - if output_directory: - if not output_directory.exists(): - 
output_directory.mkdir() - plt.savefig(output_directory.joinpath(Path(name))) - if to_show: - plt.show() - - -class ICpdSolver(Protocol): - """ - Protocol, describing interface of CPD problem's solution. - """ - - def run(self) -> CpdLocalizationResults | int: - """ - Method for evaluation of CPD problem's solution, executing some CPD-algorithm. - :return: CpdLocalizationResults object, containing algo result CP and expected CP if needed, - or number of detected change points. - """ - ... diff --git a/pysatl_cpd/online_cpd_solver.py b/pysatl_cpd/online_cpd_solver.py deleted file mode 100644 index 86e5c9be..00000000 --- a/pysatl_cpd/online_cpd_solver.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Module contains class for solving change point detection problem with an online CPD algorithm. -""" - -__author__ = "Alexey Tatyanenko" -__copyright__ = "Copyright (c) 2025 PySATL project" -__license__ = "SPDX-License-Identifier: MIT" - -import time - -from pysatl_cpd.core.algorithms.online_algorithm import OnlineAlgorithm -from pysatl_cpd.core.online_cpd_core import OnlineCpdCore -from pysatl_cpd.core.problem import CpdProblem -from pysatl_cpd.core.scrubber.data_providers import DataProvider, LabeledDataProvider -from pysatl_cpd.icpd_solver import CpdLocalizationResults, ICpdSolver -from pysatl_cpd.labeled_data import LabeledCpdData - - -class OnlineCpdSolver(ICpdSolver): - """Class, that grants a convenient interface to - work with online-CPD algorithms""" - - def __init__( - self, - scenario: CpdProblem, - algorithm: OnlineAlgorithm, - algorithm_input: DataProvider | LabeledCpdData, - ) -> None: - """pysatl_cpd object constructor - - :param: scenario: scenario specify - :param: algorithm: online-CPD algorithm, that will search for change points - :param: algorithm_input: data provider or labeled data to construct corresponding data provider. 
- """ - self._labeled_data: LabeledCpdData | None = None - self._cpd_core: OnlineCpdCore - match algorithm_input: - case LabeledCpdData() as data: - self._labeled_data = data - self._cpd_core = OnlineCpdCore( - data_provider=LabeledDataProvider(data), - algorithm=algorithm, - ) - case DataProvider() as data_provider: - self._cpd_core = OnlineCpdCore( - data_provider=data_provider, - algorithm=algorithm, - ) - - self._scenario = scenario - - def run(self) -> CpdLocalizationResults | int: - """Execute online-CPD algorithm and return container with its results - - :return: CpdLocalizationResults object, containing algo result CP and expected CP if needed - """ - time_start = time.perf_counter() - if not self._scenario.to_localize: - return sum(self._cpd_core.detect()) - - algo_results = [cp for cp in self._cpd_core.localize() if cp is not None] - - time_end = time.perf_counter() - expected_change_points: list[int] | None = None - if isinstance(self._labeled_data, LabeledCpdData): - expected_change_points = self._labeled_data.change_points - data = iter(self._cpd_core.data_provider) - return CpdLocalizationResults(data, algo_results, expected_change_points, time_end - time_start) diff --git a/tests/test_core/test_algorithms/test_bayesian_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_algorithm.py index 43001668..0694b8bf 100644 --- a/tests/test_core/test_algorithms/test_bayesian_algorithm.py +++ b/tests/test_core/test_algorithms/test_bayesian_algorithm.py @@ -1,12 +1,12 @@ import numpy as np import pytest -from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector +from pysatl_cpd.core.algorithms.bayesian.detectors.simple import SimpleDetector from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard -from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import ( - GaussianConjugate, +from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_unknown_mean_and_variance import ( + GaussianUnknownMeanAndVariance, ) -from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer +from pysatl_cpd.core.algorithms.bayesian.localizers.simple import SimpleLocalizer from pysatl_cpd.core.algorithms.bayesian_algorithm import BayesianAlgorithm @@ -17,10 +17,10 @@ def set_seed(): def construct_bayesian_algorithm(): return BayesianAlgorithm( learning_steps=50, - likelihood=GaussianConjugate(), + likelihood=GaussianUnknownMeanAndVariance(), hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), - detector=ThresholdDetector(threshold=0.04), - localizer=ArgmaxLocalizer(), + detector=SimpleDetector(threshold=0.04), + localizer=SimpleLocalizer(), ) @@ -105,7 +105,7 @@ def test_correctness_of_consecutive_localization( @pytest.mark.parametrize("hazard_rate,max_run_length", [(1.1, 50), (10, 100), (200, 250), (500.325251, 500)]) def test_constant_hazard_for_constants(hazard_rate, max_run_length): constant_hazard = ConstantHazard(hazard_rate) - run_lengths = np.arange(max_run_length, dtype=np.intp) + run_lengths = np.arange(max_run_length) hazard_probs = constant_hazard.hazard(run_lengths) assert hazard_probs.shape[0] == max_run_length, ( f"Expected {max_run_length} probabilities, got {hazard_probs.shape[0]}" @@ -132,11 +132,11 @@ def test_learning_and_update(): np.random.normal(loc=5, scale=2, size=size - change_point), ] ) - likelihood = GaussianConjugate() - likelihood.learn(data[:learning_steps]) + likelihood = GaussianUnknownMeanAndVariance() + likelihood.learn(list(data[:learning_steps])) for time in 
range(learning_steps, size): - observation = np.float64(data[time]) + observation = float(data[time]) pred_probs = np.array(likelihood.predict(observation)) if time == time_after_learning: @@ -165,28 +165,28 @@ def test_clear(): size = 51 data = np.random.normal(loc=0, scale=1, size=size) - likelihood = GaussianConjugate() + likelihood = GaussianUnknownMeanAndVariance() - likelihood.learn(data[:-2]) - first = likelihood.predict(np.float64(data[-1])) + likelihood.learn(list(data[:-2])) + first = likelihood.predict(float(data[-1])) likelihood.clear() - likelihood.learn(data[:-2]) - second = likelihood.predict(np.float64(data[-1])) + likelihood.learn(list(data[:-2])) + second = likelihood.predict(float(data[-1])) assert first == second, f"Results differ after clear: {first} vs {second}" def test_detector_detection(): run_lengths = np.full(100, 0.01) - detector = ThresholdDetector(threshold=0.04) + detector = SimpleDetector(threshold=0.04) assert detector.detect(run_lengths), "Change point should be detected" def test_detector_clear(): run_lengths = np.full(100, 0.01) - detector = ThresholdDetector(threshold=0.04) + detector = SimpleDetector(threshold=0.04) result1 = detector.detect(run_lengths) detector.clear() @@ -199,6 +199,6 @@ def test_localizer_localization(): change_point = 5 run_lengths = np.full(11, 0.05) run_lengths[change_point] = 0.5 - localizer = ArgmaxLocalizer() + localizer = SimpleLocalizer() result = localizer.localize(run_lengths) assert result == change_point, f"Expected change at {change_point}, got {result}" diff --git a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py b/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py deleted file mode 100644 index 07e0d973..00000000 --- a/tests/test_core/test_algorithms/test_bayesian_online_algorithm.py +++ /dev/null @@ -1,144 +0,0 @@ -import numpy as np -import pytest - -from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector -from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard -from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import ( - GaussianConjugate, -) -from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer -from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnline - - -def set_seed(): - np.random.seed(1) - - -def construct_bayesian_online_algorithm(): - return BayesianOnline( - learning_sample_size=5, - likelihood=GaussianConjugate(), - hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), - detector=ThresholdDetector(threshold=0.04), - localizer=ArgmaxLocalizer(), - ) - - -@pytest.fixture(scope="function") -def data_params(): - return { - "num_of_tests": 10, - "size": 500, - "change_point": 250, - "tolerable_deviation": 25, - } - - -@pytest.fixture -def generate_data(data_params): - def _generate_data(): - set_seed() - return np.concatenate( - [ - np.random.normal(loc=0, scale=1, size=data_params["change_point"]), - np.random.normal(loc=5, scale=2, size=data_params["size"] - data_params["change_point"]), - ] - ) - - return _generate_data - - -@pytest.fixture(scope="function") -def outer_bayesian_algorithm(): - return construct_bayesian_online_algorithm() - - -@pytest.fixture -def inner_algorithm_factory(): - def _factory(): - return construct_bayesian_online_algorithm() - - return _factory - - -class TestBayesianOnlineAlgorithm: - def test_consecutive_detection(self, outer_bayesian_algorithm, generate_data, data_params): - for _ in 
range(data_params["num_of_tests"]): - data = generate_data() - was_change_point = False - for value in data: - result = outer_bayesian_algorithm.detect(value) - if result: - was_change_point = True - - assert was_change_point, "There was undetected change point in data" - - def test_correctness_of_consecutive_detection( - self, outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params - ): - for _ in range(data_params["num_of_tests"]): - data = generate_data() - inner_algorithm = inner_algorithm_factory() - outer_result = [] - inner_result = [] - - for value in data: - outer_result.append(outer_bayesian_algorithm.detect(value)) - inner_result.append(inner_algorithm.detect(value)) - - outer_bayesian_algorithm.clear() - assert outer_result == inner_result, "Consecutive and independent detection should give same results" - - def test_consecutive_localization(self, outer_bayesian_algorithm, generate_data, data_params): - for _ in range(data_params["num_of_tests"]): - data = generate_data() - was_change_point = False - - for value in data: - result = outer_bayesian_algorithm.localize(value) - if result is None: - continue - - was_change_point = True - - assert ( - data_params["change_point"] - data_params["tolerable_deviation"] - <= result - <= data_params["change_point"] + data_params["tolerable_deviation"] - ), "Incorrect change point localization" - - outer_bayesian_algorithm.clear() - assert was_change_point, "Actual change point was not detected at all" - - def test_correctness_of_consecutive_localization( - self, outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params - ): - for _ in range(data_params["num_of_tests"]): - data = generate_data() - inner_algorithm = inner_algorithm_factory() - - for value in data: - outer_result = outer_bayesian_algorithm.localize(value) - inner_result = inner_algorithm.localize(value) - assert outer_result == inner_result, "Consecutive and independent localization should give same results" - - outer_bayesian_algorithm.clear() - - def test_online_detection_correctness(self, inner_algorithm_factory, generate_data, data_params): - for _ in range(data_params["num_of_tests"]): - algorithm = inner_algorithm_factory() - data = generate_data() - for time, value in np.ndenumerate(data): - result = algorithm.detect(value) - if result: - assert data_params["change_point"] <= time[0], "Change point cannot be detected beforehand" - - def test_online_localization_correctness(self, inner_algorithm_factory, generate_data, data_params): - for _ in range(data_params["num_of_tests"]): - algorithm = inner_algorithm_factory() - data = generate_data() - for time, value in np.ndenumerate(data): - result = algorithm.localize(value) - if result: - assert result <= time[0], "Change point cannot be in future" - assert data_params["change_point"] <= time[0], "Change point cannot be detected beforehand" diff --git a/tests/test_core/test_online_cpd_core.py b/tests/test_core/test_online_cpd_core.py deleted file mode 100644 index 4be79521..00000000 --- a/tests/test_core/test_online_cpd_core.py +++ /dev/null @@ -1,60 +0,0 @@ -import numpy as np -import pytest - -from pysatl_cpd.core.online_cpd_core import OnlineCpdCore -from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider -from tests.test_core.test_algorithms.test_bayesian_online_algorithm import construct_bayesian_online_algorithm - -DATA_PARAMS = { - "num_of_tests": 10, - "size": 500, - "change_point": 250, - "tolerable_deviation": 25, -} - - 
-@pytest.fixture(scope="session") -def data_params(): - return DATA_PARAMS - - -@pytest.fixture -def algorithm(): - return construct_bayesian_online_algorithm() - - -@pytest.fixture(params=[True, False], ids=["with_cp", "without_cp"]) -def dataset(request, data_params): - np.random.seed(42 + request.param_index) - if request.param: # With change point - return np.concatenate( - [ - np.random.normal(0, 1, data_params["change_point"]), - np.random.normal(5, 2, data_params["size"] - data_params["change_point"]), - ] - ) - return np.random.normal(0, 1, data_params["size"]) # Without change point - - -@pytest.fixture -def online_core(dataset): - return OnlineCpdCore( - algorithm=construct_bayesian_online_algorithm(), data_provider=ListUnivariateProvider(list(dataset)) - ) - - -class TestOnlineCpdCore: - @pytest.mark.parametrize("test_iteration", range(DATA_PARAMS["num_of_tests"])) - @pytest.mark.parametrize("mode", ["detect", "localize"]) - def test_core_functionality(self, algorithm, online_core, dataset, data_params, mode, test_iteration): - core_iterator = getattr(online_core, mode)() - algo_method = getattr(algorithm, mode) - - for time_point in range(data_params["size"]): - observation = dataset[time_point] - algo_result = algo_method(observation) - core_result = next(core_iterator) - - assert algo_result == core_result, ( - f"Different results at {time_point} between manual {mode} and core {mode} iteration" - ) diff --git a/tests/test_online_solver.py b/tests/test_online_solver.py deleted file mode 100644 index 8b87c0e1..00000000 --- a/tests/test_online_solver.py +++ /dev/null @@ -1,117 +0,0 @@ -import numpy as np -import pytest - -from pysatl_cpd.core.problem import CpdProblem -from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider -from pysatl_cpd.icpd_solver import CpdLocalizationResults -from pysatl_cpd.labeled_data import LabeledCpdData -from pysatl_cpd.online_cpd_solver import OnlineCpdSolver -from tests.test_core.test_algorithms.test_bayesian_online_algorithm import construct_bayesian_online_algorithm - -DATA_PARAMS = { - "num_tests": 10, - "size": 500, - "change_point": 250, - "tolerable_deviation": 25, -} - - -@pytest.fixture(scope="session") -def data_params(): - return DATA_PARAMS - - -@pytest.fixture -def data_generator(data_params): - def _generate(has_cp, test_iteration): - seed = 42 + test_iteration - np.random.seed(seed) - if has_cp: - return np.concatenate( - [ - np.random.normal(0, 1, data_params["change_point"]), - np.random.normal(5, 2, data_params["size"] - data_params["change_point"]), - ] - ) - return np.random.normal(0, 1, data_params["size"]) - - return _generate - - -@pytest.fixture -def labeled_data_factory(data_params): - def _factory(data, has_cp): - return LabeledCpdData(raw_data=data, change_points=[data_params["change_point"]] if has_cp else None) - - return _factory - - -@pytest.fixture -def solver_factory(): - def _factory(data_input, with_localization): - return OnlineCpdSolver( - algorithm=construct_bayesian_online_algorithm(), - algorithm_input=data_input, - scenario=CpdProblem(with_localization), - ) - - return _factory - - -def pytest_generate_tests(metafunc): - if "test_iteration" in metafunc.fixturenames: - metafunc.parametrize("test_iteration", range(DATA_PARAMS["num_tests"])) - - -class TestOnlineCpdSolver: - @pytest.mark.parametrize( - "has_cp,with_localization,is_labeled", - [ - (False, True, True), - (True, True, True), - (False, True, False), - (True, True, False), - (False, False, True), - (True, False, True), - (False, 
False, False), - (True, False, False), - ], - ) - def test_all_scenarios( - self, - data_generator, - labeled_data_factory, - solver_factory, - has_cp, - with_localization, - is_labeled, - test_iteration, - data_params, - ): - raw_data = data_generator(has_cp, test_iteration) - - data_input = labeled_data_factory(raw_data, has_cp) if is_labeled else ListUnivariateProvider(raw_data.tolist()) - - solver = solver_factory(data_input, with_localization) - result = solver.run() - - if with_localization: - assert isinstance(result, CpdLocalizationResults), "Localization result must be CpdLocalizationResults" - if has_cp: - assert len(result.result) == 1, "There must be only one change point" - assert abs(result.result[0] - data_params["change_point"]) <= data_params["tolerable_deviation"], ( - "Change point must lie in tolerable interval" - ) - if is_labeled: - assert result.expected_result == [data_params["change_point"]], ( - "Labeled change point must be equal to generated one" - ) - else: - assert result.expected_result is None, "Expected result must be None for not labeled data" - else: - assert result.result == [], "There must be no change points" - else: - assert isinstance(result, int), "Detection result must be a number of detected change points" - assert result == (1 if has_cp else 0), ( - "Number of change points must be equal to expected in the generated data" - ) diff --git a/tests/test_solver.py b/tests/test_solver.py index c2502e98..ef6d70f4 100644 --- a/tests/test_solver.py +++ b/tests/test_solver.py @@ -5,12 +5,11 @@ import numpy as np import pytest -from pysatl_cpd.analysis.results_analyzer import CpdResultsAnalyzer from pysatl_cpd.core.algorithms.graph_algorithm import GraphAlgorithm from pysatl_cpd.core.problem import CpdProblem from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider from pysatl_cpd.core.scrubber.linear import LinearScrubber -from pysatl_cpd.cpd_solver import CpdLocalizationResults, CpdSolver, LabeledCpdData +from pysatl_cpd.cpd_solver import CpdLocalizationResults, CpdResultsAnalyzer, CpdSolver, LabeledCpdData def custom_comparison(node1, node2): # TODO: Remove it everywhere From 36433ffc8ddea114055c5d01e3e213461dea721f Mon Sep 17 00:00:00 2001 From: alistkova Date: Tue, 8 Apr 2025 14:04:06 +0300 Subject: [PATCH 22/31] fix(kliep): organize imports --- pysatl_cpd/core/algorithms/kliep_algorithm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py index 4b3a1cd0..e3909267 100644 --- a/pysatl_cpd/core/algorithms/kliep_algorithm.py +++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py @@ -6,11 +6,11 @@ __copyright__ = "Copyright (c) 2025 Aleksandra Listkova" __license__ = "SPDX-License-Identifier: MIT" +from typing import Any, cast + import numpy as np import numpy.typing as npt from scipy.optimize import minimize -from typing import List, Any -from typing import cast from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm @@ -131,7 +131,7 @@ def loss(alpha: npt.NDArray[np.float64]) -> float: ) return cast(npt.NDArray[np.float64], res.x) - def _find_change_points(self, scores: npt.NDArray[np.float64]) -> List[int]: + def _find_change_points(self, scores: npt.NDArray[np.float64]) -> list[int]: """ Identifies change points from computed KLIEP scores. 
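A note on the objective this series keeps reworking: KLIEP fits the density ratio w(x) = p_test(x)/p_ref(x) by maximizing the mean log-ratio over the test segment, with an L2 penalty and non-negative coefficients — hence the recurring bounds=[(0, None)] and method='L-BFGS-B' in the diffs. The sketch below is illustrative only (the helpers log_kde and ratio_loss and the toy data are ours, not part of the patched module), but it shows the shape of that optimization:

    import numpy as np
    from scipy.optimize import minimize

    rng = np.random.default_rng(0)
    ref = rng.normal(0.0, 1.0, 200)    # reference segment
    test = rng.normal(2.0, 1.0, 200)   # test segment with a shifted mean

    def log_kde(points, data, bandwidth=1.0):
        # Gaussian KDE evaluated in log space; the 1e-10 guard against
        # log(0) mirrors the epsilon used in the patched code.
        diff = points[:, None] - data[None, :]
        dens = np.exp(-0.5 * (diff / bandwidth) ** 2).sum(axis=1)
        dens /= len(data) * bandwidth * np.sqrt(2.0 * np.pi)
        return np.log(dens + 1e-10)

    test_logd = log_kde(test, test)   # test density at the test points
    ref_logd = log_kde(test, ref)     # reference density at the test points

    def ratio_loss(alpha, reg=0.1):
        # Negative mean log density ratio plus an L2 penalty -- the same
        # structure as the loss() closures in the diffs above.
        log_ratio = test_logd - ref_logd - alpha[0]
        return -np.mean(log_ratio) + reg * alpha[0] ** 2

    res = minimize(ratio_loss, x0=np.zeros(1), method="L-BFGS-B", bounds=[(0.0, None)])
    print(res.x)  # fitted offset alpha; the non-negativity bound is active here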
From e130d777bbdbd59f9ef4d83c6b7486e2a0952376 Mon Sep 17 00:00:00 2001 From: Aleksandra Listkova <115529517+alistkova@users.noreply.github.com> Date: Sat, 3 May 2025 19:49:47 +0300 Subject: [PATCH 23/31] fix: correct type annotations --- pysatl_cpd/core/algorithms/kliep_algorithm.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py index e3909267..7950b0f6 100644 --- a/pysatl_cpd/core/algorithms/kliep_algorithm.py +++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py @@ -6,11 +6,11 @@ __copyright__ = "Copyright (c) 2025 Aleksandra Listkova" __license__ = "SPDX-License-Identifier: MIT" -from typing import Any, cast +from typing import Any, List, Tuple, Optional, cast import numpy as np import numpy.typing as npt -from scipy.optimize import minimize +from scipy.optimize import minimize # type: ignore[import-untyped] from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm @@ -49,7 +49,7 @@ def detect(self, window: npt.NDArray[np.float64]) -> int: """ return len(self.localize(window)) - def localize(self, window: npt.NDArray[np.float64]) -> list[int]: + def localize(self, window: npt.NDArray[np.float64]) -> List[int]: """ Identifies and returns the locations of change points in the window. @@ -102,36 +102,35 @@ def _compute_kliep_scores(self, window: npt.NDArray[np.float64]) -> npt.NDArray[ return scores def _optimize_alpha( - self, - test_density: npt.NDArray[np.float64], - ref_density: npt.NDArray[np.float64] + self, + test_density: npt.NDArray[np.float64], + ref_density: npt.NDArray[np.float64] ) -> npt.NDArray[np.float64]: """ Optimizes the alpha parameters for KLIEP density ratio estimation. - - :param test_density: density estimates for the test window. - :param ref_density: density estimates for the reference window. - :return: optimized alpha parameters. """ - def loss(alpha: npt.NDArray[np.float64]) -> float: - """Objective function for KLIEP optimization.""" + def loss(alpha: npt.NDArray[np.float64]) -> np.float64: ratio = np.exp(test_density - ref_density - alpha) - loss_val = -np.mean(np.log(ratio)) + self.regularisation * np.sum(alpha**2) - return float(loss_val) + return np.float64(-np.mean(np.log(ratio)) + self.regularisation * np.sum(alpha**2)) - initial_alpha = np.zeros_like(test_density, dtype=np.float64) - bounds = [(0, None)] * len(test_density) + initial_alpha: npt.NDArray[np.float64] = np.zeros_like(test_density).flatten() + bounds: List[Tuple[float, Optional[float]]] = [(0.0, None) for _ in test_density.flatten()] + + def wrapped_loss(alpha_flat: npt.NDArray[np.float64]) -> float: + alpha = alpha_flat.reshape(test_density.shape) + return float(loss(alpha)) res = minimize( - loss, + wrapped_loss, initial_alpha, method='L-BFGS-B', options={'maxiter': self.max_iter}, bounds=bounds ) - return cast(npt.NDArray[np.float64], res.x) + + return cast(npt.NDArray[np.float64], res.x.reshape(test_density.shape)) - def _find_change_points(self, scores: npt.NDArray[np.float64]) -> list[int]: + def _find_change_points(self, scores: npt.NDArray[np.float64]) -> List[int]: """ Identifies change points from computed KLIEP scores. 
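Most of the churn in the hunk above is about satisfying a strict type checker rather than changing behaviour: scipy ships no type stubs, so the result of minimize(...) exposes .x as Any to mypy and it has to be cast back to the annotated array type. A minimal, self-contained illustration of that pattern (the function argmin_quadratic is ours):

    from typing import cast

    import numpy as np
    import numpy.typing as npt
    from scipy.optimize import minimize  # untyped: res.x is Any to mypy

    def argmin_quadratic(target: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
        # A trivial objective whose minimizer is `target`; the point here is
        # the typing pattern, not the optimization problem itself.
        def loss(x: npt.NDArray[np.float64]) -> float:
            return float(np.sum((x - target) ** 2))

        res = minimize(loss, x0=np.zeros_like(target), method="L-BFGS-B")
        return cast(npt.NDArray[np.float64], res.x)

    print(argmin_quadratic(np.array([1.0, -2.0, 3.0])))  # ~ [ 1. -2.  3.]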
@@ -142,9 +141,10 @@ def _find_change_points(self, scores: npt.NDArray[np.float64]) -> list[int]: if len(candidates) == 0: return [] - change_points = [candidates[0]] - for points in candidates[1:]: - if points - change_points[-1] > self.min_window_size: - change_points.append(points) + change_points = [int(candidates[0])] + for point in candidates[1:]: + if point - change_points[-1] > self.min_window_size: + change_points.append(int(point)) return change_points + From c8e94052df82b02f05c7538ac27152d93db00ba5 Mon Sep 17 00:00:00 2001 From: Aleksandra Listkova <115529517+alistkova@users.noreply.github.com> Date: Sat, 3 May 2025 20:02:25 +0300 Subject: [PATCH 24/31] style: modernize type annotations and clean imports --- pysatl_cpd/core/algorithms/kliep_algorithm.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py index 7950b0f6..f98fa541 100644 --- a/pysatl_cpd/core/algorithms/kliep_algorithm.py +++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py @@ -6,8 +6,7 @@ __copyright__ = "Copyright (c) 2025 Aleksandra Listkova" __license__ = "SPDX-License-Identifier: MIT" -from typing import Any, List, Tuple, Optional, cast - +from typing import Any, Optional, cast import numpy as np import numpy.typing as npt from scipy.optimize import minimize # type: ignore[import-untyped] @@ -49,7 +48,7 @@ def detect(self, window: npt.NDArray[np.float64]) -> int: """ return len(self.localize(window)) - def localize(self, window: npt.NDArray[np.float64]) -> List[int]: + def localize(self, window: npt.NDArray[np.float64]) -> list[int]: """ Identifies and returns the locations of change points in the window. @@ -102,9 +101,9 @@ def _compute_kliep_scores(self, window: npt.NDArray[np.float64]) -> npt.NDArray[ return scores def _optimize_alpha( - self, - test_density: npt.NDArray[np.float64], - ref_density: npt.NDArray[np.float64] + self, + test_density: npt.NDArray[np.float64], + ref_density: npt.NDArray[np.float64] ) -> npt.NDArray[np.float64]: """ Optimizes the alpha parameters for KLIEP density ratio estimation. @@ -114,8 +113,8 @@ def loss(alpha: npt.NDArray[np.float64]) -> np.float64: return np.float64(-np.mean(np.log(ratio)) + self.regularisation * np.sum(alpha**2)) initial_alpha: npt.NDArray[np.float64] = np.zeros_like(test_density).flatten() - bounds: List[Tuple[float, Optional[float]]] = [(0.0, None) for _ in test_density.flatten()] - + bounds: list[tuple[float, Optional[float]]] = [(0.0, None) for _ in test_density.flatten()] + def wrapped_loss(alpha_flat: npt.NDArray[np.float64]) -> float: alpha = alpha_flat.reshape(test_density.shape) return float(loss(alpha)) @@ -127,10 +126,10 @@ def wrapped_loss(alpha_flat: npt.NDArray[np.float64]) -> float: options={'maxiter': self.max_iter}, bounds=bounds ) - + return cast(npt.NDArray[np.float64], res.x.reshape(test_density.shape)) - def _find_change_points(self, scores: npt.NDArray[np.float64]) -> List[int]: + def _find_change_points(self, scores: npt.NDArray[np.float64]) -> list[int]: """ Identifies change points from computed KLIEP scores. 
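Patch 24 is essentially the PEP 585 cleanup: on Python 3.9+ the builtin generics list and tuple replace typing.List and typing.Tuple, leaving only Any, Optional, and cast to import from typing. A small before/after sketch of the convention the series settles on (the helper bounds_for is ours, for illustration):

    from typing import Optional

    import numpy as np
    import numpy.typing as npt

    # Before (pre-PEP 585):
    #   def bounds_for(d: npt.NDArray[np.float64]) -> List[Tuple[float, Optional[float]]]: ...
    def bounds_for(density: npt.NDArray[np.float64]) -> list[tuple[float, Optional[float]]]:
        # One (lower, upper) pair per coefficient; None means unbounded above,
        # matching the L-BFGS-B bounds constructed in the diffs above.
        return [(0.0, None) for _ in density.flatten()]

    print(bounds_for(np.zeros(3)))  # [(0.0, None), (0.0, None), (0.0, None)]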
From fecf34fb2afa96886b86a64e1cd0a127c93f9c9c Mon Sep 17 00:00:00 2001 From: Aleksandra Listkova <115529517+alistkova@users.noreply.github.com> Date: Sat, 3 May 2025 20:14:49 +0300 Subject: [PATCH 25/31] style: organize imports --- pysatl_cpd/core/algorithms/kliep_algorithm.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py index f98fa541..0735348a 100644 --- a/pysatl_cpd/core/algorithms/kliep_algorithm.py +++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py @@ -7,12 +7,14 @@ __license__ = "SPDX-License-Identifier: MIT" from typing import Any, Optional, cast + import numpy as np import numpy.typing as npt from scipy.optimize import minimize # type: ignore[import-untyped] from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm -from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm +from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import \ + DensityBasedAlgorithm class KliepAlgorithm(Algorithm): @@ -114,7 +116,7 @@ def loss(alpha: npt.NDArray[np.float64]) -> np.float64: initial_alpha: npt.NDArray[np.float64] = np.zeros_like(test_density).flatten() bounds: list[tuple[float, Optional[float]]] = [(0.0, None) for _ in test_density.flatten()] - + def wrapped_loss(alpha_flat: npt.NDArray[np.float64]) -> float: alpha = alpha_flat.reshape(test_density.shape) return float(loss(alpha)) @@ -126,7 +128,7 @@ def wrapped_loss(alpha_flat: npt.NDArray[np.float64]) -> float: options={'maxiter': self.max_iter}, bounds=bounds ) - + return cast(npt.NDArray[np.float64], res.x.reshape(test_density.shape)) def _find_change_points(self, scores: npt.NDArray[np.float64]) -> list[int]: @@ -146,4 +148,3 @@ def _find_change_points(self, scores: npt.NDArray[np.float64]) -> list[int]: change_points.append(int(point)) return change_points - From b8fd7f50e736a2c87cb60746b561fa5f7b31ec53 Mon Sep 17 00:00:00 2001 From: Aleksandra Listkova <115529517+alistkova@users.noreply.github.com> Date: Sat, 3 May 2025 20:31:31 +0300 Subject: [PATCH 26/31] style: organize imports --- pysatl_cpd/core/algorithms/kliep_algorithm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py index 0735348a..e53aee38 100644 --- a/pysatl_cpd/core/algorithms/kliep_algorithm.py +++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py @@ -13,8 +13,7 @@ from scipy.optimize import minimize # type: ignore[import-untyped] from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm -from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import \ - DensityBasedAlgorithm +from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm class KliepAlgorithm(Algorithm): From 9d18ad1e565120a70ad68a714b96566f5e67eb1c Mon Sep 17 00:00:00 2001 From: Aleksandra Listkova <115529517+alistkova@users.noreply.github.com> Date: Thu, 29 May 2025 22:18:14 +0300 Subject: [PATCH 27/31] fix: np type error --- pysatl_cpd/core/algorithms/kliep_algorithm.py | 162 +++++++----------- 1 file changed, 65 insertions(+), 97 deletions(-) diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py index e53aee38..b0f8e639 100644 --- a/pysatl_cpd/core/algorithms/kliep_algorithm.py +++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py @@ -6,144 +6,112 @@ __copyright__ = 
"Copyright (c) 2025 Aleksandra Listkova" __license__ = "SPDX-License-Identifier: MIT" -from typing import Any, Optional, cast - import numpy as np import numpy.typing as npt -from scipy.optimize import minimize # type: ignore[import-untyped] +from scipy.optimize import minimize # type: ignore -from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm -class KliepAlgorithm(Algorithm): +class KliepAlgorithm(DensityBasedAlgorithm): def __init__( self, bandwidth: float = 1.0, regularization: float = 0.1, threshold: float = 1.1, max_iter: int = 100, - min_window_size: int = 10 + min_window_size: int = 10, ) -> None: """ - Initializes a new instance of KLIEP based change point detection algorithm. + Initializes a new instance of KLIEP-based change point detection algorithm. - :param bandwidth: the bandwidth parameter for the kernel density estimation. - :param regularization: L2 regularization coefficient for the KLIEP optimization. - :param threshold: detection threshold for significant change points. - :param max_iter: maximum number of iterations for the L-BFGS-B optimizer. - :param min_window_size: minimum size of data segments to consider. + :param bandwidth: kernel bandwidth for density estimation. + :param regularization: regularization coefficient for alpha optimization. + :param threshold: threshold for change point detection. + :param max_iter: maximum iterations for optimization solver. + :param min_window_size: minimum segment size for reliable estimation. """ - self.bandwidth = bandwidth - self.regularisation = regularization - self.threshold = threshold + super().__init__(min_window_size, threshold, bandwidth) + self.regularization = regularization self.max_iter = max_iter - self.min_window_size = min_window_size - - def detect(self, window: npt.NDArray[np.float64]) -> int: - """ - Finds change points in the given window. - - :param window: input data window for change point detection. - :return: number of detected change points in the window. - """ - return len(self.localize(window)) - - def localize(self, window: npt.NDArray[np.float64]) -> list[int]: - """ - Identifies and returns the locations of change points in the window. - - :param window: input data window for change point localization. - :return: list of indices where change points were detected. - """ - window = self._validate_window(window) - if len(window) < self.min_window_size: - return [] - - scores = self._compute_kliep_scores(window) - return self._find_change_points(scores) - - def _validate_window(self, window: npt.NDArray[Any]) -> npt.NDArray[np.float64]: - """ - Validates and prepares the input window for processing. - - :param window: input data window. - :return: validated window in 2D format. - """ - window = np.asarray(window, dtype=np.float64) - if window.ndim == 1: - window = window.reshape(-1, 1) - return window - def _compute_kliep_scores(self, window: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: + def _compute_scores(self, window: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: """ - Computes KLIEP anomaly scores for each point in the window. + Computes KLIEP-based change point scores for each position in the window. - :param window: validated input data window. - :return: array of KLIEP scores for each point. + :param window: input data window (1D array). + :return: array of change point scores at each index. 
""" n_points = window.shape[0] - scores = np.zeros(n_points, dtype=np.float64) + scores: npt.NDArray[np.float64] = np.zeros(n_points, dtype=np.float64) + common_grid = self._build_common_grid(window) for i in range(self.min_window_size, n_points - self.min_window_size): before = window[:i] after = window[i:] - before_density = DensityBasedAlgorithm._kernel_density_estimation( - before, self.bandwidth - ) - after_density = DensityBasedAlgorithm._kernel_density_estimation( - after, self.bandwidth - ) + before_density = self._kde_on_grid(before, self.bandwidth, common_grid) + after_density = self._kde_on_grid(after, self.bandwidth, common_grid) alpha = self._optimize_alpha(after_density, before_density) - scores[i] = np.mean(np.exp(after_density - before_density - alpha)) - + scores[i] = np.mean(np.log(after_density + 1e-10)) - np.mean(np.log(before_density + 1e-10)) - alpha return scores def _optimize_alpha( - self, - test_density: npt.NDArray[np.float64], - ref_density: npt.NDArray[np.float64] - ) -> npt.NDArray[np.float64]: - """ - Optimizes the alpha parameters for KLIEP density ratio estimation. + self, + test_density: npt.NDArray[np.float64], + ref_density: npt.NDArray[np.float64] + ) -> float: """ - def loss(alpha: npt.NDArray[np.float64]) -> np.float64: - ratio = np.exp(test_density - ref_density - alpha) - return np.float64(-np.mean(np.log(ratio)) + self.regularisation * np.sum(alpha**2)) - - initial_alpha: npt.NDArray[np.float64] = np.zeros_like(test_density).flatten() - bounds: list[tuple[float, Optional[float]]] = [(0.0, None) for _ in test_density.flatten()] + Optimizes alpha parameter for density ratio estimation. - def wrapped_loss(alpha_flat: npt.NDArray[np.float64]) -> float: - alpha = alpha_flat.reshape(test_density.shape) - return float(loss(alpha)) + :param test_density: KDE values for test segment (after potential CP). + :param ref_density: KDE values for reference segment (before potential CP). + :return: optimal alpha value for density ratio adjustment. + """ + def loss(alpha: float) -> float: + ratio = np.exp(np.log(test_density) - np.log(ref_density + 1e-10) - alpha) + return float(-np.mean(np.log(ratio + 1e-10)) + self.regularization * alpha**2) res = minimize( - wrapped_loss, - initial_alpha, + loss, + x0=0.0, method='L-BFGS-B', - options={'maxiter': self.max_iter}, - bounds=bounds + bounds=[(0.0, None)], + options={'maxiter': self.max_iter} ) + return float(res.x[0]) - return cast(npt.NDArray[np.float64], res.x.reshape(test_density.shape)) - - def _find_change_points(self, scores: npt.NDArray[np.float64]) -> list[int]: + def _build_common_grid(self, window: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: """ - Identifies change points from computed KLIEP scores. + Creates evaluation grid for density estimation. - :param scores: array of KLIEP scores for each point. - :return: list of detected change point indices. + :param window: input data window. + :return: grid spanning data range with bandwidth-adjusted margins. 
""" - candidates = np.where(scores > self.threshold)[0] - if len(candidates) == 0: - return [] + return np.linspace( + np.min(window) - 3 * self.bandwidth, + np.max(window) + 3 * self.bandwidth, + 1000, + dtype=np.float64 + ) - change_points = [int(candidates[0])] - for point in candidates[1:]: - if point - change_points[-1] > self.min_window_size: - change_points.append(int(point)) + def _kde_on_grid( + self, + observation: npt.NDArray[np.float64], + bandwidth: float, + grid: npt.NDArray[np.float64] + ) -> npt.NDArray[np.float64]: + """ + Computes kernel density estimate on specified grid. - return change_points + :param observation: data points for KDE. + :param bandwidth: kernel bandwidth parameter. + :param grid: evaluation grid points. + :return: density values at grid points. + """ + n = observation.shape[0] + diff = grid[:, np.newaxis] - observation + kernel_vals = np.exp(-0.5 * (diff / bandwidth) ** 2) + kde_vals = kernel_vals.sum(axis=1) + return np.asarray(kde_vals / (n * bandwidth * np.sqrt(2 * np.pi)), dtype=np.float64) From 469d94f35243d531f5b3f4238cad4b71d69f6fed Mon Sep 17 00:00:00 2001 From: Aleksandra Listkova <115529517+alistkova@users.noreply.github.com> Date: Thu, 29 May 2025 22:26:53 +0300 Subject: [PATCH 28/31] fix: upd density based --- .../abstracts/density_based_algorithm.py | 189 +++++++++++------- 1 file changed, 116 insertions(+), 73 deletions(-) diff --git a/pysatl_cpd/core/algorithms/density/abstracts/density_based_algorithm.py b/pysatl_cpd/core/algorithms/density/abstracts/density_based_algorithm.py index 0cea21bb..f2412d2e 100644 --- a/pysatl_cpd/core/algorithms/density/abstracts/density_based_algorithm.py +++ b/pysatl_cpd/core/algorithms/density/abstracts/density_based_algorithm.py @@ -1,112 +1,155 @@ from abc import abstractmethod -from collections.abc import Callable -from typing import TypeAlias +from typing import Any, Callable, TypeAlias import numpy as np import numpy.typing as npt -from scipy.optimize import minimize from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm _TObjFunc: TypeAlias = Callable[[npt.NDArray[np.float64], npt.NDArray[np.float64]], float] _TMetrics: TypeAlias = dict[str, int | float] - class DensityBasedAlgorithm(Algorithm): - @staticmethod - def _kernel_density_estimation(observation: npt.NDArray[np.float64], bandwidth: float) -> npt.NDArray[np.float64]: - """Perform kernel density estimation on the given observations without fitting a model. + """Abstract base class for density-based change point detection algorithms. - :param observation: the data points for which to estimate the density. - :param bandwidth: the bandwidth parameter for the kernel density estimation. + Provides common infrastructure for methods that detect change points by + analyzing probability density changes in data segments. + """ + def __init__( + self, + min_window_size: int = 10, + threshold: float = 1.1, + bandwidth: float = 1.0 + ) -> None: + """ + Initializes density-based change point detector. - :return: estimated density values for the observations. 
+ :param min_window_size: minimum data points required in each segment + :param threshold: detection sensitivity (higher = fewer detections) + :param bandwidth:kernel bandwidth for density estimation """ - n = len(observation) - x_grid = np.linspace(np.min(observation) - 3 * bandwidth, np.max(observation) + 3 * bandwidth, 1000) - kde_values = np.zeros_like(x_grid) - for x in observation: - kde_values += np.exp(-0.5 * ((x_grid - x) / bandwidth) ** 2) + self.min_window_size = min_window_size + self.threshold = threshold + self.bandwidth = bandwidth - kde_values /= n * bandwidth * np.sqrt(2 * np.pi) - return kde_values + def detect(self, window: npt.NDArray[np.float64]) -> int: + """Counts change points in the given data window. - def _calculate_weights( - self, - test_value: npt.NDArray[np.float64], - reference_value: npt.NDArray[np.float64], - bandwidth: float, - objective_function: _TObjFunc, - ) -> npt.NDArray[np.float64]: - """Calculate the weights based on the density ratio between test and reference values. + :param window: input data array (1D or 2D) + :return: number of detected change points + """ + return len(self.localize(window)) - :param test_value: the test data points. - :param reference_value: the reference data points. - :param bandwidth: the bandwidth parameter for the kernel density estimation. - :param objective_function: the objective function to minimize. + def localize(self, window: npt.NDArray[np.float64]) -> list[int]: + """Identifies positions of change points in the data window. - :return: the calculated density ratios normalized to their mean. + :param window: input data array (1D or 2D) + :return: list of change point indices """ - test_density = self._kernel_density_estimation(test_value, bandwidth) - reference_density = self._kernel_density_estimation(reference_value, bandwidth) + window = self._validate_window(window) + if not self._is_window_valid(window): + return [] + scores = self._compute_scores(window) + return self._find_change_points(scores) - def objective_function_wrapper(alpha: npt.NDArray[np.float64], /) -> float: - """Wrapper for the objective function to calculate the density ratio. + def _validate_window(self, window: npt.NDArray[Any]) -> npt.NDArray[np.float64]: + """Ensures input window meets processing requirements. - :param alpha: relative parameter that controls the weighting between the numerator distribution - and the denominator distribution in the density ratio estimation. + :param window: raw input data + :return: validated 2D float64 array + """ + window_arr = np.asarray(window, dtype=np.float64) + if window_arr.ndim == 1: + window_arr = window_arr.reshape(-1, 1).astype(np.float64) + return np.array(window_arr, dtype=np.float64) - :return: the value of the objective function to minimize. - """ - objective_density_ratio = np.exp(test_density - reference_density - alpha) - return objective_function(objective_density_ratio, alpha) + def _find_change_points(self, scores: npt.NDArray[np.float64]) -> list[int]: + """Filters candidate points using threshold and minimum separation. 
- res = minimize(objective_function_wrapper, np.zeros(len(test_value)), method="L-BFGS-B") - optimized_alpha: npt.NDArray[np.float64] = res.x - density_ratio: npt.NDArray[np.float64] = np.exp(test_density - reference_density - optimized_alpha) - return density_ratio / np.mean(density_ratio) + :param scores: change point scores for each position + :return: filtered list of change point indices + """ + candidates = np.where(scores > self.threshold)[0] + if not candidates.size: + return [] + change_points = [int(candidates[0])] + for point in candidates[1:]: + if point - change_points[-1] > self.min_window_size: + change_points.append(int(point)) + return change_points + + def _is_window_valid(self, window: npt.NDArray[np.float64]) -> bool: + """Verifies window meets minimum size requirements. + + :param window: input data window + :return: True if window can be processed, else False + """ + return len(window) >= 2 * self.min_window_size - @abstractmethod - def detect(self, window: npt.NDArray[np.float64]) -> int: - # maybe rtype tuple[int] - """Function for finding change points in window + @staticmethod + def _kernel_density_estimation( + observation: npt.NDArray[np.float64], + bandwidth: float + ) -> npt.NDArray[np.float64]: + """Computes kernel density estimate using Gaussian kernels. - :param window: part of global data for finding change points - :return: list of right borders of window change points + :param observation: data points for density estimation + :param bandwidth: smoothing parameter for KDE + :return: density values at evaluation points """ - raise NotImplementedError + n = observation.shape[0] + x_grid: npt.NDArray[np.float64] = np.linspace( + np.min(observation) - 3*bandwidth, + np.max(observation) + 3*bandwidth, + 1000, + dtype=np.float64 + ) + + diff = x_grid[:, np.newaxis] - observation + kernel_vals = np.exp(-0.5 * (diff / bandwidth) ** 2) + kde_vals = kernel_vals.sum(axis=1) + + return np.asarray(kde_vals / (n * bandwidth * np.sqrt(2*np.pi)), dtype=np.float64) @abstractmethod - def localize(self, window: npt.NDArray[np.float64]) -> list[int]: - """Function for finding coordinates of change points in window + def _compute_scores(self, window: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: + """Computes change point scores. - :param window: part of global data for finding change points - :return: list of window change points + :param window: validated input data + :return: array of change point scores """ raise NotImplementedError @staticmethod - def evaluate_detection_accuracy(true_change_points: list[int], detected_change_points: list[int]) -> _TMetrics: - """Evaluate the accuracy of change point detection. - - :param true_change_points: list of true change point indices. - :param detected_change_points: list of detected change point indices. - - :return: a dictionary with evaluation metrics (precision, recall, F1 score). + def evaluate_detection_accuracy( + true_change_points: list[int], + detected_change_points: list[int] + ) -> _TMetrics: + """Computes detection performance metrics. 
+ + :param true_change_points: ground truth change points + :param detected_change_points: algorithm-detected change points + :return: dictionary containing precision, recall, F1, and error counts """ - true_positive = len(set(true_change_points) & set(detected_change_points)) - false_positive = len(set(detected_change_points) - set(true_change_points)) - false_negative = len(set(true_change_points) - set(detected_change_points)) - - precision = true_positive / (true_positive + false_positive) if true_positive + false_positive > 0 else 0.0 - recall = true_positive / (true_positive + false_negative) if true_positive + false_negative > 0 else 0.0 - f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0 + true_positives = len(set(true_change_points) & set(detected_change_points)) + false_positives = len(set(detected_change_points) - set(true_change_points)) + false_negatives = len(set(true_change_points) - set(detected_change_points)) + + precision = ( + true_positives / (true_positives + false_positives) + if (true_positives + false_positives) > 0 else 0.0 + ) + recall = ( + true_positives / (true_positives + false_negatives) + if (true_positives + false_negatives) > 0 else 0.0 + ) + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 return { "precision": precision, "recall": recall, - "f1_score": f1_score, - "true_positive": true_positive, - "false_positive": false_positive, - "false_negative": false_negative, + "f1_score": f1, + "true_positive": true_positives, + "false_positive": false_positives, + "false_negative": false_negatives, } From 097fcf9dbcb19a38e43d08cec558a928b2be502a Mon Sep 17 00:00:00 2001 From: Aleksandra Listkova <115529517+alistkova@users.noreply.github.com> Date: Thu, 29 May 2025 22:32:27 +0300 Subject: [PATCH 29/31] feat: rewrite rulsif algorithm implementation --- .../core/algorithms/rulsif_algorithm.py | 115 ++++++++---------- 1 file changed, 50 insertions(+), 65 deletions(-) diff --git a/pysatl_cpd/core/algorithms/rulsif_algorithm.py b/pysatl_cpd/core/algorithms/rulsif_algorithm.py index f0f04da8..1a808b2e 100644 --- a/pysatl_cpd/core/algorithms/rulsif_algorithm.py +++ b/pysatl_cpd/core/algorithms/rulsif_algorithm.py @@ -1,79 +1,64 @@ -from typing import cast +""" +Module for implementation of CPD algorithm using RULSIF-based divergence estimation. +""" + +__author__ = "Aleksandra Listkova" +__copyright__ = "Copyright (c) 2025 Aleksandra Listkova" +__license__ = "SPDX-License-Identifier: MIT" import numpy as np import numpy.typing as npt +from scipy.linalg import solve from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm class RulsifAlgorithm(DensityBasedAlgorithm): - """Relative Unconstrained Least-Squares Importance Fitting (RULSIF) - algorithm for change point detection. - - RULSIF estimates the density ratio between two distributions and uses - the importance weights for detecting changes in the data distribution. - """ - - def __init__(self, bandwidth: float, regularization_coef: float, threshold: float = 1.1): - """Initialize the RULSIF algorithm. - - Args: - bandwidth (float): bandwidth parameter for density estimation. - regularization_coef (float): regularization parameter. - threshold (float, optional): threshold for detecting change points. - Defaults to 1.1. 
+ def __init__( + self, + alpha: float = 0.1, + bandwidth: float = 1.0, + lambda_reg: float = 0.1, + threshold: float = 1.1, + min_window_size: int = 10, + ) -> None: """ - self.bandwidth = bandwidth - self.regularization_coef = regularization_coef - self.threshold = threshold - - def _loss_function(self, density_ratio: npt.NDArray[np.float64], alpha: npt.NDArray[np.float64]) -> float: - """Loss function for RULSIF. - - Args: - density_ratio (np.ndarray): estimated density ratio. - alpha (np.ndarray): coefficients for the density ratio. - - Returns: - float: the computed loss value. + Initializes RULSIF-based change point detector. + + :param alpha: mixture coefficient (0-1) for reference/test densities + :param bandwidth: kernel bandwidth for density estimation + :param lambda_reg: L2 regularization strength + :param threshold: detection sensitivity threshold + :param min_window_size: minimum segment size requirement + :raises ValueError: if alpha is not in (0,1) """ - return np.mean((density_ratio - 1) ** 2) + self.regularization_coef * np.sum(alpha**2) + super().__init__(min_window_size, threshold, bandwidth) + if not 0 < alpha < 1: + raise ValueError("Alpha must be between 0 and 1") + self.alpha = alpha + self.lambda_reg = lambda_reg - def detect(self, window: npt.NDArray[np.float64]) -> int: - """Detect the number of change points in the given data window - using RULSIF. - - Args: - window (Iterable[float]): the data window to detect change points. - - Returns: - int: the number of detected change points. + def _compute_scores(self, window: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: """ - window_sample = np.array(window) - weights = self._calculate_weights( - test_value=window_sample, - reference_value=window_sample, - bandwidth=self.bandwidth, - objective_function=self._loss_function, - ) - - return np.count_nonzero(weights > self.threshold) + Computes RULSIF-based change point scores for each position. - def localize(self, window: npt.NDArray[np.float64]) -> list[int]: - """Localize the change points in the given data window using RULSIF. - - Args: - window (Iterable[float]): the data window to localize change points. - - Returns: - List[int]: the indices of the detected change points. 
+ :param window: input data window (1D array) + :return: array of divergence scores at each index """ - window_sample = np.array(window) - weights = self._calculate_weights( - test_value=window_sample, - reference_value=window_sample, - bandwidth=self.bandwidth, - objective_function=self._loss_function, - ) - - return cast(list[int], np.where(weights > self.threshold)[0].tolist()) + n_points = window.shape[0] + scores: npt.NDArray[np.float64] = np.zeros(n_points, dtype=np.float64) + for i in range(self.min_window_size, n_points - self.min_window_size): + ref = window[:i] + test = window[i:] + K_ref = self._kernel_density_estimation(ref, self.bandwidth) + K_test = self._kernel_density_estimation(test, self.bandwidth) + H = ( + (1 - self.alpha) * (K_ref @ K_ref.T) / i + + self.alpha * (K_test @ K_test.T) / (n_points - i) + + self.lambda_reg * np.eye(K_ref.shape[0], dtype=np.float64) + ) + h = K_test.mean(axis=1) + theta = solve(H, h, assume_a='pos') + density_ratio = theta @ K_test + scores[i] = np.mean((density_ratio - 1) ** 2) + return scores From 155aabdf88a66905a36733be050dc9f47f6675d3 Mon Sep 17 00:00:00 2001 From: alistkova Date: Thu, 29 May 2025 22:39:02 +0300 Subject: [PATCH 30/31] fix: correct minimize call and remove unused type ignore --- pysatl_cpd/core/algorithms/kliep_algorithm.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py index b0f8e639..dad9e21e 100644 --- a/pysatl_cpd/core/algorithms/kliep_algorithm.py +++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py @@ -69,15 +69,19 @@ def _optimize_alpha( :param ref_density: KDE values for reference segment (before potential CP). :return: optimal alpha value for density ratio adjustment. """ - def loss(alpha: float) -> float: + def loss(alpha_array: npt.NDArray[np.float64]) -> float: + alpha = alpha_array[0] ratio = np.exp(np.log(test_density) - np.log(ref_density + 1e-10) - alpha) return float(-np.mean(np.log(ratio + 1e-10)) + self.regularization * alpha**2) + initial_alpha = np.array([0.0], dtype=np.float64) + bounds = [(0.0, None)] + res = minimize( loss, - x0=0.0, + x0=initial_alpha, method='L-BFGS-B', - bounds=[(0.0, None)], + bounds=bounds, options={'maxiter': self.max_iter} ) return float(res.x[0]) From 67add9b90367b0b80dde1283d7ed5c2b635cac26 Mon Sep 17 00:00:00 2001 From: Aleksandra Listkova <115529517+alistkova@users.noreply.github.com> Date: Thu, 29 May 2025 23:14:39 +0300 Subject: [PATCH 31/31] fix: delete unused type ignore comment --- pysatl_cpd/core/algorithms/kliep_algorithm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pysatl_cpd/core/algorithms/kliep_algorithm.py b/pysatl_cpd/core/algorithms/kliep_algorithm.py index dad9e21e..ab44cb0d 100644 --- a/pysatl_cpd/core/algorithms/kliep_algorithm.py +++ b/pysatl_cpd/core/algorithms/kliep_algorithm.py @@ -8,7 +8,7 @@ import numpy as np import numpy.typing as npt -from scipy.optimize import minimize # type: ignore +from scipy.optimize import minimize from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm