Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ scikit-learn = ">=1.6.1,<2.0.0"
scipy = ">=1.15.2,<2.0.0"
seaborn = ">=0.13.2,<0.14.0"
scikit-fuzzy = "^0.5.0"
kneed = "^0.8.5"
pywavelets = "^1.9.0"
gdown = "^5.2.0"
xgboost = "^3.1.1"

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.4"
Expand Down
37 changes: 37 additions & 0 deletions rework_pysatl_mpest/preprocessing/components_family/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
components_family module for evaluating the family of components of a mixture

This module provides a ready-made model for determining the most likely families of mixture components.
The choice of a mixture component family is important when working with mixtures of distributions.

**Purpose**

The components_family module helps to speed up the search for the most suitable mixture component
configuration by narrowing the search down to a few options.

**Usage Example**

.. code-block:: python

    >>> import numpy as np
    >>> from rework_pysatl_mpest.preprocessing.components_family import ComponentsFamily
    >>> from rework_pysatl_mpest.preprocessing.components_family import XGBBaseModel

    >>> # Create a sample (evenly spaced values, used here only for illustration)
    >>> X = np.linspace(-10, 10, 200)

    >>> # Determine 5 possible configurations using XGBBaseModel
    >>> model = ComponentsFamily(XGBBaseModel, top_k=5)
    >>> configurations = model.predict(X)

    >>> print(f"Best 5 configurations: {configurations}")
    >>> print(f"Best configuration: {configurations[0]}")
"""

__author__ = "Mark Dubrovchenko"
__copyright__ = "Copyright (c) 2025 PySATL project"
__license__ = "SPDX-License-Identifier: MIT"

# Re-export the public entry points of the package.
from rework_pysatl_mpest.preprocessing.components_family.components_family import ComponentsFamily
from rework_pysatl_mpest.preprocessing.components_family.mixture_classifiers import XGBBaseModel

__all__ = ["ComponentsFamily", "XGBBaseModel"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""Module containing the collector that builds a vector of criterion scores for the mixture classifier"""

__author__ = "Mark Dubrovchenko"
__copyright__ = "Copyright (c) 2025 PySATL project"
__license__ = "SPDX-License-Identifier: MIT"

import warnings
from math import ceil

import numpy as np
from rework_pysatl_mpest.preprocessing.components_family.criterions import base_criterions
from rework_pysatl_mpest.preprocessing.components_family.criterions.abstract_criterion import (
AHistClassifierCriterion,
APeaksClassifierCriterion,
ASampleClassifierCriterion,
)
from scipy.stats import iqr


class MixtureClassifierCriterions:
    """
    MixtureClassifierCriterions

    Collects the scores of a list of criterions, evaluated on a single sample,
    into a named feature vector for the mixture classifiers.

    Parameters
    ----------
    :criterions: list[ASampleClassifierCriterion | APeaksClassifierCriterion | AHistClassifierCriterion]

    — List of criterions for the mixture classifiers (``base_criterions`` by default)
    """

    def __init__(
        self,
        criterions: list[
            ASampleClassifierCriterion | APeaksClassifierCriterion | AHistClassifierCriterion
        ] = base_criterions,
    ) -> None:
        # The list is stored by reference, not copied.
        self.criterions = criterions

    @staticmethod
    def _get_hist(X: np.ndarray) -> np.ndarray:
        """
        A function for constructing a histogram with constraints

        The bin width is derived from the interquartile range of the sample and
        the resulting number of bins is clamped to the range [20, 150].

        Parameters
        ----------
        :X: np.ndarray — Sample data

        Returns
        ----------
        np.ndarray

        — Density-normalised histogram heights (the bin edges are discarded)
        """
        n = X.size
        bmin = 20
        bmax = 150

        # NOTE(review): the canonical Freedman-Diaconis rule is h = 2 * IQR * n^(-1/3).
        # The factor 1 is kept on purpose so that the computed features do not change;
        # confirm whether the deviation is intended before switching to 2.
        h = 1 * iqr(X) * n ** (-1 / 3)

        if h > 0:
            # Clamp before rounding so that a vanishing width cannot overflow ``ceil``.
            bins = ceil(min((X.max() - X.min()) / h, bmax))
        else:
            # Degenerate spread (zero or NaN width): fall back to the minimum bin count.
            bins = bmin
        nbins = max(bmin, min(bins, bmax))

        return np.histogram(X, bins=nbins, density=True)[0]

    @staticmethod
    def _get_criterion(
        X: np.ndarray,
        hist: np.ndarray,
        criterion: (ASampleClassifierCriterion | APeaksClassifierCriterion | AHistClassifierCriterion),
    ) -> float:
        """
        Function for obtaining a single criterion based on a sample

        Parameters
        ----------
        :X: np.ndarray — Sample data, passed to sample-based criterions
        :hist: np.ndarray — Histogram of the sample, passed to every other criterion
        :criterion: criterion object exposing ``score``

        Returns
        ----------
        float

        — Value returned by ``criterion.score``
        """
        # Warnings are suppressed only while the criterion is evaluated; the
        # caller's warning filters are restored when the block exits.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            if isinstance(criterion, ASampleClassifierCriterion):
                return criterion.score(X)

            return criterion.score(hist)

    def get_criterions(self, X: np.ndarray) -> dict[str, float]:
        """
        Function for evaluating a feature vector based on a sample

        Parameters
        ----------
        :X: np.ndarray — Sample data

        Returns
        ----------
        dict[str, float]

        — Mapping from ``criterion.name`` to the score of that criterion
        """
        # The histogram is built once and shared by all histogram-based criterions.
        hist = self._get_hist(X)
        return {criterion.name: self._get_criterion(X, hist, criterion) for criterion in self.criterions}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Module which contains interface of the classifier model and supported classifier models"""

__author__ = "Mark Dubrovchenko"
__copyright__ = "Copyright (c) 2025 PySATL project"
__license__ = "SPDX-License-Identifier: MIT"

from rework_pysatl_mpest.preprocessing.components_family.classifier_models.classifier_interface import (
IClassifier,
)
from rework_pysatl_mpest.preprocessing.components_family.classifier_models.classifier_models import (
XGBClassifier,
)

__all__ = ["IClassifier", "XGBClassifier"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Module which contains interface of the classifier model"""

__author__ = "Mark Dubrovchenko"
__copyright__ = "Copyright (c) 2025 PySATL project"
__license__ = "SPDX-License-Identifier: MIT"

from abc import ABC, abstractmethod

import numpy as np


class IClassifier(ABC):
    """Interface that every classification model of this package implements"""

    @abstractmethod
    def load_model(self, model_path: str) -> None:
        """Load the model stored at ``model_path`` (to be implemented by subclasses)"""

    @abstractmethod
    def predict(self, criterions: dict[str, float]) -> np.ndarray:
        """Return the model prediction for a mapping of criterion names to values"""

    @property
    @abstractmethod
    def is_fitted(self) -> bool:
        """Flag telling whether the model has been trained"""
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Module which contains all supported classifier models"""

__author__ = "Mark Dubrovchenko"
__copyright__ = "Copyright (c) 2025 PySATL project"
__license__ = "SPDX-License-Identifier: MIT"

import numpy as np
import xgboost as xgb
from rework_pysatl_mpest.preprocessing.components_family.classifier_models.classifier_interface import (
IClassifier,
)


class XGBClassifier(IClassifier):
    """Classifier backed by an ``xgboost.Booster``"""

    def __init__(self) -> None:
        # Starts unfitted; the flag is only switched on by ``load_model``.
        self._is_fitted: bool = False
        self.model = xgb.Booster()

    @property
    def is_fitted(self) -> bool:
        """Whether ``load_model`` has been called on this instance"""
        return self._is_fitted

    def load_model(self, model_path: str) -> None:
        """Load the booster from ``model_path`` and mark the classifier as fitted"""
        self.model.load_model(model_path)
        self._is_fitted = True

    def predict(self, criterions: dict[str, float]) -> np.ndarray:
        """Run the booster on a single row built from the criterion mapping

        The dictionary keys are used as feature names and the values, in the
        same order, form the only row of the ``DMatrix``.
        """
        names = list(criterions)
        row = list(criterions.values())
        matrix = xgb.DMatrix([row], feature_names=names)

        return self.model.predict(matrix)
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Module containing a method for the initial estimation of the mixture components family based on a mixture classifier"""

__author__ = "Mark Dubrovchenko"
__copyright__ = "Copyright (c) 2025 PySATL project"
__license__ = "SPDX-License-Identifier: MIT"

import numpy as np
from rework_pysatl_mpest.distributions import ContinuousDistribution
from rework_pysatl_mpest.preprocessing.components_family.mixture_classifiers.mixture_classifier import (
MixtureClassifierModel,
)
from rework_pysatl_mpest.preprocessing.components_number.abstract_estimator import AComponentsNumber


class ComponentsFamily:
    """
    ComponentsFamily

    Ranks the mixture configurations known to a mixture classifier model and
    returns the most probable ones for a given sample.

    Parameters
    ----------
    :recognition_model: MixtureClassifierModel — Mixture Classifier Model (stored as ``self.model``)
    :top_k: int — Top k most likely mixtures
    :components_number: AComponentsNumber | None — Method for estimating number of components
    :state: int | None — Seed passed to ``np.random.seed`` before prediction;
        ``None`` leaves the global NumPy random state untouched
    """

    # Largest number of mixture components that is considered.
    _MAX_COMPONENTS = 10

    def __init__(
        self,
        recognition_model: MixtureClassifierModel,
        top_k: int,
        components_number: AComponentsNumber | None = None,
        state: int | None = None,
    ) -> None:
        self.model = recognition_model
        self.top_k = top_k
        self.components_number = components_number
        self.state = state

    def _get_components_n(self, X: np.ndarray, k: None | int | list[int]) -> list[int]:
        """Function that defines the boundaries of the possible number of mixture components"""
        upper_bound = self._MAX_COMPONENTS

        if isinstance(k, int):
            return [k]

        if isinstance(k, list):
            return k

        if isinstance(self.components_number, AComponentsNumber):
            # Allow one component less and one more than the estimate.
            comp_k = self.components_number.estimate(X)
            return [max(comp_k - 1, 1), comp_k, min(comp_k + 1, upper_bound)]

        return list(range(1, upper_bound + 1))

    def predict(self, X: np.ndarray, k: int | list[int] | None = None) -> list[list[ContinuousDistribution]]:
        """
        Function for evaluating the top k most probable configurations

        Parameters
        ----------
        :X: np.ndarray — Sample Data
        :k: int | list[int] | None — The set number of components of the mixture

        k is a specific number, or a list of admissible numbers, or None
        (to determine the number of components using a specified method,
        or if no method is specified, to use the entire range from 1 to 10 components)

        Returns
        ----------
        list[list[ContinuousDistribution]]

        — List of at most ``top_k`` mixture configurations, most probable first,
        using distribution classes for further work with the mixture
        """

        # Seed only on request: seeding with ``None`` would reseed the global
        # generator from entropy and discard a seed set by the caller.
        if self.state is not None:
            np.random.seed(self.state)

        allowed = set(self._get_components_n(X, k))
        # ``np.ravel`` is a no-op for a 1-D score vector.
        prob = np.ravel(self.model.predict(X))
        result: list[list[ContinuousDistribution]] = []

        # Walk the configurations from the highest score to the lowest.
        for i in np.argsort(prob)[::-1]:
            if len(result) == self.top_k:
                break

            components = self.model.transform(i)
            if len(components) in allowed:
                result.append(components)

        return result
Loading