From 9c2d77cdc2aa6994f3ca27224cae087039c7154a Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Thu, 28 Aug 2025 10:48:16 +0200
Subject: [PATCH 01/54] add method for selection based on FDR

---
 src/hidimstat/base_variable_importance.py | 198 +++++++++++++++++++++-
 1 file changed, 189 insertions(+), 9 deletions(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index b4f539024..e4967a727 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -3,6 +3,9 @@
 from sklearn.base import BaseEstimator
 import numpy as np
 
+from hidimstat.statistical_tools.multiple_testing import fdr_threshold
+from hidimstat.statistical_tools.aggregation import quantile_aggregation
+
 
 class BaseVariableImportance(BaseEstimator):
     """
@@ -34,15 +37,22 @@ def __init__(self):
         self.importances_ = None
         self.pvalues_ = None
         self.selections_ = None
+        self.test_scores_ = None
+
+    def _check_importance(self):
+        """
+        Checks if the importance scores and p-values have been computed.
+        """
+        if self.importances_ is None:
+            raise ValueError(
+                "The importances need to be called before calling this method"
+            )
 
     def selection(
         self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
     ):
         """
         Selects features based on variable importance.
-        In case several arguments are different from None,
-        the returned selection is the conjunction of all of them.
-
         Parameters
         ----------
         k_best : int, optional, default=None
@@ -53,7 +63,6 @@
             Selects features with importance scores above the specified threshold.
         threshold_pvalue : float, optional, default=None
             Selects features with p-values below the specified threshold.
-
         Returns
         -------
         selection : array-like of shape (n_features,)
@@ -123,11 +132,182 @@
 
         return self.selections_
 
-    def _check_importance(self):
+    def selection_fdr(
+        self,
+        fdr,
+        fdr_control="bhq",
+        evalues=False,
+        reshaping_function=None,
+        adaptive_aggregation=False,
+        gamma=0.5,
+    ):
         """
-        Checks if the importance scores and p-values have been computed.
+        Performs feature selection based on False Discovery Rate (FDR) control.
+
+        This method selects features by controlling the FDR using either p-values or e-values
+        derived from test scores. It supports different FDR control methods and optional
+        adaptive aggregation of the statistical values.
+
+        Parameters
+        ----------
+        fdr : float, default=None
+            The target false discovery rate level (between 0 and 1)
+        fdr_control: string, default="bhq"
+            The FDR control method to use. Options are:
+            - "bhq": Benjamini-Hochberg procedure
+            - 'bhy': Benjamini-Hochberg-Yekutieli procedure
+            - "ebh": e-BH procedure (only for e-values)
+        evalues: boolean, default=False
+            If True, uses e-values for selection. If False, uses p-values.
+        reshaping_function: callable, default=None
+            Reshaping function for BHY method, default uses sum of reciprocals
+        adaptive_aggregation: boolean, default=False
+            If True, uses adaptive weights for p-value aggregation.
+            Only applicable when evalues=False.
+        gamma: boolean, default=0.5
+            The gamma parameter for quantile aggregation of p-values.
+            Only used when evalues=False. 
+ + Returns + ------- + numpy.ndarray + Boolean array indicating selected features (True for selected, False for not selected) + + Raises + ------ + AssertionError + If test_scores_ is None or if incompatible combinations of parameters are provided """ - if self.importances_ is None: - raise ValueError( - "The importances need to be called before calling this method" + self._check_importance() + assert ( + self.test_scores_ is not None + ), "this method doesn't support selection base on FDR" + + if not evalues: + assert fdr_control != "ebh", "for p-value, the fdr control can't be 'ebh'" + pvalues = np.array( + [ + _empirical_pval(self.test_scores_[i]) + for i in range(len(self.test_scores_)) + ] ) + aggregated_pval = quantile_aggregation( + pvalues, gamma=gamma, adaptive=adaptive_aggregation + ) + threshold = fdr_threshold( + aggregated_pval, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) + selected = aggregated_pval <= threshold + else: + assert fdr_control == "ebh", "for e-value, the fdr control need to be 'ebh'" + ko_threshold = [] + for test_score in self.test_scores_: + ko_threshold.append(_estimated_threshold(test_score, fdr=fdr)) + evalues = np.array( + [ + _empirical_eval(self.test_scores_[i], ko_threshold[i]) + for i in range(len(self.test_scores_)) + ] + ) + aggregated_eval = np.mean(evalues, axis=0) + threshold = fdr_threshold( + aggregated_eval, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) + selected = aggregated_eval >= threshold + return selected + + +def _estimated_threshold(test_score, fdr=0.1): + """ + Calculate the threshold based on the procedure stated in the knockoff article. + Original code: + https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + Vector of test statistic. + fdr : float + Desired controlled FDR (false discovery rate) level. + Returns + ------- + threshold : float or np.inf + Threshold level. + """ + offset = 1 # Offset equals 1 is the knockoff+ procedure. + + threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) + np.concatenate( + [[0], threshold_mesh, [np.inf]] + ) # if there is no solution, the threshold is inf + # find the right value of t for getting a good fdr + # Equation 1.8 of barber2015controlling and 3.10 in Candès 2018 + threshold = 0.0 + for threshold in threshold_mesh: + false_pos = np.sum(test_score <= -threshold) + selected = np.sum(test_score >= threshold) + if (offset + false_pos) / np.maximum(selected, 1) <= fdr: + break + return threshold + + +def _empirical_pval(test_score): + """ + Compute the empirical p-values from the test based on knockoff+. + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + Vector of test statistics. + Returns + ------- + pvals : 1D ndarray, shape (n_features, ) + Vector of empirical p-values. + """ + pvals = [] + n_features = test_score.size + + offset = 1 # Offset equals 1 is the knockoff+ procedure. + + test_score_inv = -test_score + for i in range(n_features): + if test_score[i] <= 0: + pvals.append(1) + else: + pvals.append( + (offset + np.sum(test_score_inv >= test_score[i])) / n_features + ) + + return np.array(pvals) + + +def _empirical_eval(test_score, ko_threshold): + """ + Compute the empirical e-values from the test based on knockoff. + Parameters + ---------- + test_score : 1D ndarray, shape (n_features, ) + Vector of test statistics. + ko_threshold : float + Threshold level. 
+    Returns
+    -------
+    evals : 1D ndarray, shape (n_features, )
+        Vector of empirical e-values.
+    """
+    evals = []
+    n_features = test_score.size
+
+    offset = 1  # Offset equals 1 is the knockoff+ procedure.
+
+    for i in range(n_features):
+        if test_score[i] < ko_threshold:
+            evals.append(0)
+        else:
+            evals.append(n_features / (offset + np.sum(test_score <= -ko_threshold)))
+
+    return np.array(evals)

From 5314c37d15e3e3a607c90bf7c86748b68ea508cf Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Thu, 28 Aug 2025 19:40:17 +0200
Subject: [PATCH 02/54] fix default of the quantile aggregation

---
 src/hidimstat/statistical_tools/aggregation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hidimstat/statistical_tools/aggregation.py b/src/hidimstat/statistical_tools/aggregation.py
index a9a85a4e3..21aa44f3c 100644
--- a/src/hidimstat/statistical_tools/aggregation.py
+++ b/src/hidimstat/statistical_tools/aggregation.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 
-def quantile_aggregation(pvals, gamma=0.05, adaptive=False):
+def quantile_aggregation(pvals, gamma=0.5, adaptive=False):
     """
     Implements the quantile aggregation method for p-values.
 
@@ -15,7 +15,7 @@
     pvals : ndarray of shape (n_sampling*2, n_test)
         Matrix of p-values to aggregate. Each row represents a sampling instance
         and each column a hypothesis test.
-    gamma : float, default=0.05
+    gamma : float, default=0.5
         Quantile level for aggregation. Must be in range (0,1].
     adaptive : bool, default=False
         If True, uses adaptive quantile aggregation which optimizes over multiple gamma values.

From be837e097d208034167475ad9beac7ae1463c3d7 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Thu, 28 Aug 2025 19:41:07 +0200
Subject: [PATCH 03/54] fix selection

---
 src/hidimstat/base_variable_importance.py |  20 +-
 test/test_base_variable_importance.py     | 226 ++++++++++++++++++++++
 2 files changed, 239 insertions(+), 7 deletions(-)
 create mode 100644 test/test_base_variable_importance.py

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index e4967a727..b6e02eaa0 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -70,17 +70,23 @@
         """
         self._check_importance()
         if k_best is not None:
-            if not isinstance(k_best, str) and k_best > self.importances_.shape[1]:
+            if not isinstance(k_best, str) and k_best > self.importances_.shape[0]:
                 warnings.warn(
-                    f"k={k_best} is greater than n_features={self.importances_.shape[1]}. "
+                    f"k={k_best} is greater than n_features={self.importances_.shape[0]}. "
                     "All the features will be returned."
) - assert k_best > 0, "k_best needs to be positive and not null" + if isinstance(k_best, str): + assert k_best == "all" + else: + assert k_best >= 0, "k_best needs to be positive or null" if percentile is not None: assert ( - 0 < percentile and percentile < 100 + 0 <= percentile and percentile <= 100 ), "percentile needs to be between 0 and 100" if threshold_pvalue is not None: + assert ( + self.pvalues_ is not None + ), "This method doesn't support a threshold on p-values" assert ( 0 < threshold_pvalue and threshold_pvalue < 1 ), "threshold_pvalue needs to be between 0 and 1" @@ -105,9 +111,9 @@ def selection( elif percentile == 0: mask_percentile = np.zeros(len(self.importances_), dtype=bool) elif percentile is not None: - threshold = np.percentile(self.importances_, 100 - percentile) - mask_percentile = self.importances_ > threshold - ties = np.where(self.importances_ == threshold)[0] + threshold_percentile = np.percentile(self.importances_, 100 - percentile) + mask_percentile = self.importances_ > threshold_percentile + ties = np.where(self.importances_ == threshold_percentile)[0] if len(ties): max_feats = int(len(self.importances_) * percentile / 100) kept_ties = ties[: max_feats - mask_percentile.sum()] diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py new file mode 100644 index 000000000..e3395c4ea --- /dev/null +++ b/test/test_base_variable_importance.py @@ -0,0 +1,226 @@ +import pytest +import numpy as np + +from hidimstat import BaseVariableImportance + + +@pytest.fixture +def set_BaseVariableImportance(pvalues, test_score, seed): + nb_features = 100 + rng = np.random.RandomState(seed) + vi = BaseVariableImportance() + vi.importances_ = np.arange(nb_features) + rng.shuffle(vi.importances_) + if pvalues or test_score: + vi.pvalues_ = np.sort(rng.rand(nb_features))[vi.importances_] + if test_score: + vi.test_scores_ = [] + for i in range(10): + score = np.random.rand(nb_features) * 30 + vi.test_scores_.append(score) + for i in range(1, 30): + score = np.random.rand(nb_features) + 1 + score[-i:] = np.arange(30 - i, 30) * 2 + score[:i] = -np.arange(30 - i, 30) + vi.test_scores_.append(score[vi.importances_]) + return vi + + +@pytest.mark.parametrize( + "pvalues, test_score, seed", + [(False, False, 0), (True, False, 1), (True, True, 2)], + ids=["only importance", "p-value", "test-score"], +) +class TestSelection: + """Test selection base on importance""" + + def test_selection_k_best(self, set_BaseVariableImportance): + "test selection of the k_best" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 95 + selection = vi.selection(k_best=5) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_k_best_all(self, set_BaseVariableImportance): + "test selection to all base on string" + vi = set_BaseVariableImportance + true_value = np.ones_like(vi.importances_, dtype=bool) + selection = vi.selection(k_best="all") + np.testing.assert_array_equal(true_value, selection) + + def test_selection_k_best_none(self, set_BaseVariableImportance): + "test selection when there none" + vi = set_BaseVariableImportance + true_value = np.zeros_like(vi.importances_, dtype=bool) + selection = vi.selection(k_best=0) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile(self, set_BaseVariableImportance): + "test selection bae on percentile" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 50 + selection = vi.selection(percentile=50) + np.testing.assert_array_equal(true_value, 
selection) + + def test_selection_percentile_all(self, set_BaseVariableImportance): + "test selection when percentile is 100" + vi = set_BaseVariableImportance + true_value = np.ones_like(vi.importances_, dtype=bool) + selection = vi.selection(percentile=100) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile_none(self, set_BaseVariableImportance): + "test selection when percentile is 0" + vi = set_BaseVariableImportance + true_value = np.zeros_like(vi.importances_, dtype=bool) + selection = vi.selection(percentile=0) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_percentile_threshols_value(self, set_BaseVariableImportance): + "test selection when percentile when the percentile equal on value" + vi = set_BaseVariableImportance + mask = np.ones_like(vi.importances_, dtype=bool) + mask[np.where(vi.importances_ == 99)] = False + vi.importances_ = vi.importances_[mask] + true_value = vi.importances_ >= 50 + selection = vi.selection(percentile=50) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_threshold(self, set_BaseVariableImportance): + "test threshold on importance" + vi = set_BaseVariableImportance + true_value = vi.importances_ < 5 + selection = vi.selection(threshold=5) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_threshold_pvalue(self, set_BaseVariableImportance): + "test threshold vbse on pvalues" + vi = set_BaseVariableImportance + if vi.pvalues_ is not None: + true_value = vi.importances_ < 5 + print(vi.pvalues_) + selection = vi.selection( + threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]] + ) + np.testing.assert_array_equal(true_value, selection) + + +@pytest.mark.parametrize( + "pvalues, test_score, seed", [(True, True, 10)], ids=["default"] +) +class TestSelectionFDR: + """Test selection base on fdr""" + + def test_selection_fdr_default(self, set_BaseVariableImportance): + "test selection of the default" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 85 + selection = vi.selection_fdr(0.2) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_fdr_adaptation(self, set_BaseVariableImportance): + "test selection of the adaptation" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 85 + selection = vi.selection_fdr(0.2, adaptive_aggregation=True) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_fdr_bhy(self, set_BaseVariableImportance): + "test selection of the adaptation" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 85 + selection = vi.selection_fdr(0.8, fdr_control="bhy") + np.testing.assert_array_equal(true_value, selection) + + def test_selection_fdr_ebh(self, set_BaseVariableImportance): + "test selection of the adaptation" + vi = set_BaseVariableImportance + true_value = vi.importances_ >= 2 + selection = vi.selection_fdr(0.037, fdr_control="ebh", evalues=True) + np.testing.assert_array_equal(true_value, selection) + + +@pytest.mark.parametrize( + "pvalues, test_score, seed", + [(False, False, 0), (True, False, 0), (True, True, 0)], + ids=["only importance", "p-value", "test-score"], +) +class TestBVIExceptions: + """Test class for BVI Exception""" + + def test_not_fit(self, pvalues, test_score, seed): + "test detection unfit" + vi = BaseVariableImportance() + with pytest.raises( + ValueError, + match="The importances need to be called before calling this method", + ): + vi._check_importance() + with pytest.raises( + 
ValueError, + match="The importances need to be called before calling this method", + ): + vi.selection() + with pytest.raises( + ValueError, + match="The importances need to be called before calling this method", + ): + vi.selection_fdr(0.1) + + def test_selection_k_best(self, set_BaseVariableImportance): + "test selection k_best wrong" + vi = set_BaseVariableImportance + with pytest.raises(AssertionError, match="k_best needs to be positive or null"): + vi.selection(k_best=-10) + with pytest.warns(Warning, match="k=1000 is greater than n_features="): + vi.selection(k_best=1000) + + def test_selection_percentile(self, set_BaseVariableImportance): + "test selection percentile wrong" + vi = set_BaseVariableImportance + with pytest.raises( + AssertionError, match="percentile needs to be between 0 and 100" + ): + vi.selection(percentile=-1) + with pytest.raises( + AssertionError, match="percentile needs to be between 0 and 100" + ): + vi.selection(percentile=102) + + def test_selection_threshold(self, set_BaseVariableImportance): + "test selection threshold wrong" + vi = set_BaseVariableImportance + if vi.pvalues_ is None: + with pytest.raises( + AssertionError, + match="This method doesn't support a threshold on p-values", + ): + vi.selection(threshold_pvalue=-1) + else: + with pytest.raises( + AssertionError, match="threshold_pvalue needs to be between 0 and 1" + ): + vi.selection(threshold_pvalue=-1) + with pytest.raises( + AssertionError, match="threshold_pvalue needs to be between 0 and 1" + ): + vi.selection(threshold_pvalue=1.1) + + def test_selection_fdr_fdr_control(self, set_BaseVariableImportance): + "test selection fdr_control wrong" + vi = set_BaseVariableImportance + if vi.test_scores_ is None: + with pytest.raises( + AssertionError, + match="this method doesn't support selection base on FDR", + ): + vi.selection_fdr(fdr=0.1) + else: + with pytest.raises( + AssertionError, match="for e-value, the fdr control need to be 'ebh'" + ): + vi.selection_fdr(fdr=0.1, evalues=True) + with pytest.raises( + AssertionError, match="for p-value, the fdr control can't be 'ebh'" + ): + vi.selection_fdr(fdr=0.1, fdr_control="ebh", evalues=False) From 1a4259200f2e26c5b4bb9b0028470e8f8a42e417 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 19:46:36 +0200 Subject: [PATCH 04/54] update docstring --- test/test_base_variable_importance.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index e3395c4ea..0d4ff4a81 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -6,6 +6,22 @@ @pytest.fixture def set_BaseVariableImportance(pvalues, test_score, seed): + """Create a BaseVariableImportance instance with test data for testing purposes. + + Parameters + ---------- + pvalues : bool + If True, generate random p-values for testing. + test_score : bool + If True, generate random test scores for testing. + seed : int + Random seed for reproducibility. + + Returns + ------- + BaseVariableImportance + A BaseVariableImportance instance with test data. + """ nb_features = 100 rng = np.random.RandomState(seed) vi = BaseVariableImportance() @@ -14,6 +30,7 @@ def set_BaseVariableImportance(pvalues, test_score, seed): if pvalues or test_score: vi.pvalues_ = np.sort(rng.rand(nb_features))[vi.importances_] if test_score: + # TODO: this can be improved. 
vi.test_scores_ = [] for i in range(10): score = np.random.rand(nb_features) * 30 @@ -32,7 +49,7 @@ def set_BaseVariableImportance(pvalues, test_score, seed): ids=["only importance", "p-value", "test-score"], ) class TestSelection: - """Test selection base on importance""" + """Test selection based on importance""" def test_selection_k_best(self, set_BaseVariableImportance): "test selection of the k_best" @@ -109,7 +126,7 @@ def test_selection_threshold_pvalue(self, set_BaseVariableImportance): "pvalues, test_score, seed", [(True, True, 10)], ids=["default"] ) class TestSelectionFDR: - """Test selection base on fdr""" + """Test selection based on fdr""" def test_selection_fdr_default(self, set_BaseVariableImportance): "test selection of the default" @@ -126,14 +143,14 @@ def test_selection_fdr_adaptation(self, set_BaseVariableImportance): np.testing.assert_array_equal(true_value, selection) def test_selection_fdr_bhy(self, set_BaseVariableImportance): - "test selection of the adaptation" + "test selection with bhy" vi = set_BaseVariableImportance true_value = vi.importances_ >= 85 selection = vi.selection_fdr(0.8, fdr_control="bhy") np.testing.assert_array_equal(true_value, selection) def test_selection_fdr_ebh(self, set_BaseVariableImportance): - "test selection of the adaptation" + "test selection with e-values" vi = set_BaseVariableImportance true_value = vi.importances_ >= 2 selection = vi.selection_fdr(0.037, fdr_control="ebh", evalues=True) From 5854f2e7fdc3dffcb4a305504070a85529f1d082 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 28 Aug 2025 19:58:21 +0200 Subject: [PATCH 05/54] fix docstring --- src/hidimstat/base_variable_importance.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index b6e02eaa0..9e0f4dbb5 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -53,16 +53,18 @@ def selection( ): """ Selects features based on variable importance. + Parameters ---------- - k_best : int, optional, default=None + k_best : int, default=None Selects the top k features based on importance scores. - percentile : float, optional, default=None + percentile : float, default=None Selects features based on a specified percentile of importance scores. - threshold : float, optional, default=None + threshold : float, default=None Selects features with importance scores above the specified threshold. - threshold_pvalue : float, optional, default=None + threshold_pvalue : float, default=None Selects features with p-values below the specified threshold. 
+ Returns ------- selection : array-like of shape (n_features,) @@ -182,7 +184,7 @@ def selection_fdr( Raises ------ AssertionError - If test_scores_ is None or if incompatible combinations of parameters are provided + If test_scores\_ is None or if incompatible combinations of parameters are provided """ self._check_importance() assert ( From 3c08f757398e799e964b4d82f2b3a83e1a71a0f6 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 29 Aug 2025 18:46:54 +0200 Subject: [PATCH 06/54] Add test for 1 test_score --- src/hidimstat/base_variable_importance.py | 40 +++++++++++------------ test/test_base_variable_importance.py | 9 +++++ 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 9e0f4dbb5..911435a8e 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -38,6 +38,9 @@ def __init__(self): self.pvalues_ = None self.selections_ = None self.test_scores_ = None + self.threshold_fdr_ = None + self.aggregated_pval_ = None + self.aggregated_eval_ = None def _check_importance(self): """ @@ -191,43 +194,38 @@ def selection_fdr( self.test_scores_ is not None ), "this method doesn't support selection base on FDR" - if not evalues: + if self.test_scores_.shape[0] == 1: + self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr) + selected = self.test_scores_[0] >= self.threshold_fdr_ + elif not evalues: assert fdr_control != "ebh", "for p-value, the fdr control can't be 'ebh'" pvalues = np.array( - [ - _empirical_pval(self.test_scores_[i]) - for i in range(len(self.test_scores_)) - ] + [_empirical_pval(test_score) for test_score in self.test_scores_] ) - aggregated_pval = quantile_aggregation( + self.aggregated_pval_ = quantile_aggregation( pvalues, gamma=gamma, adaptive=adaptive_aggregation ) - threshold = fdr_threshold( - aggregated_pval, + self.threshold_fdr_ = fdr_threshold( + self.aggregated_pval_, fdr=fdr, method=fdr_control, reshaping_function=reshaping_function, ) - selected = aggregated_pval <= threshold + selected = self.aggregated_pval_ <= self.threshold_fdr_ else: assert fdr_control == "ebh", "for e-value, the fdr control need to be 'ebh'" - ko_threshold = [] + evalues = [] for test_score in self.test_scores_: - ko_threshold.append(_estimated_threshold(test_score, fdr=fdr)) - evalues = np.array( - [ - _empirical_eval(self.test_scores_[i], ko_threshold[i]) - for i in range(len(self.test_scores_)) - ] - ) - aggregated_eval = np.mean(evalues, axis=0) - threshold = fdr_threshold( - aggregated_eval, + ko_threshold = _estimated_threshold(test_score, fdr=fdr) + evalues.append(_empirical_eval(test_score, ko_threshold)) + self.aggregated_eval_ = np.mean(evalues, axis=0) + self.threshold_fdr_ = fdr_threshold( + self.aggregated_eval_, fdr=fdr, method=fdr_control, reshaping_function=reshaping_function, ) - selected = aggregated_eval >= threshold + selected = self.aggregated_eval_ >= self.threshold_fdr_ return selected diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 0d4ff4a81..5f154f91a 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -40,6 +40,7 @@ def set_BaseVariableImportance(pvalues, test_score, seed): score[-i:] = np.arange(30 - i, 30) * 2 score[:i] = -np.arange(30 - i, 30) vi.test_scores_.append(score[vi.importances_]) + vi.test_scores_ = np.array(vi.test_scores_) return vi @@ -135,6 +136,14 @@ def test_selection_fdr_default(self, 
set_BaseVariableImportance): selection = vi.selection_fdr(0.2) np.testing.assert_array_equal(true_value, selection) + def test_selection_fdr_default_1(self, set_BaseVariableImportance): + "test selection of the default" + vi = set_BaseVariableImportance + vi.test_scores_ = np.array([vi.test_scores_[0, :]]) + true_value = vi.importances_ > -1 # all selected + selection = vi.selection_fdr(0.2) + np.testing.assert_array_equal(true_value, selection) + def test_selection_fdr_adaptation(self, set_BaseVariableImportance): "test selection of the adaptation" vi = set_BaseVariableImportance From 7f3a117c00100a08cd86a375125ee502208b8295 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 1 Sep 2025 12:02:38 +0200 Subject: [PATCH 07/54] change the usage of test fdr without aggregation --- src/hidimstat/base_variable_importance.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 911435a8e..cd8d64b2a 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -195,7 +195,12 @@ def selection_fdr( ), "this method doesn't support selection base on FDR" if self.test_scores_.shape[0] == 1: - self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr) + self.threshold_fdr_ = fdr_threshold( + self.pvalues_, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) selected = self.test_scores_[0] >= self.threshold_fdr_ elif not evalues: assert fdr_control != "ebh", "for p-value, the fdr control can't be 'ebh'" From 21250b4725c94dfd6bb4027c575f6c3d1654f4e8 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 1 Sep 2025 18:51:11 +0200 Subject: [PATCH 08/54] remove a print in test --- test/test_base_variable_importance.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 5f154f91a..3f18b78dc 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -116,7 +116,6 @@ def test_selection_threshold_pvalue(self, set_BaseVariableImportance): vi = set_BaseVariableImportance if vi.pvalues_ is not None: true_value = vi.importances_ < 5 - print(vi.pvalues_) selection = vi.selection( threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]] ) From 17d9d9506bd71d604633621fad084794bb822b28 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 2 Sep 2025 11:32:26 +0200 Subject: [PATCH 09/54] Update selection --- src/hidimstat/base_variable_importance.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index cd8d64b2a..e2d699ead 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -194,13 +194,20 @@ def selection_fdr( self.test_scores_ is not None ), "this method doesn't support selection base on FDR" - if self.test_scores_.shape[0] == 1: - self.threshold_fdr_ = fdr_threshold( - self.pvalues_, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) + if self.test_scores_ is None: + if self.pvalues_ is None: + raise ValueError( + "For using a selection with FDR, it require a method which compute at least FDR." 
+ ) + else: + self.threshold_fdr_ = fdr_threshold( + self.pvalues_, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) + elif self.test_scores_.shape[0] == 1: + self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr) selected = self.test_scores_[0] >= self.threshold_fdr_ elif not evalues: assert fdr_control != "ebh", "for p-value, the fdr control can't be 'ebh'" From e8134d8a966ce74b43e549b4e5288b697756af50 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 18:53:29 +0200 Subject: [PATCH 10/54] remove function for knockoff --- src/hidimstat/base_variable_importance.py | 90 ----------------------- 1 file changed, 90 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index e2d699ead..fc5e947ab 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -239,93 +239,3 @@ def selection_fdr( ) selected = self.aggregated_eval_ >= self.threshold_fdr_ return selected - - -def _estimated_threshold(test_score, fdr=0.1): - """ - Calculate the threshold based on the procedure stated in the knockoff article. - Original code: - https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R - Parameters - ---------- - test_score : 1D ndarray, shape (n_features, ) - Vector of test statistic. - fdr : float - Desired controlled FDR (false discovery rate) level. - Returns - ------- - threshold : float or np.inf - Threshold level. - """ - offset = 1 # Offset equals 1 is the knockoff+ procedure. - - threshold_mesh = np.sort(np.abs(test_score[test_score != 0])) - np.concatenate( - [[0], threshold_mesh, [np.inf]] - ) # if there is no solution, the threshold is inf - # find the right value of t for getting a good fdr - # Equation 1.8 of barber2015controlling and 3.10 in Candès 2018 - threshold = 0.0 - for threshold in threshold_mesh: - false_pos = np.sum(test_score <= -threshold) - selected = np.sum(test_score >= threshold) - if (offset + false_pos) / np.maximum(selected, 1) <= fdr: - break - return threshold - - -def _empirical_pval(test_score): - """ - Compute the empirical p-values from the test based on knockoff+. - Parameters - ---------- - test_score : 1D ndarray, shape (n_features, ) - Vector of test statistics. - Returns - ------- - pvals : 1D ndarray, shape (n_features, ) - Vector of empirical p-values. - """ - pvals = [] - n_features = test_score.size - - offset = 1 # Offset equals 1 is the knockoff+ procedure. - - test_score_inv = -test_score - for i in range(n_features): - if test_score[i] <= 0: - pvals.append(1) - else: - pvals.append( - (offset + np.sum(test_score_inv >= test_score[i])) / n_features - ) - - return np.array(pvals) - - -def _empirical_eval(test_score, ko_threshold): - """ - Compute the empirical e-values from the test based on knockoff. - Parameters - ---------- - test_score : 1D ndarray, shape (n_features, ) - Vector of test statistics. - ko_threshold : float - Threshold level. - Returns - ------- - evals : 1D ndarray, shape (n_features, ) - Vector of empirical e-values. - """ - evals = [] - n_features = test_score.size - - offset = 1 # Offset equals 1 is the knockoff+ procedure. 
- - for i in range(n_features): - if test_score[i] < ko_threshold: - evals.append(0) - else: - evals.append(n_features / (offset + np.sum(test_score <= -ko_threshold))) - - return np.array(evals) From 51685e87738ce47740578b1214d58772b11bc20a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 19:14:01 +0200 Subject: [PATCH 11/54] update selection_fdr --- src/hidimstat/base_variable_importance.py | 87 +++++++---------------- 1 file changed, 24 insertions(+), 63 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index fc5e947ab..b56a47950 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -147,7 +147,6 @@ def selection_fdr( self, fdr, fdr_control="bhq", - evalues=False, reshaping_function=None, adaptive_aggregation=False, gamma=0.5, @@ -155,29 +154,24 @@ def selection_fdr( """ Performs feature selection based on False Discovery Rate (FDR) control. - This method selects features by controlling the FDR using either p-values or e-values - derived from test scores. It supports different FDR control methods and optional - adaptive aggregation of the statistical values. + This method selects features by controlling the FDR using either p-values. + It supports different FDR control methods and optional adaptive aggregation + of the statistical values. Parameters ---------- - fdr : float, default=None + fdr : float The target false discovery rate level (between 0 and 1) - fdr_control: string, default="bhq" + fdr_control: str, default="bhq" The FDR control method to use. Options are: - "bhq": Benjamini-Hochberg procedure - 'bhy': Benjamini-Hochberg-Yekutieli procedure - - "ebh": e-BH procedure (only for e-values) - evalues: boolean, default=False - If True, uses e-values for selection. If False, uses p-values. reshaping_function: callable, default=None Reshaping function for BHY method, default uses sum of reciprocals - adaptive_aggregation: boolean, default=False - If True, uses adaptive weights for p-value aggregation. - Only applicable when evalues=False. - gamma: boolean, default=0.5 - The gamma parameter for quantile aggregation of p-values. - Only used when evalues=False. + adaptive_aggregation: bool, default=False + If True, uses adaptive weights for p-value aggregation + gamma: float, default=0.5 + The gamma parameter for quantile aggregation of p-values (between 0 and 1) Returns ------- @@ -187,55 +181,22 @@ def selection_fdr( Raises ------ AssertionError - If test_scores\_ is None or if incompatible combinations of parameters are provided + If list_pvalues_ attribute is missing or fdr_control is invalid """ - self._check_importance() - assert ( - self.test_scores_ is not None + assert hasattr( + self, "list_pvalues_" ), "this method doesn't support selection base on FDR" + self._check_importance() + assert fdr_control == "bhq" and fdr_control == "bhy" - if self.test_scores_ is None: - if self.pvalues_ is None: - raise ValueError( - "For using a selection with FDR, it require a method which compute at least FDR." 
- ) - else: - self.threshold_fdr_ = fdr_threshold( - self.pvalues_, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - elif self.test_scores_.shape[0] == 1: - self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr) - selected = self.test_scores_[0] >= self.threshold_fdr_ - elif not evalues: - assert fdr_control != "ebh", "for p-value, the fdr control can't be 'ebh'" - pvalues = np.array( - [_empirical_pval(test_score) for test_score in self.test_scores_] - ) - self.aggregated_pval_ = quantile_aggregation( - pvalues, gamma=gamma, adaptive=adaptive_aggregation - ) - self.threshold_fdr_ = fdr_threshold( - self.aggregated_pval_, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - selected = self.aggregated_pval_ <= self.threshold_fdr_ - else: - assert fdr_control == "ebh", "for e-value, the fdr control need to be 'ebh'" - evalues = [] - for test_score in self.test_scores_: - ko_threshold = _estimated_threshold(test_score, fdr=fdr) - evalues.append(_empirical_eval(test_score, ko_threshold)) - self.aggregated_eval_ = np.mean(evalues, axis=0) - self.threshold_fdr_ = fdr_threshold( - self.aggregated_eval_, - fdr=fdr, - method=fdr_control, - reshaping_function=reshaping_function, - ) - selected = self.aggregated_eval_ >= self.threshold_fdr_ + aggregated_pval = quantile_aggregation( + np.array(self.list_pvalues_), gamma=gamma, adaptive=adaptive_aggregation + ) + threshold_pval = fdr_threshold( + aggregated_pval, + fdr=fdr, + method=fdr_control, + reshaping_function=reshaping_function, + ) + selected = aggregated_pval <= threshold_pval return selected From 39ec78fef4a7798be004bac95deaf375deef5b87 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 9 Sep 2025 19:38:46 +0200 Subject: [PATCH 12/54] fix selection --- src/hidimstat/base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index b56a47950..3b7882fc3 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -187,7 +187,7 @@ def selection_fdr( self, "list_pvalues_" ), "this method doesn't support selection base on FDR" self._check_importance() - assert fdr_control == "bhq" and fdr_control == "bhy" + assert fdr_control == "bhq" or fdr_control == "bhy" aggregated_pval = quantile_aggregation( np.array(self.list_pvalues_), gamma=gamma, adaptive=adaptive_aggregation From f3ff485aabf19059333a101498861e82ad43303d Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 11:21:06 +0200 Subject: [PATCH 13/54] improve selection --- src/hidimstat/base_variable_importance.py | 19 ++-- test/test_base_variable_importance.py | 103 ++++++++++++---------- 2 files changed, 70 insertions(+), 52 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 3b7882fc3..81667454c 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -183,15 +183,20 @@ def selection_fdr( AssertionError If list_pvalues_ attribute is missing or fdr_control is invalid """ - assert hasattr( - self, "list_pvalues_" - ), "this method doesn't support selection base on FDR" self._check_importance() - assert fdr_control == "bhq" or fdr_control == "bhy" + assert ( + fdr_control == "bhq" or fdr_control == "bhy" + ), "only 'bhq' and 'bhy' are supported" + assert ( + self.pvalues_ is not None + ), "this method doesn't support selection 
base on FDR" - aggregated_pval = quantile_aggregation( - np.array(self.list_pvalues_), gamma=gamma, adaptive=adaptive_aggregation - ) + if hasattr(self, "list_pvalues_"): + aggregated_pval = quantile_aggregation( + np.array(self.list_pvalues_), gamma=gamma, adaptive=adaptive_aggregation + ) + else: + aggregated_pval = self.pvalues_ threshold_pval = fdr_threshold( aggregated_pval, fdr=fdr, diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 3f18b78dc..16449f116 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -4,8 +4,22 @@ from hidimstat import BaseVariableImportance +def generate_list_pvalues_for_fdr(importances, factor=30): + nb_features = importances.shape[0] + result_list = [] + for i in range(10): + score = np.random.rand(nb_features) * factor + result_list.append(score) + for i in range(1, factor): + score = np.random.rand(nb_features) + 1 + score[-i:] = np.arange(factor - i, factor) * 2 + score[:i] = -np.arange(factor - i, factor) + result_list.append(np.flip(score)[importances]) + return np.array(result_list) / np.max(result_list) + + @pytest.fixture -def set_BaseVariableImportance(pvalues, test_score, seed): +def set_BaseVariableImportance(pvalues, list_pvalues, seed): """Create a BaseVariableImportance instance with test data for testing purposes. Parameters @@ -26,28 +40,19 @@ def set_BaseVariableImportance(pvalues, test_score, seed): rng = np.random.RandomState(seed) vi = BaseVariableImportance() vi.importances_ = np.arange(nb_features) - rng.shuffle(vi.importances_) - if pvalues or test_score: - vi.pvalues_ = np.sort(rng.rand(nb_features))[vi.importances_] - if test_score: - # TODO: this can be improved. - vi.test_scores_ = [] - for i in range(10): - score = np.random.rand(nb_features) * 30 - vi.test_scores_.append(score) - for i in range(1, 30): - score = np.random.rand(nb_features) + 1 - score[-i:] = np.arange(30 - i, 30) * 2 - score[:i] = -np.arange(30 - i, 30) - vi.test_scores_.append(score[vi.importances_]) - vi.test_scores_ = np.array(vi.test_scores_) + # rng.shuffle(vi.importances_) + list_pvalues_generated = generate_list_pvalues_for_fdr(vi.importances_) + if pvalues or list_pvalues: + vi.pvalues_ = np.mean(list_pvalues_generated, axis=0) + if list_pvalues: + vi.list_pvalues_ = list_pvalues_generated return vi @pytest.mark.parametrize( - "pvalues, test_score, seed", + "pvalues, list_pvalues, seed", [(False, False, 0), (True, False, 1), (True, True, 2)], - ids=["only importance", "p-value", "test-score"], + ids=["only importance", "p-value", "list_pvalues"], ) class TestSelection: """Test selection based on importance""" @@ -115,7 +120,7 @@ def test_selection_threshold_pvalue(self, set_BaseVariableImportance): "test threshold vbse on pvalues" vi = set_BaseVariableImportance if vi.pvalues_ is not None: - true_value = vi.importances_ < 5 + true_value = vi.importances_ > 5 selection = vi.selection( threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]] ) @@ -123,7 +128,9 @@ def test_selection_threshold_pvalue(self, set_BaseVariableImportance): @pytest.mark.parametrize( - "pvalues, test_score, seed", [(True, True, 10)], ids=["default"] + "pvalues, list_pvalues, seed", + [(True, False, 10), (True, True, 10)], + ids=["pvalue_only", "list_pvalue"], ) class TestSelectionFDR: """Test selection based on fdr""" @@ -131,49 +138,58 @@ class TestSelectionFDR: def test_selection_fdr_default(self, set_BaseVariableImportance): "test selection of the default" vi = set_BaseVariableImportance 
- true_value = vi.importances_ >= 85 selection = vi.selection_fdr(0.2) - np.testing.assert_array_equal(true_value, selection) + assert np.all( + [ + i >= (vi.importances_ - np.sum(selection)) + for i in vi.importances_[selection] + ] + ) def test_selection_fdr_default_1(self, set_BaseVariableImportance): "test selection of the default" vi = set_BaseVariableImportance - vi.test_scores_ = np.array([vi.test_scores_[0, :]]) - true_value = vi.importances_ > -1 # all selected + vi.pvalues_ = np.random.rand(vi.importances_.shape[0]) * 30 + if hasattr(vi, "list_pvalues_"): + vi.list_pvalues_ = [ + np.random.rand(vi.importances_.shape[0]) * 30 for i in range(10) + ] + true_value = np.zeros_like(vi.importances_, dtype=bool) # selected any selection = vi.selection_fdr(0.2) np.testing.assert_array_equal(true_value, selection) def test_selection_fdr_adaptation(self, set_BaseVariableImportance): "test selection of the adaptation" vi = set_BaseVariableImportance - true_value = vi.importances_ >= 85 selection = vi.selection_fdr(0.2, adaptive_aggregation=True) - np.testing.assert_array_equal(true_value, selection) + assert np.all( + [ + i >= (vi.importances_ - np.sum(selection)) + for i in vi.importances_[selection] + ] + ) def test_selection_fdr_bhy(self, set_BaseVariableImportance): "test selection with bhy" vi = set_BaseVariableImportance - true_value = vi.importances_ >= 85 selection = vi.selection_fdr(0.8, fdr_control="bhy") - np.testing.assert_array_equal(true_value, selection) - - def test_selection_fdr_ebh(self, set_BaseVariableImportance): - "test selection with e-values" - vi = set_BaseVariableImportance - true_value = vi.importances_ >= 2 - selection = vi.selection_fdr(0.037, fdr_control="ebh", evalues=True) - np.testing.assert_array_equal(true_value, selection) + assert np.all( + [ + i >= (vi.importances_ - np.sum(selection)) + for i in vi.importances_[selection] + ] + ) @pytest.mark.parametrize( - "pvalues, test_score, seed", + "pvalues, list_pvalues, seed", [(False, False, 0), (True, False, 0), (True, True, 0)], - ids=["only importance", "p-value", "test-score"], + ids=["only importance", "p-value", "list_pvalues"], ) class TestBVIExceptions: """Test class for BVI Exception""" - def test_not_fit(self, pvalues, test_score, seed): + def test_not_fit(self, pvalues, list_pvalues, seed): "test detection unfit" vi = BaseVariableImportance() with pytest.raises( @@ -234,7 +250,7 @@ def test_selection_threshold(self, set_BaseVariableImportance): def test_selection_fdr_fdr_control(self, set_BaseVariableImportance): "test selection fdr_control wrong" vi = set_BaseVariableImportance - if vi.test_scores_ is None: + if vi.pvalues_ is None: with pytest.raises( AssertionError, match="this method doesn't support selection base on FDR", @@ -242,10 +258,7 @@ def test_selection_fdr_fdr_control(self, set_BaseVariableImportance): vi.selection_fdr(fdr=0.1) else: with pytest.raises( - AssertionError, match="for e-value, the fdr control need to be 'ebh'" - ): - vi.selection_fdr(fdr=0.1, evalues=True) - with pytest.raises( - AssertionError, match="for p-value, the fdr control can't be 'ebh'" + AssertionError, + match="only 'bhq' and 'bhy' are supported", ): - vi.selection_fdr(fdr=0.1, fdr_control="ebh", evalues=False) + vi.selection_fdr(fdr=0.1, fdr_control="ehb") From 817af116d677c19ffc6a815fa21787a622f5b1ff Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 11:22:49 +0200 Subject: [PATCH 14/54] fix some part of the selection --- test/test_base_variable_importance.py | 5 ++++- 1 file changed, 4 
insertions(+), 1 deletion(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 16449f116..3251dbdac 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -5,6 +5,9 @@ def generate_list_pvalues_for_fdr(importances, factor=30): + """Generate values for applying FDR. + TODO: Improve data generation. + """ nb_features = importances.shape[0] result_list = [] for i in range(10): @@ -40,7 +43,7 @@ def set_BaseVariableImportance(pvalues, list_pvalues, seed): rng = np.random.RandomState(seed) vi = BaseVariableImportance() vi.importances_ = np.arange(nb_features) - # rng.shuffle(vi.importances_) + rng.shuffle(vi.importances_) list_pvalues_generated = generate_list_pvalues_for_fdr(vi.importances_) if pvalues or list_pvalues: vi.pvalues_ = np.mean(list_pvalues_generated, axis=0) From 7e256c2f3ec7a8604afdfafb860e5d9263edcf51 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 16:55:23 +0200 Subject: [PATCH 15/54] fix test --- test/test_base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 3251dbdac..fad684a8e 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -1,7 +1,7 @@ import pytest import numpy as np -from hidimstat import BaseVariableImportance +from hidimstat.base_variable_importance import BaseVariableImportance def generate_list_pvalues_for_fdr(importances, factor=30): From 5cc731c8070ae7c02976ef1db97361b7c91bc04a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 17:09:29 +0200 Subject: [PATCH 16/54] try to fix test --- test/test_base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index fad684a8e..497b3862f 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -175,7 +175,7 @@ def test_selection_fdr_adaptation(self, set_BaseVariableImportance): def test_selection_fdr_bhy(self, set_BaseVariableImportance): "test selection with bhy" vi = set_BaseVariableImportance - selection = vi.selection_fdr(0.8, fdr_control="bhy") + selection = vi.selection_fdr(0.2, fdr_control="bhy") assert np.all( [ i >= (vi.importances_ - np.sum(selection)) From 90e142520725a6055f96ef16d83cd77dfcf2bd22 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 17:16:58 +0200 Subject: [PATCH 17/54] fix seed in generation of data --- test/test_base_variable_importance.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 497b3862f..560aa28ac 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -4,17 +4,17 @@ from hidimstat.base_variable_importance import BaseVariableImportance -def generate_list_pvalues_for_fdr(importances, factor=30): +def generate_list_pvalues_for_fdr(rng, importances, factor=30): """Generate values for applying FDR. TODO: Improve data generation. 
""" nb_features = importances.shape[0] result_list = [] for i in range(10): - score = np.random.rand(nb_features) * factor + score = rng.rand(nb_features) * factor result_list.append(score) for i in range(1, factor): - score = np.random.rand(nb_features) + 1 + score = rng.rand(nb_features) + 1 score[-i:] = np.arange(factor - i, factor) * 2 score[:i] = -np.arange(factor - i, factor) result_list.append(np.flip(score)[importances]) @@ -44,7 +44,7 @@ def set_BaseVariableImportance(pvalues, list_pvalues, seed): vi = BaseVariableImportance() vi.importances_ = np.arange(nb_features) rng.shuffle(vi.importances_) - list_pvalues_generated = generate_list_pvalues_for_fdr(vi.importances_) + list_pvalues_generated = generate_list_pvalues_for_fdr(rng, vi.importances_) if pvalues or list_pvalues: vi.pvalues_ = np.mean(list_pvalues_generated, axis=0) if list_pvalues: From 21d061480442447c626e9acce87de44a8392f29a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 10 Sep 2025 17:37:28 +0200 Subject: [PATCH 18/54] fix docstring --- src/hidimstat/base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 81667454c..bbe672954 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -181,7 +181,7 @@ def selection_fdr( Raises ------ AssertionError - If list_pvalues_ attribute is missing or fdr_control is invalid + If list_pvalues\_ attribute is missing or fdr_control is invalid """ self._check_importance() assert ( From 5e19e1ba7c3cba772a7cc49e5da8fd83e11c154a Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 11 Sep 2025 11:17:30 +0200 Subject: [PATCH 19/54] Fix attribute in base_variable_importance --- src/hidimstat/base_variable_importance.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index bbe672954..c174982a2 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -20,8 +20,6 @@ class BaseVariableImportance(BaseEstimator): The computed importance scores for each feature. pvalues_ : array-like of shape (n_features,), default=None The computed p-values for each feature. - selections_ : array-like of shape (n_features,), default=None - Binary mask indicating selected features. 
Methods ------- @@ -36,11 +34,6 @@ def __init__(self): super().__init__() self.importances_ = None self.pvalues_ = None - self.selections_ = None - self.test_scores_ = None - self.threshold_fdr_ = None - self.aggregated_pval_ = None - self.aggregated_eval_ = None def _check_importance(self): """ @@ -137,11 +130,11 @@ def selection( else: mask_threshold_pvalue = np.ones(self.importances_.shape, dtype=bool) - self.selections_ = ( + selections = ( mask_k_best & mask_percentile & mask_threshold & mask_threshold_pvalue ) - return self.selections_ + return selections def selection_fdr( self, @@ -173,7 +166,7 @@ def selection_fdr( gamma: float, default=0.5 The gamma parameter for quantile aggregation of p-values (between 0 and 1) - Returns + Returnsgit ------- numpy.ndarray Boolean array indicating selected features (True for selected, False for not selected) From c0af81ae434818edb856153945f3260c213cb105 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 11 Sep 2025 16:09:01 +0200 Subject: [PATCH 20/54] change name --- src/hidimstat/knockoffs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/knockoffs.py b/src/hidimstat/knockoffs.py index 7dab07aeb..244baed96 100644 --- a/src/hidimstat/knockoffs.py +++ b/src/hidimstat/knockoffs.py @@ -462,13 +462,13 @@ def _stat_coefficient_diff(X, X_tilde, y, estimator, fdr, preconfigure_estimator test_score = np.abs(coef[:n_features]) - np.abs(coef[n_features:]) # Compute the threshold level and selecte the important variables - ko_thr = _knockoff_threshold(test_score, fdr=fdr) + ko_thr = _fdr_threshold_on_symmetric_null(test_score, fdr=fdr) selected = np.where(test_score >= ko_thr)[0] return test_score, ko_thr, selected -def _knockoff_threshold(test_score, fdr=0.1): +def _fdr_threshold_on_symmetric_null(test_score, fdr=0.1): """ Calculate the knockoff threshold based on the procedure stated in the article. 
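A minimal standalone sketch (not part of the patch series; the function name and toy data below are illustrative) of the knockoff+ rule that this renamed helper implements: pick the smallest threshold t such that (offset + #{W_j <= -t}) / max(#{W_j >= t}, 1) <= fdr, per Equation 1.8 of Barber & Candès (2015) and 3.10 of Candès et al. (2018) cited in the code above.

import numpy as np

def fdr_threshold_on_symmetric_null(test_score, fdr=0.1, offset=1):
    # Candidate thresholds are the sorted non-zero |W_j|; offset=1 is the knockoff+ variant.
    for t in np.sort(np.abs(test_score[test_score != 0])):
        false_pos = np.sum(test_score <= -t)  # sign-flipped statistics estimate false positives
        selected = np.sum(test_score >= t)
        if (offset + false_pos) / max(selected, 1) <= fdr:
            return t
    return np.inf  # no threshold achieves the target FDR

rng = np.random.RandomState(0)
w = np.concatenate([rng.randn(50), rng.randn(10) + 5.0])  # 50 symmetric nulls, 10 signals
t = fdr_threshold_on_symmetric_null(w, fdr=0.1)
print(t, np.where(w >= t)[0])  # threshold and the selected feature indices

Under the symmetric-null assumption, the count of statistics below -t stands in for the unknown number of nulls above t, which is what makes the ratio an estimate of the FDR.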
From 08ddbaa087f233ca761d3962f33b1a76ec33b4b Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Fri, 12 Sep 2025 10:55:55 +0200
Subject: [PATCH 21/54] fix docstring

---
 src/hidimstat/base_variable_importance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index c174982a2..ef96c308a 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -166,7 +166,7 @@
         gamma: float, default=0.5
             The gamma parameter for quantile aggregation of p-values (between 0 and 1)
 
-        Returnsgit
+        Returns
         -------
         numpy.ndarray
             Boolean array indicating selected features (True for selected, False for not selected)

From 7d5838068ebd3182fe804d27d85fbccdb18afe23 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Wed, 17 Sep 2025 15:26:12 +0200
Subject: [PATCH 22/54] fix linter

---
 src/hidimstat/base_variable_importance.py | 2 +-
 test/test_base_variable_importance.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index 37a349653..a4dcc6d59 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -3,8 +3,8 @@
 import numpy as np
 from sklearn.base import BaseEstimator
 
-from hidimstat.statistical_tools.multiple_testing import fdr_threshold
 from hidimstat.statistical_tools.aggregation import quantile_aggregation
+from hidimstat.statistical_tools.multiple_testing import fdr_threshold
 
 
diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py
index 560aa28ac..7d1244b59 100644
--- a/test/test_base_variable_importance.py
+++ b/test/test_base_variable_importance.py
@@ -1,5 +1,5 @@
-import pytest
 import numpy as np
+import pytest
 
 from hidimstat.base_variable_importance import BaseVariableImportance
 

From bc6d5c528ddc23c723b6d7b57f0440703d01df01 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Wed, 17 Sep 2025 16:54:40 +0200
Subject: [PATCH 23/54] Mixin for selection_fdr

---
 src/hidimstat/base_variable_importance.py     | 27 ++++++++++++++++---
 .../conditional_feature_importance.py         |  3 ++-
 ...istilled_conditional_randomization_test.py |  4 +--
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index a4dcc6d59..83bb2eb49 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -136,6 +136,30 @@ def selection(
 
         return selections
 
+
+class MixinSelectionFDR:
+    """
+    A mixin class that provides False Discovery Rate (FDR) based feature selection functionality.
+    This class implements methods for selecting features while controlling the False Discovery
+    Rate using either Benjamini-Hochberg (BH) or Benjamini-Hochberg-Yekutieli (BHY) procedures.
+    It can work with single p-values or lists of p-values, and supports adaptive aggregation
+    for multiple hypothesis testing scenarios.
+    Attributes
+    pvalues_ : numpy.ndarray
+        Array of p-values for each feature
+    list_pvalues_ : list of numpy.ndarray, optional
+        List of p-value arrays when multiple tests are performed per feature
+    Notes
+    -----
+    This mixin should be used with classes that compute feature importance measures
+    that can be converted to p-values. 
The class requires either pvalues_ or + list_pvalues_ attribute to be set before calling selection methods. + See Also + -------- + selection_fdr : Main method for performing FDR-based feature selection + + """ + def selection_fdr( self, fdr, @@ -180,9 +204,6 @@ def selection_fdr( assert ( fdr_control == "bhq" or fdr_control == "bhy" ), "only 'bhq' and 'bhy' are supported" - assert ( - self.pvalues_ is not None - ), "this method doesn't support selection base on FDR" if hasattr(self, "list_pvalues_"): aggregated_pval = quantile_aggregation( diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 7ce11714d..883884220 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -4,11 +4,12 @@ from sklearn.metrics import root_mean_squared_error from sklearn.utils.validation import check_random_state +from hidimstat.base_variable_importance import MixinSelectionFDR from hidimstat.base_perturbation import BasePerturbation from hidimstat.conditional_sampling import ConditionalSampler -class CFI(BasePerturbation): +class CFI(BasePerturbation, MixinSelectionFDR): def __init__( self, estimator, diff --git a/src/hidimstat/distilled_conditional_randomization_test.py b/src/hidimstat/distilled_conditional_randomization_test.py index 31ceb2e2f..eafc0b97b 100644 --- a/src/hidimstat/distilled_conditional_randomization_test.py +++ b/src/hidimstat/distilled_conditional_randomization_test.py @@ -9,10 +9,10 @@ from hidimstat._utils.docstring import _aggregate_docstring from hidimstat._utils.utils import _check_vim_predict_method -from hidimstat.base_variable_importance import BaseVariableImportance +from hidimstat.base_variable_importance import BaseVariableImportance, MixinSelectionFDR -class D0CRT(BaseVariableImportance): +class D0CRT(BaseVariableImportance, MixinSelectionFDR): """ Implements distilled conditional randomization test (dCRT) without interactions. From 155c47a62640670e756de078e495d75b345fbf54 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 17 Sep 2025 17:42:00 +0200 Subject: [PATCH 24/54] fix tests --- test/test_base_variable_importance.py | 113 ++++++++++++++++++-------- 1 file changed, 79 insertions(+), 34 deletions(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 7d1244b59..2da616c55 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -1,7 +1,34 @@ import numpy as np import pytest -from hidimstat.base_variable_importance import BaseVariableImportance +from hidimstat.base_variable_importance import BaseVariableImportance, MixinSelectionFDR + + +@pytest.fixture +def set_BaseVariableImportance(seed): + """Create a BaseVariableImportance instance with test data for testing purposes. + + Parameters + ---------- + pvalues : bool + If True, generate random p-values for testing. + test_score : bool + If True, generate random test scores for testing. + seed : int + Random seed for reproducibility. + + Returns + ------- + BaseVariableImportance + A BaseVariableImportance instance with test data. 
+ """ + nb_features = 100 + rng = np.random.RandomState(seed) + vi = BaseVariableImportance() + vi.importances_ = np.arange(nb_features) + rng.shuffle(vi.importances_) + vi.pvalues_ = np.flip(np.sort(rng.random(nb_features)))[vi.importances_] + return vi def generate_list_pvalues_for_fdr(rng, importances, factor=30): @@ -22,8 +49,8 @@ def generate_list_pvalues_for_fdr(rng, importances, factor=30): @pytest.fixture -def set_BaseVariableImportance(pvalues, list_pvalues, seed): - """Create a BaseVariableImportance instance with test data for testing purposes. +def set_BaseselectionFDR(pvalues, list_pvalues, seed): + """Create a BaseVariableImportance with selectionFDR instance with test data for testing purposes. Parameters ---------- @@ -41,7 +68,11 @@ def set_BaseVariableImportance(pvalues, list_pvalues, seed): """ nb_features = 100 rng = np.random.RandomState(seed) - vi = BaseVariableImportance() + + class SelectionFDR(BaseVariableImportance, MixinSelectionFDR): + pass + + vi = SelectionFDR() vi.importances_ = np.arange(nb_features) rng.shuffle(vi.importances_) list_pvalues_generated = generate_list_pvalues_for_fdr(rng, vi.importances_) @@ -53,9 +84,9 @@ def set_BaseVariableImportance(pvalues, list_pvalues, seed): @pytest.mark.parametrize( - "pvalues, list_pvalues, seed", - [(False, False, 0), (True, False, 1), (True, True, 2)], - ids=["only importance", "p-value", "list_pvalues"], + "seed", + [0, 2], + ids=["default_seed", "another seed"], ) class TestSelection: """Test selection based on importance""" @@ -122,12 +153,11 @@ def test_selection_threshold(self, set_BaseVariableImportance): def test_selection_threshold_pvalue(self, set_BaseVariableImportance): "test threshold vbse on pvalues" vi = set_BaseVariableImportance - if vi.pvalues_ is not None: - true_value = vi.importances_ > 5 - selection = vi.selection( - threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]] - ) - np.testing.assert_array_equal(true_value, selection) + true_value = vi.importances_ > 5 + selection = vi.selection( + threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]] + ) + np.testing.assert_array_equal(true_value, selection) @pytest.mark.parametrize( @@ -138,9 +168,9 @@ def test_selection_threshold_pvalue(self, set_BaseVariableImportance): class TestSelectionFDR: """Test selection based on fdr""" - def test_selection_fdr_default(self, set_BaseVariableImportance): + def test_selection_fdr_default(self, set_BaseselectionFDR): "test selection of the default" - vi = set_BaseVariableImportance + vi = set_BaseselectionFDR selection = vi.selection_fdr(0.2) assert np.all( [ @@ -149,9 +179,9 @@ def test_selection_fdr_default(self, set_BaseVariableImportance): ] ) - def test_selection_fdr_default_1(self, set_BaseVariableImportance): + def test_selection_fdr_default_1(self, set_BaseselectionFDR): "test selection of the default" - vi = set_BaseVariableImportance + vi = set_BaseselectionFDR vi.pvalues_ = np.random.rand(vi.importances_.shape[0]) * 30 if hasattr(vi, "list_pvalues_"): vi.list_pvalues_ = [ @@ -161,9 +191,9 @@ def test_selection_fdr_default_1(self, set_BaseVariableImportance): selection = vi.selection_fdr(0.2) np.testing.assert_array_equal(true_value, selection) - def test_selection_fdr_adaptation(self, set_BaseVariableImportance): + def test_selection_fdr_adaptation(self, set_BaseselectionFDR): "test selection of the adaptation" - vi = set_BaseVariableImportance + vi = set_BaseselectionFDR selection = vi.selection_fdr(0.2, adaptive_aggregation=True) assert np.all( [ @@ -172,9 +202,9 @@ def 
test_selection_fdr_adaptation(self, set_BaseVariableImportance): ] ) - def test_selection_fdr_bhy(self, set_BaseVariableImportance): + def test_selection_fdr_bhy(self, set_BaseselectionFDR): "test selection with bhy" - vi = set_BaseVariableImportance + vi = set_BaseselectionFDR selection = vi.selection_fdr(0.2, fdr_control="bhy") assert np.all( [ @@ -185,14 +215,14 @@ def test_selection_fdr_bhy(self, set_BaseVariableImportance): @pytest.mark.parametrize( - "pvalues, list_pvalues, seed", - [(False, False, 0), (True, False, 0), (True, True, 0)], - ids=["only importance", "p-value", "list_pvalues"], + "seed", + [0], + ids=["default_seed"], ) class TestBVIExceptions: """Test class for BVI Exception""" - def test_not_fit(self, pvalues, list_pvalues, seed): + def test_not_fit(self, seed): "test detection unfit" vi = BaseVariableImportance() with pytest.raises( @@ -205,11 +235,6 @@ def test_not_fit(self, pvalues, list_pvalues, seed): match="The importances need to be called before calling this method", ): vi.selection() - with pytest.raises( - ValueError, - match="The importances need to be called before calling this method", - ): - vi.selection_fdr(0.1) def test_selection_k_best(self, set_BaseVariableImportance): "test selection k_best wrong" @@ -250,13 +275,33 @@ def test_selection_threshold(self, set_BaseVariableImportance): ): vi.selection(threshold_pvalue=1.1) - def test_selection_fdr_fdr_control(self, set_BaseVariableImportance): + +@pytest.mark.parametrize( + "pvalues, list_pvalues, seed", + [(False, False, 0), (True, False, 0), (True, True, 0)], + ids=["only importance", "p-value", "list_pvalues"], +) +class TestSelectionFDRExceptions: + def test_not_fit(self, pvalues, list_pvalues, seed): + "test detection unfit" + + class SelectionFDR(BaseVariableImportance, MixinSelectionFDR): + pass + + vi = SelectionFDR() + with pytest.raises( + ValueError, + match="The importances need to be called before calling this method", + ): + vi.selection_fdr(0.1) + + def test_selection_fdr_fdr_control(self, set_BaseselectionFDR): "test selection fdr_control wrong" - vi = set_BaseVariableImportance + vi = set_BaseselectionFDR if vi.pvalues_ is None: with pytest.raises( - AssertionError, - match="this method doesn't support selection base on FDR", + TypeError, + match="object of type 'NoneType' has no len()", ): vi.selection_fdr(fdr=0.1) else: From f90e6bc43b1135ef3529e6b1aade8a823ca1993e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 17 Sep 2025 17:45:43 +0200 Subject: [PATCH 25/54] fix format --- src/hidimstat/conditional_feature_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index 883884220..b9113f1cf 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -4,8 +4,8 @@ from sklearn.metrics import root_mean_squared_error from sklearn.utils.validation import check_random_state -from hidimstat.base_variable_importance import MixinSelectionFDR from hidimstat.base_perturbation import BasePerturbation +from hidimstat.base_variable_importance import MixinSelectionFDR from hidimstat.conditional_sampling import ConditionalSampler From 710bec48302c339261fdba0b80e6bab1485eb2aa Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 23 Sep 2025 18:58:43 +0200 Subject: [PATCH 26/54] put back the selection_fdr in base class --- src/hidimstat/base_variable_importance.py | 68 +++--------- .../conditional_feature_importance.py 
| 3 +- ...istilled_conditional_randomization_test.py | 4 +- test/test_base_variable_importance.py | 100 +++--------------- 4 files changed, 37 insertions(+), 138 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 83bb2eb49..c0cd5f32a 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -3,7 +3,6 @@ import numpy as np from sklearn.base import BaseEstimator -from hidimstat.statistical_tools.aggregation import quantile_aggregation from hidimstat.statistical_tools.multiple_testing import fdr_threshold @@ -136,86 +135,53 @@ def selection( return selections - -class MixinSelectionFDR: - """ - A mixin class that provides False Discovery Rate (FDR) based feature selection functionality. - This class implements methods for selecting features while controlling the False Discovery - Rate using either Benjamini-Hochberg (BH) or Benjamini-Hochberg-Yekutieli (BHY) procedures. - It can work with single p-values or lists of p-values, and supports adaptive aggregation - for multiple hypothesis testing scenarios. - Attributes - pvalues_ : numpy.ndarray - Array of p-values for each feature - list_pvalues_ : list of numpy.ndarray, optional - List of p-value arrays when multiple tests are performed per feature - Notes - ----- - This mixin should be used with classes that compute feature importance measures - that can be converted to p-values. The class requires either pvalues_ or - list_pvalues_ attribute to be set before calling selection methods. - See Also - -------- - selection_fdr : Main method for performing FDR-based feature selection - - """ - def selection_fdr( self, fdr, fdr_control="bhq", reshaping_function=None, - adaptive_aggregation=False, - gamma=0.5, ): """ Performs feature selection based on False Discovery Rate (FDR) control. - This method selects features by controlling the FDR using either p-values. - It supports different FDR control methods and optional adaptive aggregation - of the statistical values. - Parameters ---------- fdr : float The target false discovery rate level (between 0 and 1) - fdr_control: str, default="bhq" - The FDR control method to use. Options are: - - "bhq": Benjamini-Hochberg procedure + fdr_control: {'bhq', 'bhy'}, default='bhq' + The FDR control method to use: + - 'bhq': Benjamini-Hochberg procedure - 'bhy': Benjamini-Hochberg-Yekutieli procedure - reshaping_function: callable, default=None - Reshaping function for BHY method, default uses sum of reciprocals - adaptive_aggregation: bool, default=False - If True, uses adaptive weights for p-value aggregation - gamma: float, default=0.5 - The gamma parameter for quantile aggregation of p-values (between 0 and 1) + reshaping_function: callable, optional + Optional reshaping function for FDR control methods. + If None, defaults to sum of reciprocals for 'bhy'. Returns ------- - numpy.ndarray - Boolean array indicating selected features (True for selected, False for not selected) + selected : ndarray of bool + Boolean mask of selected features. + True indicates selected features, False indicates non-selected features. Raises ------ + ValueError + If importances_ haven't been computed yet AssertionError - If list_pvalues\_ attribute is missing or fdr_control is invalid + If pvalues_ are missing or fdr_control is invalid """ self._check_importance() + assert ( + self.pvalues_ is not None + ), "FDR-based selection requires p-values to be computed first. The current method does not support p-values." 
assert ( fdr_control == "bhq" or fdr_control == "bhy" ), "only 'bhq' and 'bhy' are supported" - if hasattr(self, "list_pvalues_"): - aggregated_pval = quantile_aggregation( - np.array(self.list_pvalues_), gamma=gamma, adaptive=adaptive_aggregation - ) - else: - aggregated_pval = self.pvalues_ threshold_pval = fdr_threshold( - aggregated_pval, + self.pvalues_, fdr=fdr, method=fdr_control, reshaping_function=reshaping_function, ) - selected = aggregated_pval <= threshold_pval + selected = self.pvalues_ <= threshold_pval return selected diff --git a/src/hidimstat/conditional_feature_importance.py b/src/hidimstat/conditional_feature_importance.py index b9113f1cf..7ce11714d 100644 --- a/src/hidimstat/conditional_feature_importance.py +++ b/src/hidimstat/conditional_feature_importance.py @@ -5,11 +5,10 @@ from sklearn.utils.validation import check_random_state from hidimstat.base_perturbation import BasePerturbation -from hidimstat.base_variable_importance import MixinSelectionFDR from hidimstat.conditional_sampling import ConditionalSampler -class CFI(BasePerturbation, MixinSelectionFDR): +class CFI(BasePerturbation): def __init__( self, estimator, diff --git a/src/hidimstat/distilled_conditional_randomization_test.py b/src/hidimstat/distilled_conditional_randomization_test.py index eafc0b97b..31ceb2e2f 100644 --- a/src/hidimstat/distilled_conditional_randomization_test.py +++ b/src/hidimstat/distilled_conditional_randomization_test.py @@ -9,10 +9,10 @@ from hidimstat._utils.docstring import _aggregate_docstring from hidimstat._utils.utils import _check_vim_predict_method -from hidimstat.base_variable_importance import BaseVariableImportance, MixinSelectionFDR +from hidimstat.base_variable_importance import BaseVariableImportance -class D0CRT(BaseVariableImportance, MixinSelectionFDR): +class D0CRT(BaseVariableImportance): """ Implements distilled conditional randomization test (dCRT) without interactions. diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 2da616c55..d3ee25d8d 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from hidimstat.base_variable_importance import BaseVariableImportance, MixinSelectionFDR +from hidimstat.base_variable_importance import BaseVariableImportance @pytest.fixture @@ -31,58 +31,6 @@ def set_BaseVariableImportance(seed): return vi -def generate_list_pvalues_for_fdr(rng, importances, factor=30): - """Generate values for applying FDR. - TODO: Improve data generation. - """ - nb_features = importances.shape[0] - result_list = [] - for i in range(10): - score = rng.rand(nb_features) * factor - result_list.append(score) - for i in range(1, factor): - score = rng.rand(nb_features) + 1 - score[-i:] = np.arange(factor - i, factor) * 2 - score[:i] = -np.arange(factor - i, factor) - result_list.append(np.flip(score)[importances]) - return np.array(result_list) / np.max(result_list) - - -@pytest.fixture -def set_BaseselectionFDR(pvalues, list_pvalues, seed): - """Create a BaseVariableImportance with selectionFDR instance with test data for testing purposes. - - Parameters - ---------- - pvalues : bool - If True, generate random p-values for testing. - test_score : bool - If True, generate random test scores for testing. - seed : int - Random seed for reproducibility. - - Returns - ------- - BaseVariableImportance - A BaseVariableImportance instance with test data. 
- """ - nb_features = 100 - rng = np.random.RandomState(seed) - - class SelectionFDR(BaseVariableImportance, MixinSelectionFDR): - pass - - vi = SelectionFDR() - vi.importances_ = np.arange(nb_features) - rng.shuffle(vi.importances_) - list_pvalues_generated = generate_list_pvalues_for_fdr(rng, vi.importances_) - if pvalues or list_pvalues: - vi.pvalues_ = np.mean(list_pvalues_generated, axis=0) - if list_pvalues: - vi.list_pvalues_ = list_pvalues_generated - return vi - - @pytest.mark.parametrize( "seed", [0, 2], @@ -161,16 +109,16 @@ def test_selection_threshold_pvalue(self, set_BaseVariableImportance): @pytest.mark.parametrize( - "pvalues, list_pvalues, seed", - [(True, False, 10), (True, True, 10)], - ids=["pvalue_only", "list_pvalue"], + "seed", + [0], + ids=["default_seed"], ) class TestSelectionFDR: """Test selection based on fdr""" - def test_selection_fdr_default(self, set_BaseselectionFDR): + def test_selection_fdr_default(self, set_BaseVariableImportance): "test selection of the default" - vi = set_BaseselectionFDR + vi = set_BaseVariableImportance selection = vi.selection_fdr(0.2) assert np.all( [ @@ -179,9 +127,9 @@ def test_selection_fdr_default(self, set_BaseselectionFDR): ] ) - def test_selection_fdr_default_1(self, set_BaseselectionFDR): + def test_selection_fdr_default_1(self, set_BaseVariableImportance): "test selection of the default" - vi = set_BaseselectionFDR + vi = set_BaseVariableImportance vi.pvalues_ = np.random.rand(vi.importances_.shape[0]) * 30 if hasattr(vi, "list_pvalues_"): vi.list_pvalues_ = [ @@ -191,20 +139,9 @@ def test_selection_fdr_default_1(self, set_BaseselectionFDR): selection = vi.selection_fdr(0.2) np.testing.assert_array_equal(true_value, selection) - def test_selection_fdr_adaptation(self, set_BaseselectionFDR): - "test selection of the adaptation" - vi = set_BaseselectionFDR - selection = vi.selection_fdr(0.2, adaptive_aggregation=True) - assert np.all( - [ - i >= (vi.importances_ - np.sum(selection)) - for i in vi.importances_[selection] - ] - ) - - def test_selection_fdr_bhy(self, set_BaseselectionFDR): + def test_selection_fdr_bhy(self, set_BaseVariableImportance): "test selection with bhy" - vi = set_BaseselectionFDR + vi = set_BaseVariableImportance selection = vi.selection_fdr(0.2, fdr_control="bhy") assert np.all( [ @@ -277,27 +214,24 @@ def test_selection_threshold(self, set_BaseVariableImportance): @pytest.mark.parametrize( - "pvalues, list_pvalues, seed", - [(False, False, 0), (True, False, 0), (True, True, 0)], - ids=["only importance", "p-value", "list_pvalues"], + "seed", + [0], + ids=["default_seed"], ) class TestSelectionFDRExceptions: - def test_not_fit(self, pvalues, list_pvalues, seed): + def test_not_fit(self, seed): "test detection unfit" - class SelectionFDR(BaseVariableImportance, MixinSelectionFDR): - pass - - vi = SelectionFDR() + vi = BaseVariableImportance() with pytest.raises( ValueError, match="The importances need to be called before calling this method", ): vi.selection_fdr(0.1) - def test_selection_fdr_fdr_control(self, set_BaseselectionFDR): + def test_selection_fdr_fdr_control(self, set_BaseVariableImportance): "test selection fdr_control wrong" - vi = set_BaseselectionFDR + vi = set_BaseVariableImportance if vi.pvalues_ is None: with pytest.raises( TypeError, From e66af143efe2e012dea651c91950ad28afc31736 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Tue, 23 Sep 2025 19:04:33 +0200 Subject: [PATCH 27/54] fix error of docstring --- src/hidimstat/base_variable_importance.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index c0cd5f32a..01781a077 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -165,9 +165,9 @@ def selection_fdr( Raises ------ ValueError - If importances_ haven't been computed yet + If `importances_` haven't been computed yet AssertionError - If pvalues_ are missing or fdr_control is invalid + If `pvalues_` are missing or fdr_control is invalid """ self._check_importance() assert ( From f680738aaff33523f1eaf2a1109fd2394e9fcf71 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 24 Sep 2025 11:51:52 +0200 Subject: [PATCH 28/54] Apply suggestion from @bthirion Co-authored-by: bthirion --- src/hidimstat/base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 01781a077..ba6c5e21a 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -152,7 +152,7 @@ def selection_fdr( The FDR control method to use: - 'bhq': Benjamini-Hochberg procedure - 'bhy': Benjamini-Hochberg-Yekutieli procedure - reshaping_function: callable, optional + reshaping_function: callable or None, default=None Optional reshaping function for FDR control methods. If None, defaults to sum of reciprocals for 'bhy'. From f079d242cec1208e04446b5a8aec1b6a611f7269 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 24 Sep 2025 11:52:06 +0200 Subject: [PATCH 29/54] Apply suggestion from @bthirion Co-authored-by: bthirion --- test/test_base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index d3ee25d8d..18b6e1e0a 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -5,7 +5,7 @@ @pytest.fixture -def set_BaseVariableImportance(seed): +def _set_variable_importance(seed): """Create a BaseVariableImportance instance with test data for testing purposes. Parameters From beed44fc703f2ec57b2b24095eb5fe57db37a7e0 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 24 Sep 2025 11:52:19 +0200 Subject: [PATCH 30/54] Apply suggestion from @bthirion Co-authored-by: bthirion --- test/test_base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 18b6e1e0a..86232e5bc 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -22,7 +22,7 @@ def _set_variable_importance(seed): BaseVariableImportance A BaseVariableImportance instance with test data. 
""" - nb_features = 100 + n_features = 100 rng = np.random.RandomState(seed) vi = BaseVariableImportance() vi.importances_ = np.arange(nb_features) From ab262ad4385704c5e9d5630fa1f3768db12d0aae Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 24 Sep 2025 11:52:30 +0200 Subject: [PATCH 31/54] Apply suggestion from @bthirion Co-authored-by: bthirion --- test/test_base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 86232e5bc..ec032c5b9 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -25,7 +25,7 @@ def _set_variable_importance(seed): n_features = 100 rng = np.random.RandomState(seed) vi = BaseVariableImportance() - vi.importances_ = np.arange(nb_features) + vi.importances_ = np.arange(n_features) rng.shuffle(vi.importances_) vi.pvalues_ = np.flip(np.sort(rng.random(nb_features)))[vi.importances_] return vi From ba43d4a9612de68c268698d9512ca6a2e034f573 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Wed, 24 Sep 2025 11:52:38 +0200 Subject: [PATCH 32/54] Apply suggestion from @bthirion Co-authored-by: bthirion --- test/test_base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index ec032c5b9..bfd962529 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -27,7 +27,7 @@ def _set_variable_importance(seed): vi = BaseVariableImportance() vi.importances_ = np.arange(n_features) rng.shuffle(vi.importances_) - vi.pvalues_ = np.flip(np.sort(rng.random(nb_features)))[vi.importances_] + vi.pvalues_ = np.flip(np.sort(rng.random(n_features)))[vi.importances_] return vi From e9a4432c76517b23d7cb6a8e4a59c85f1a60f34f Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Wed, 24 Sep 2025 12:00:10 +0200 Subject: [PATCH 33/54] chaneg name of fixture --- test/test_base_variable_importance.py | 64 +++++++++++++-------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index bfd962529..473c8e7d2 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -39,51 +39,51 @@ def _set_variable_importance(seed): class TestSelection: """Test selection based on importance""" - def test_selection_k_best(self, set_BaseVariableImportance): + def test_selection_k_best(self, _set_variable_importance): "test selection of the k_best" - vi = set_BaseVariableImportance + vi = _set_variable_importance true_value = vi.importances_ >= 95 selection = vi.selection(k_best=5) np.testing.assert_array_equal(true_value, selection) - def test_selection_k_best_all(self, set_BaseVariableImportance): + def test_selection_k_best_all(self, _set_variable_importance): "test selection to all base on string" - vi = set_BaseVariableImportance + vi = _set_variable_importance true_value = np.ones_like(vi.importances_, dtype=bool) selection = vi.selection(k_best="all") np.testing.assert_array_equal(true_value, selection) - def test_selection_k_best_none(self, set_BaseVariableImportance): + def test_selection_k_best_none(self, _set_variable_importance): "test selection when there none" - vi = set_BaseVariableImportance + vi = _set_variable_importance true_value = np.zeros_like(vi.importances_, dtype=bool) selection = vi.selection(k_best=0) np.testing.assert_array_equal(true_value, 
selection) - def test_selection_percentile(self, set_BaseVariableImportance): + def test_selection_percentile(self, _set_variable_importance): "test selection bae on percentile" - vi = set_BaseVariableImportance + vi = _set_variable_importance true_value = vi.importances_ >= 50 selection = vi.selection(percentile=50) np.testing.assert_array_equal(true_value, selection) - def test_selection_percentile_all(self, set_BaseVariableImportance): + def test_selection_percentile_all(self, _set_variable_importance): "test selection when percentile is 100" - vi = set_BaseVariableImportance + vi = _set_variable_importance true_value = np.ones_like(vi.importances_, dtype=bool) selection = vi.selection(percentile=100) np.testing.assert_array_equal(true_value, selection) - def test_selection_percentile_none(self, set_BaseVariableImportance): + def test_selection_percentile_none(self, _set_variable_importance): "test selection when percentile is 0" - vi = set_BaseVariableImportance + vi = _set_variable_importance true_value = np.zeros_like(vi.importances_, dtype=bool) selection = vi.selection(percentile=0) np.testing.assert_array_equal(true_value, selection) - def test_selection_percentile_threshols_value(self, set_BaseVariableImportance): + def test_selection_percentile_threshols_value(self, _set_variable_importance): "test selection when percentile when the percentile equal on value" - vi = set_BaseVariableImportance + vi = _set_variable_importance mask = np.ones_like(vi.importances_, dtype=bool) mask[np.where(vi.importances_ == 99)] = False vi.importances_ = vi.importances_[mask] @@ -91,16 +91,16 @@ def test_selection_percentile_threshols_value(self, set_BaseVariableImportance): selection = vi.selection(percentile=50) np.testing.assert_array_equal(true_value, selection) - def test_selection_threshold(self, set_BaseVariableImportance): + def test_selection_threshold(self, _set_variable_importance): "test threshold on importance" - vi = set_BaseVariableImportance + vi = _set_variable_importance true_value = vi.importances_ < 5 selection = vi.selection(threshold=5) np.testing.assert_array_equal(true_value, selection) - def test_selection_threshold_pvalue(self, set_BaseVariableImportance): + def test_selection_threshold_pvalue(self, _set_variable_importance): "test threshold vbse on pvalues" - vi = set_BaseVariableImportance + vi = _set_variable_importance true_value = vi.importances_ > 5 selection = vi.selection( threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]] @@ -116,9 +116,9 @@ def test_selection_threshold_pvalue(self, set_BaseVariableImportance): class TestSelectionFDR: """Test selection based on fdr""" - def test_selection_fdr_default(self, set_BaseVariableImportance): + def test_selection_fdr_default(self, _set_variable_importance): "test selection of the default" - vi = set_BaseVariableImportance + vi = _set_variable_importance selection = vi.selection_fdr(0.2) assert np.all( [ @@ -127,9 +127,9 @@ def test_selection_fdr_default(self, set_BaseVariableImportance): ] ) - def test_selection_fdr_default_1(self, set_BaseVariableImportance): + def test_selection_fdr_default_1(self, _set_variable_importance): "test selection of the default" - vi = set_BaseVariableImportance + vi = _set_variable_importance vi.pvalues_ = np.random.rand(vi.importances_.shape[0]) * 30 if hasattr(vi, "list_pvalues_"): vi.list_pvalues_ = [ @@ -139,9 +139,9 @@ def test_selection_fdr_default_1(self, set_BaseVariableImportance): selection = vi.selection_fdr(0.2) np.testing.assert_array_equal(true_value, selection) - def 
test_selection_fdr_bhy(self, set_BaseVariableImportance): + def test_selection_fdr_bhy(self, _set_variable_importance): "test selection with bhy" - vi = set_BaseVariableImportance + vi = _set_variable_importance selection = vi.selection_fdr(0.2, fdr_control="bhy") assert np.all( [ @@ -173,17 +173,17 @@ def test_not_fit(self, seed): ): vi.selection() - def test_selection_k_best(self, set_BaseVariableImportance): + def test_selection_k_best(self, _set_variable_importance): "test selection k_best wrong" - vi = set_BaseVariableImportance + vi = _set_variable_importance with pytest.raises(AssertionError, match="k_best needs to be positive or null"): vi.selection(k_best=-10) with pytest.warns(Warning, match="k=1000 is greater than n_features="): vi.selection(k_best=1000) - def test_selection_percentile(self, set_BaseVariableImportance): + def test_selection_percentile(self, _set_variable_importance): "test selection percentile wrong" - vi = set_BaseVariableImportance + vi = _set_variable_importance with pytest.raises( AssertionError, match="percentile needs to be between 0 and 100" ): @@ -193,9 +193,9 @@ def test_selection_percentile(self, set_BaseVariableImportance): ): vi.selection(percentile=102) - def test_selection_threshold(self, set_BaseVariableImportance): + def test_selection_threshold(self, _set_variable_importance): "test selection threshold wrong" - vi = set_BaseVariableImportance + vi = _set_variable_importance if vi.pvalues_ is None: with pytest.raises( AssertionError, @@ -229,9 +229,9 @@ def test_not_fit(self, seed): ): vi.selection_fdr(0.1) - def test_selection_fdr_fdr_control(self, set_BaseVariableImportance): + def test_selection_fdr_fdr_control(self, _set_variable_importance): "test selection fdr_control wrong" - vi = set_BaseVariableImportance + vi = _set_variable_importance if vi.pvalues_ is None: with pytest.raises( TypeError, From 53ed88865db5b21f483eaab29a41886790d30eed Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 29 Sep 2025 11:34:51 +0200 Subject: [PATCH 34/54] remove all from k_best --- src/hidimstat/base_variable_importance.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index ba6c5e21a..ab22f3fd4 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -67,15 +67,12 @@ def selection( """ self._check_importance() if k_best is not None: - if not isinstance(k_best, str) and k_best > self.importances_.shape[0]: + assert k_best >= 0, "k_best needs to be positive or null" + if k_best > self.importances_.shape[0]: warnings.warn( f"k={k_best} is greater than n_features={self.importances_.shape[0]}. " "All the features will be returned." 
) - if isinstance(k_best, str): - assert k_best == "all" - else: - assert k_best >= 0, "k_best needs to be positive or null" if percentile is not None: assert ( 0 <= percentile and percentile <= 100 @@ -89,9 +86,7 @@ def selection( ), "threshold_pvalue needs to be between 0 and 1" # base on SelectKBest of Scikit-Learn - if k_best == "all": - mask_k_best = np.ones(self.importances_.shape, dtype=bool) - elif k_best == 0: + if k_best == 0: mask_k_best = np.zeros(self.importances_.shape, dtype=bool) elif k_best is not None: mask_k_best = np.zeros(self.importances_.shape, dtype=bool) From dddbb4a45f7deb8d114544cde3a10e54c7acda22 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 29 Sep 2025 11:43:59 +0200 Subject: [PATCH 35/54] rename the variable --- test/test_base_variable_importance.py | 66 +++++++++++++-------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 473c8e7d2..97de9a459 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -5,7 +5,7 @@ @pytest.fixture -def _set_variable_importance(seed): +def set_100_variable_sorted(seed): """Create a BaseVariableImportance instance with test data for testing purposes. Parameters @@ -39,51 +39,51 @@ def _set_variable_importance(seed): class TestSelection: """Test selection based on importance""" - def test_selection_k_best(self, _set_variable_importance): + def test_selection_k_best(self, set_100_variable_sorted): "test selection of the k_best" - vi = _set_variable_importance + vi = set_100_variable_sorted true_value = vi.importances_ >= 95 selection = vi.selection(k_best=5) np.testing.assert_array_equal(true_value, selection) - def test_selection_k_best_all(self, _set_variable_importance): + def test_selection_k_best_all(self, set_100_variable_sorted): "test selection to all base on string" - vi = _set_variable_importance + vi = set_100_variable_sorted true_value = np.ones_like(vi.importances_, dtype=bool) selection = vi.selection(k_best="all") np.testing.assert_array_equal(true_value, selection) - def test_selection_k_best_none(self, _set_variable_importance): + def test_selection_k_best_none(self, set_100_variable_sorted): "test selection when there none" - vi = _set_variable_importance + vi = set_100_variable_sorted true_value = np.zeros_like(vi.importances_, dtype=bool) selection = vi.selection(k_best=0) np.testing.assert_array_equal(true_value, selection) - def test_selection_percentile(self, _set_variable_importance): + def test_selection_percentile(self, set_100_variable_sorted): "test selection bae on percentile" - vi = _set_variable_importance + vi = set_100_variable_sorted true_value = vi.importances_ >= 50 selection = vi.selection(percentile=50) np.testing.assert_array_equal(true_value, selection) - def test_selection_percentile_all(self, _set_variable_importance): + def test_selection_percentile_all(self, set_100_variable_sorted): "test selection when percentile is 100" - vi = _set_variable_importance + vi = set_100_variable_sorted true_value = np.ones_like(vi.importances_, dtype=bool) selection = vi.selection(percentile=100) np.testing.assert_array_equal(true_value, selection) - def test_selection_percentile_none(self, _set_variable_importance): + def test_selection_percentile_none(self, set_100_variable_sorted): "test selection when percentile is 0" - vi = _set_variable_importance + vi = set_100_variable_sorted true_value = np.zeros_like(vi.importances_, dtype=bool) selection = 
vi.selection(percentile=0) np.testing.assert_array_equal(true_value, selection) - def test_selection_percentile_threshols_value(self, _set_variable_importance): + def test_selection_percentile_threshols_value(self, set_100_variable_sorted): "test selection when percentile when the percentile equal on value" - vi = _set_variable_importance + vi = set_100_variable_sorted mask = np.ones_like(vi.importances_, dtype=bool) mask[np.where(vi.importances_ == 99)] = False vi.importances_ = vi.importances_[mask] @@ -91,16 +91,16 @@ def test_selection_percentile_threshols_value(self, _set_variable_importance): selection = vi.selection(percentile=50) np.testing.assert_array_equal(true_value, selection) - def test_selection_threshold(self, _set_variable_importance): + def test_selection_threshold(self, set_100_variable_sorted): "test threshold on importance" - vi = _set_variable_importance + vi = set_100_variable_sorted true_value = vi.importances_ < 5 selection = vi.selection(threshold=5) np.testing.assert_array_equal(true_value, selection) - def test_selection_threshold_pvalue(self, _set_variable_importance): + def test_selection_threshold_pvalue(self, set_100_variable_sorted): "test threshold vbse on pvalues" - vi = _set_variable_importance + vi = set_100_variable_sorted true_value = vi.importances_ > 5 selection = vi.selection( threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]] @@ -116,9 +116,9 @@ def test_selection_threshold_pvalue(self, _set_variable_importance): class TestSelectionFDR: """Test selection based on fdr""" - def test_selection_fdr_default(self, _set_variable_importance): + def test_selection_fdr_default(self, set_100_variable_sorted): "test selection of the default" - vi = _set_variable_importance + vi = set_100_variable_sorted selection = vi.selection_fdr(0.2) assert np.all( [ @@ -127,9 +127,9 @@ def test_selection_fdr_default(self, _set_variable_importance): ] ) - def test_selection_fdr_default_1(self, _set_variable_importance): + def test_selection_fdr_default_1(self, set_100_variable_sorted): "test selection of the default" - vi = _set_variable_importance + vi = set_100_variable_sorted vi.pvalues_ = np.random.rand(vi.importances_.shape[0]) * 30 if hasattr(vi, "list_pvalues_"): vi.list_pvalues_ = [ @@ -139,9 +139,9 @@ def test_selection_fdr_default_1(self, _set_variable_importance): selection = vi.selection_fdr(0.2) np.testing.assert_array_equal(true_value, selection) - def test_selection_fdr_bhy(self, _set_variable_importance): + def test_selection_fdr_bhy(self, set_100_variable_sorted): "test selection with bhy" - vi = _set_variable_importance + vi = set_100_variable_sorted selection = vi.selection_fdr(0.2, fdr_control="bhy") assert np.all( [ @@ -173,17 +173,17 @@ def test_not_fit(self, seed): ): vi.selection() - def test_selection_k_best(self, _set_variable_importance): + def test_selection_k_best(self, set_100_variable_sorted): "test selection k_best wrong" - vi = _set_variable_importance + vi = set_100_variable_sorted with pytest.raises(AssertionError, match="k_best needs to be positive or null"): vi.selection(k_best=-10) with pytest.warns(Warning, match="k=1000 is greater than n_features="): vi.selection(k_best=1000) - def test_selection_percentile(self, _set_variable_importance): + def test_selection_percentile(self, set_100_variable_sorted): "test selection percentile wrong" - vi = _set_variable_importance + vi = set_100_variable_sorted with pytest.raises( AssertionError, match="percentile needs to be between 0 and 100" ): @@ -193,9 +193,9 @@ def 
test_selection_percentile(self, _set_variable_importance):
         ):
             vi.selection(percentile=102)
 
-    def test_selection_threshold(self, _set_variable_importance):
+    def test_selection_threshold(self, set_100_variable_sorted):
         "test selection threshold wrong"
-        vi = _set_variable_importance
+        vi = set_100_variable_sorted
         if vi.pvalues_ is None:
             with pytest.raises(
                 AssertionError,
@@ -229,9 +229,9 @@ def test_not_fit(self, seed):
         ):
             vi.selection_fdr(0.1)
 
-    def test_selection_fdr_fdr_control(self, _set_variable_importance):
+    def test_selection_fdr_fdr_control(self, set_100_variable_sorted):
         "test selection fdr_control wrong"
-        vi = _set_variable_importance
+        vi = set_100_variable_sorted
         if vi.pvalues_ is None:
             with pytest.raises(
                 TypeError,
                 match="object of type 'NoneType' has no len()",
             ):

From c06e1b9d7e65567b82f9fa6297fe81fcf1271cb0 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Mon, 29 Sep 2025 11:44:49 +0200
Subject: [PATCH 36/54] change bounds for percentile

---
 src/hidimstat/base_variable_importance.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index ab22f3fd4..8486a067c 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -67,7 +67,7 @@ def selection(
         """
         self._check_importance()
         if k_best is not None:
-            assert k_best >= 0, "k_best needs to be positive or null"
+            assert k_best >= 1, "k_best needs to be positive or None"
             if k_best > self.importances_.shape[0]:
                 warnings.warn(
                     f"k={k_best} is greater than n_features={self.importances_.shape[0]}. "
@@ -75,8 +75,10 @@ def selection(
                 )
         if percentile is not None:
             assert (
-                0 <= percentile and percentile <= 100
-            ), "percentile needs to be between 0 and 100"
+                0 < percentile < 100
+            ), "percentile must be between 0 and 100 (exclusive). Got {}.".format(
+                percentile
+            )
         if threshold_pvalue is not None:
             assert (
                 self.pvalues_ is not None
@@ -86,9 +88,7 @@ def selection(
             ), "threshold_pvalue needs to be between 0 and 1"
 
         # base on SelectKBest of Scikit-Learn
-        if k_best == 0:
-            mask_k_best = np.zeros(self.importances_.shape, dtype=bool)
-        elif k_best is not None:
+        if k_best is not None:
             mask_k_best = np.zeros(self.importances_.shape, dtype=bool)
 
             # Request a stable sort. 
Mergesort takes more memory (~40MB per @@ -98,11 +98,7 @@ def selection( mask_k_best = np.ones(self.importances_.shape, dtype=bool) # base on SelectPercentile of Scikit-Learn - if percentile == 100: - mask_percentile = np.ones(len(self.importances_), dtype=bool) - elif percentile == 0: - mask_percentile = np.zeros(len(self.importances_), dtype=bool) - elif percentile is not None: + if percentile is not None: threshold_percentile = np.percentile(self.importances_, 100 - percentile) mask_percentile = self.importances_ > threshold_percentile ties = np.where(self.importances_ == threshold_percentile)[0] From 66ec73e3447a01db140ed467cdfa57dd7014e3a6 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Mon, 29 Sep 2025 12:01:54 +0200 Subject: [PATCH 37/54] fix tests --- test/test_base_variable_importance.py | 35 ++++++++++++++++----------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 97de9a459..d9e4a1118 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -46,18 +46,11 @@ def test_selection_k_best(self, set_100_variable_sorted): selection = vi.selection(k_best=5) np.testing.assert_array_equal(true_value, selection) - def test_selection_k_best_all(self, set_100_variable_sorted): - "test selection to all base on string" - vi = set_100_variable_sorted - true_value = np.ones_like(vi.importances_, dtype=bool) - selection = vi.selection(k_best="all") - np.testing.assert_array_equal(true_value, selection) - def test_selection_k_best_none(self, set_100_variable_sorted): "test selection when there none" vi = set_100_variable_sorted - true_value = np.zeros_like(vi.importances_, dtype=bool) - selection = vi.selection(k_best=0) + true_value = np.ones_like(vi.importances_, dtype=bool) + selection = vi.selection(k_best=None) np.testing.assert_array_equal(true_value, selection) def test_selection_percentile(self, set_100_variable_sorted): @@ -71,14 +64,16 @@ def test_selection_percentile_all(self, set_100_variable_sorted): "test selection when percentile is 100" vi = set_100_variable_sorted true_value = np.ones_like(vi.importances_, dtype=bool) - selection = vi.selection(percentile=100) + true_value[np.argsort(vi.importances_)[0]] = False + selection = vi.selection(percentile=99.99) np.testing.assert_array_equal(true_value, selection) def test_selection_percentile_none(self, set_100_variable_sorted): "test selection when percentile is 0" vi = set_100_variable_sorted true_value = np.zeros_like(vi.importances_, dtype=bool) - selection = vi.selection(percentile=0) + true_value[np.argsort(vi.importances_)[-1:]] = True + selection = vi.selection(percentile=0.1) np.testing.assert_array_equal(true_value, selection) def test_selection_percentile_threshols_value(self, set_100_variable_sorted): @@ -176,7 +171,7 @@ def test_not_fit(self, seed): def test_selection_k_best(self, set_100_variable_sorted): "test selection k_best wrong" vi = set_100_variable_sorted - with pytest.raises(AssertionError, match="k_best needs to be positive or null"): + with pytest.raises(AssertionError, match="k_best needs to be positive"): vi.selection(k_best=-10) with pytest.warns(Warning, match="k=1000 is greater than n_features="): vi.selection(k_best=1000) @@ -185,13 +180,25 @@ def test_selection_percentile(self, set_100_variable_sorted): "test selection percentile wrong" vi = set_100_variable_sorted with pytest.raises( - AssertionError, match="percentile needs to be between 0 and 100" + 
AssertionError, + match="percentile must be between 0 and 100 \(exclusive\). Got -1.", ): vi.selection(percentile=-1) with pytest.raises( - AssertionError, match="percentile needs to be between 0 and 100" + AssertionError, + match="percentile must be between 0 and 100 \(exclusive\). Got 102.", ): vi.selection(percentile=102) + with pytest.raises( + AssertionError, + match="percentile must be between 0 and 100 \(exclusive\). Got 0.", + ): + vi.selection(percentile=0) + with pytest.raises( + AssertionError, + match="percentile must be between 0 and 100 \(exclusive\). Got 100", + ): + vi.selection(percentile=100) def test_selection_threshold(self, set_100_variable_sorted): "test selection threshold wrong" From 65f6fd0e7857fd1874cd46ff3d8f2918b0d0f495 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 2 Oct 2025 17:22:12 +0200 Subject: [PATCH 38/54] improve selection method --- src/hidimstat/base_variable_importance.py | 230 +++++++++++++++------- 1 file changed, 162 insertions(+), 68 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 8486a067c..9fbabcfb8 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -6,6 +6,80 @@ from hidimstat.statistical_tools.multiple_testing import fdr_threshold +def _selection_multy_criteria( + values, k_best=None, percentile=None, threshold_max=None, threshold_min=None +): + """ + Helper function for selecting features based on multiple criteria. + + Parameters + ---------- + values : array-like of shape (n_features,) + Values to use for feature selection (e.g., importance scores or p-values) + k_best : int, default=None + Selects the top k features based on values. + percentile : float, default=None + Selects features based on a specified percentile of values. + threshold_max : float, default=None + Selects features with values below the specified maximum threshold. + threshold_min : float, default=None + Selects features with values above the specified minimum threshold. + + Returns + ------- + selections : array-like of shape (n_features,) + Boolean array indicating the selected features. + """ + if k_best is not None: + assert k_best >= 1, "k_best needs to be positive or None" + if k_best > values.shape[0]: + warnings.warn( + f"k={k_best} is greater than n_features={values.shape[0]}. " + "All the features will be returned." + ) + if percentile is not None: + assert ( + 0 < percentile < 100 + ), "percentile must be between 0 and 100 (exclusive). Got {}.".format( + percentile + ) + + # base on SelectKBest of Scikit-Learn + if k_best is not None: + mask_k_best = np.zeros_like(values, dtype=bool) + + # Request a stable sort. Mergesort takes more memory (~40MB per + # megafeature on x86-64). 
+ mask_k_best[np.argsort(values, kind="mergesort")[-k_best:]] = 1 + else: + mask_k_best = np.ones_like(values, dtype=bool) + + # base on SelectPercentile of Scikit-Learn + if percentile is not None: + threshold_percentile = np.percentile(values, 100 - percentile) + mask_percentile = values > threshold_percentile + ties = np.where(values == threshold_percentile)[0] + if len(ties): + max_feats = int(len(values) * percentile / 100) + kept_ties = ties[: max_feats - mask_percentile.sum()] + mask_percentile[kept_ties] = True + else: + mask_percentile = np.ones_like(values, dtype=bool) + + if threshold_max is not None: + mask_threshold_max = values < threshold_max + else: + mask_threshold_max = np.ones_like(values, dtype=bool) + + if threshold_min is not None: + mask_threshold_min = values > threshold_min + else: + mask_threshold_min = np.ones_like(values, dtype=bool) + + selections = mask_k_best & mask_percentile & mask_threshold_max & mask_threshold_min + return selections + + class BaseVariableImportance(BaseEstimator): """ Base class for variable importance methods. @@ -43,8 +117,8 @@ def _check_importance(self): "The importances need to be called before calling this method" ) - def selection( - self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None + def importance_selection( + self, k_best=None, percentile=None, threshold_max=None, threshold_min=None ): """ Selects features based on variable importance. @@ -55,10 +129,10 @@ def selection( Selects the top k features based on importance scores. percentile : float, default=None Selects features based on a specified percentile of importance scores. - threshold : float, default=None - Selects features with importance scores above the specified threshold. - threshold_pvalue : float, default=None - Selects features with p-values below the specified threshold. + threshold_max : float, default=None + Selects features with importance scores below the specified maximum threshold. + threshold_min : float, default=None + Selects features with importance scores above the specified minimum threshold. Returns ------- @@ -66,71 +140,67 @@ def selection( Binary array indicating the selected features. """ self._check_importance() - if k_best is not None: - assert k_best >= 1, "k_best needs to be positive or None" - if k_best > self.importances_.shape[0]: - warnings.warn( - f"k={k_best} is greater than n_features={self.importances_.shape[0]}. " - "All the features will be returned." - ) - if percentile is not None: - assert ( - 0 < percentile < 100 - ), "percentile must be between 0 and 100 (exclusive). Got {}.".format( - percentile - ) - if threshold_pvalue is not None: - assert ( - self.pvalues_ is not None - ), "This method doesn't support a threshold on p-values" - assert ( - 0 < threshold_pvalue and threshold_pvalue < 1 - ), "threshold_pvalue needs to be between 0 and 1" - - # base on SelectKBest of Scikit-Learn - if k_best is not None: - mask_k_best = np.zeros(self.importances_.shape, dtype=bool) - - # Request a stable sort. Mergesort takes more memory (~40MB per - # megafeature on x86-64). 
- mask_k_best[np.argsort(self.importances_, kind="mergesort")[-k_best:]] = 1 - else: - mask_k_best = np.ones(self.importances_.shape, dtype=bool) - - # base on SelectPercentile of Scikit-Learn - if percentile is not None: - threshold_percentile = np.percentile(self.importances_, 100 - percentile) - mask_percentile = self.importances_ > threshold_percentile - ties = np.where(self.importances_ == threshold_percentile)[0] - if len(ties): - max_feats = int(len(self.importances_) * percentile / 100) - kept_ties = ties[: max_feats - mask_percentile.sum()] - mask_percentile[kept_ties] = True - else: - mask_percentile = np.ones(self.importances_.shape, dtype=bool) + return _selection_multy_criteria( + self.importances_, + k_best=k_best, + percentile=percentile, + threshold_max=threshold_max, + threshold_min=threshold_min, + ) - if threshold is not None: - mask_threshold = self.importances_ < threshold - else: - mask_threshold = np.ones(self.importances_.shape, dtype=bool) + def pvalue_selection( + self, + k_best=None, + percentile=None, + threshold_max=None, + threshold_min=None, + alternative_hypothesis=False, + ): + """ + Selects features based on p-values. - # base on SelectFpr of Scikit-Learn - if threshold_pvalue is not None: - mask_threshold_pvalue = self.pvalues_ < threshold_pvalue - else: - mask_threshold_pvalue = np.ones(self.importances_.shape, dtype=bool) + Parameters + ---------- + k_best : int, default=None + Selects the k features with lowest p-values. + percentile : float, default=None + Selects features based on a specified percentile of p-values. + threshold_max : float, default=None + Selects features with p-values below the specified maximum threshold. + threshold_min : float, default=None + Selects features with p-values above the specified minimum threshold. - selections = ( - mask_k_best & mask_percentile & mask_threshold & mask_threshold_pvalue + Returns + ------- + selection : array-like of shape (n_features,) + Binary array indicating the selected features. + """ + self._check_importance() + assert ( + self.pvalues_ is not None + ), "The selection on p-value can't be done because the current method does not compute p-values." + if threshold_min is not None: + assert ( + 0 < threshold_min and threshold_min < 1 + ), "threshold_min needs to be between 0 and 1" + if threshold_max is not None: + assert ( + 0 < threshold_max and threshold_max < 1 + ), "threshold_max needs to be between 0 and 1" + return _selection_multy_criteria( + self.pvalues_ if not alternative_hypothesis else 1 - self.pvalues_, + k_best=k_best, + percentile=percentile, + threshold_max=threshold_max, + threshold_min=threshold_min, ) - return selections - def selection_fdr( self, fdr, fdr_control="bhq", reshaping_function=None, + alternative_hippothesis=False, ): """ Performs feature selection based on False Discovery Rate (FDR) control. @@ -146,6 +216,10 @@ def selection_fdr( reshaping_function: callable or None, default=None Optional reshaping function for FDR control methods. If None, defaults to sum of reciprocals for 'bhy'. + alternative_hippothesis: bool or None, default=False + If False, selects features with small p-values. + If True, selects features with large p-values (close to 1). + If None, selects features that have either small or large p-values. 
 
         Returns
         -------
@@ -168,11 +242,31 @@ def selection_fdr(
             fdr_control == "bhq" or fdr_control == "bhy"
         ), "only 'bhq' and 'bhy' are supported"
 
-        threshold_pval = fdr_threshold(
-            self.pvalues_,
-            fdr=fdr,
-            method=fdr_control,
-            reshaping_function=reshaping_function,
-        )
-        selected = self.pvalues_ <= threshold_pval
+        # selection on pvalue
+        if alternative_hippothesis is None or not alternative_hippothesis:
+            threshold_pvalues = fdr_threshold(
+                self.pvalues_,
+                fdr=fdr,
+                method=fdr_control,
+                reshaping_function=reshaping_function,
+            )
+            selected_pvalues = self.pvalues_ <= threshold_pvalues
+        else:
+            selected_pvalues = np.ones_like(self.pvalues_, type=bool)
+
+        # selection on 1-pvalue
+        if alternative_hippothesis is None or alternative_hippothesis:
+            threshold_one_minus_pvalues = fdr_threshold(
+                1 - self.pvalues_,
+                fdr=fdr,
+                method=fdr_control,
+                reshaping_function=reshaping_function,
+            )
+            selected_one_minus_pvalues = (
+                1 - self.pvalues_
+            ) <= threshold_one_minus_pvalues
+        else:
+            selected_one_minus_pvalues = np.ones_like(self.pvalues_, type=bool)
+
+        selected = selected_pvalues & selected_one_minus_pvalues
         return selected

From 20a9b0e250aaabeb610604365594382f56ecd48 Mon Sep 17 00:00:00 2001
From: kusch lionel
Date: Thu, 2 Oct 2025 18:10:58 +0200
Subject: [PATCH 39/54] update tests for the signature change

---
 src/hidimstat/base_variable_importance.py     |  30 ++--
 ...istilled_conditional_randomization_test.py |  14 +-
 test/test_base_variable_importance.py         | 129 ++++++++++--------
 ...istilled_conditional_randomization_test.py |  33 +++--
 4 files changed, 116 insertions(+), 90 deletions(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index 9fbabcfb8..02159c518 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -43,6 +43,10 @@ def _selection_multy_criteria(
         ), "percentile must be between 0 and 100 (exclusive). Got {}.".format(
             percentile
         )
+    if threshold_max is not None and threshold_min is not None:
+        assert (
+            threshold_max > threshold_min
+        ), "threshold_max needs to be higher than threshold_min "
 
     # base on SelectKBest of Scikit-Learn
     if k_best is not None:
@@ -166,14 +170,16 @@ def pvalue_selection(
         percentile : float, default=None
             Selects features based on a specified percentile of p-values.
         threshold_max : float, default=None
-            Selects features with p-values below the specified maximum threshold.
+            Selects features with p-values below the specified maximum threshold (0 to 1).
         threshold_min : float, default=None
-            Selects features with p-values above the specified minimum threshold.
+            Selects features with p-values above the specified minimum threshold (0 to 1).
+        alternative_hypothesis : bool, default=False
+            If True, selects based on 1-pvalues instead of p-values.
 
         Returns
         -------
         selection : array-like of shape (n_features,)
-            Binary array indicating the selected features.
+            Binary array indicating the selected features (True for selected).
         """
         self._check_importance()
         assert (
@@ -187,6 +193,9 @@ def pvalue_selection(
         assert (
             0 < threshold_max and threshold_max < 1
         ), "threshold_max needs to be between 0 and 1"
+        assert alternative_hypothesis is None or isinstance(
+            alternative_hypothesis, bool
+        ), "alternative_hippothesis can have only three values: True, False and None." 
return _selection_multy_criteria( self.pvalues_ if not alternative_hypothesis else 1 - self.pvalues_, k_best=k_best, @@ -195,12 +204,12 @@ def pvalue_selection( threshold_min=threshold_min, ) - def selection_fdr( + def fdr_selection( self, fdr, fdr_control="bhq", reshaping_function=None, - alternative_hippothesis=False, + alternative_hypothesis=False, ): """ Performs feature selection based on False Discovery Rate (FDR) control. @@ -241,9 +250,12 @@ def selection_fdr( assert ( fdr_control == "bhq" or fdr_control == "bhy" ), "only 'bhq' and 'bhy' are supported" + assert alternative_hypothesis is None or isinstance( + alternative_hypothesis, bool + ), "alternative_hippothesis can have only three values: True, False and None." # selection on pvalue - if alternative_hippothesis is None or not alternative_hippothesis: + if alternative_hypothesis is None or not alternative_hypothesis: threshold_pvalues = fdr_threshold( self.pvalues_, fdr=fdr, @@ -252,10 +264,10 @@ def selection_fdr( ) selected_pvalues = self.pvalues_ <= threshold_pvalues else: - selected_pvalues = np.ones_like(self.pvalues_, type=bool) + selected_pvalues = np.ones_like(self.pvalues_, dtype=bool) # selection on 1-pvalue - if alternative_hippothesis is None or alternative_hippothesis: + if alternative_hypothesis is None or alternative_hypothesis: threshold_one_minus_pvalues = fdr_threshold( 1 - self.pvalues_, fdr=fdr, @@ -266,7 +278,7 @@ def selection_fdr( 1 - self.pvalues_ ) <= threshold_one_minus_pvalues else: - selected_one_minus_pvalues = np.ones_like(self.pvalues_, type=bool) + selected_one_minus_pvalues = np.ones_like(self.pvalues_, dtype=bool) selected = selected_pvalues & selected_one_minus_pvalues return selected diff --git a/src/hidimstat/distilled_conditional_randomization_test.py b/src/hidimstat/distilled_conditional_randomization_test.py index ec4bea7f8..b8e4b9fb2 100644 --- a/src/hidimstat/distilled_conditional_randomization_test.py +++ b/src/hidimstat/distilled_conditional_randomization_test.py @@ -665,8 +665,9 @@ def d0crt( reuse_screening_model=True, k_best=None, percentile=None, - threshold=None, - threshold_pvalue=None, + threshold_min=None, + threshold_max=None, + alternative_hypothesis=False, ): methods = D0CRT( estimator=estimator, @@ -686,11 +687,12 @@ def d0crt( random_state=random_state, ) methods.fit_importance(X, y, cv=cv) - selection = methods.selection( + selection = methods.pvalue_selection( k_best=k_best, percentile=percentile, - threshold=threshold, - threshold_pvalue=threshold_pvalue, + threshold_min=threshold_min, + threshold_max=threshold_max, + alternative_hypothesis=alternative_hypothesis, ) return selection, methods.importances_, methods.pvalues_ @@ -701,7 +703,7 @@ def d0crt( D0CRT.__doc__, D0CRT.__init__.__doc__, D0CRT.fit_importance.__doc__, - D0CRT.selection.__doc__, + D0CRT.pvalue_selection.__doc__, ], """ Returns diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index d9e4a1118..fe0641132 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -43,21 +43,21 @@ def test_selection_k_best(self, set_100_variable_sorted): "test selection of the k_best" vi = set_100_variable_sorted true_value = vi.importances_ >= 95 - selection = vi.selection(k_best=5) + selection = vi.importance_selection(k_best=5) np.testing.assert_array_equal(true_value, selection) def test_selection_k_best_none(self, set_100_variable_sorted): "test selection when there none" vi = set_100_variable_sorted true_value = 
np.ones_like(vi.importances_, dtype=bool) - selection = vi.selection(k_best=None) + selection = vi.importance_selection(k_best=None) np.testing.assert_array_equal(true_value, selection) def test_selection_percentile(self, set_100_variable_sorted): "test selection bae on percentile" vi = set_100_variable_sorted true_value = vi.importances_ >= 50 - selection = vi.selection(percentile=50) + selection = vi.importance_selection(percentile=50) np.testing.assert_array_equal(true_value, selection) def test_selection_percentile_all(self, set_100_variable_sorted): @@ -65,7 +65,7 @@ def test_selection_percentile_all(self, set_100_variable_sorted): vi = set_100_variable_sorted true_value = np.ones_like(vi.importances_, dtype=bool) true_value[np.argsort(vi.importances_)[0]] = False - selection = vi.selection(percentile=99.99) + selection = vi.importance_selection(percentile=99.99) np.testing.assert_array_equal(true_value, selection) def test_selection_percentile_none(self, set_100_variable_sorted): @@ -73,7 +73,7 @@ def test_selection_percentile_none(self, set_100_variable_sorted): vi = set_100_variable_sorted true_value = np.zeros_like(vi.importances_, dtype=bool) true_value[np.argsort(vi.importances_)[-1:]] = True - selection = vi.selection(percentile=0.1) + selection = vi.importance_selection(percentile=0.1) np.testing.assert_array_equal(true_value, selection) def test_selection_percentile_threshols_value(self, set_100_variable_sorted): @@ -83,23 +83,21 @@ def test_selection_percentile_threshols_value(self, set_100_variable_sorted): mask[np.where(vi.importances_ == 99)] = False vi.importances_ = vi.importances_[mask] true_value = vi.importances_ >= 50 - selection = vi.selection(percentile=50) + selection = vi.importance_selection(percentile=50) np.testing.assert_array_equal(true_value, selection) - def test_selection_threshold(self, set_100_variable_sorted): - "test threshold on importance" + def test_selection_threshold_min(self, set_100_variable_sorted): + "test threshold minimal on importance" vi = set_100_variable_sorted - true_value = vi.importances_ < 5 - selection = vi.selection(threshold=5) + true_value = vi.importances_ > 5 + selection = vi.importance_selection(threshold_min=5) np.testing.assert_array_equal(true_value, selection) - def test_selection_threshold_pvalue(self, set_100_variable_sorted): - "test threshold vbse on pvalues" + def test_selection_threshold_max(self, set_100_variable_sorted): + "test threshold maximal on importance" vi = set_100_variable_sorted - true_value = vi.importances_ > 5 - selection = vi.selection( - threshold_pvalue=vi.pvalues_[np.argsort(vi.importances_)[5]] - ) + true_value = vi.importances_ < 5 + selection = vi.importance_selection(threshold_max=5) np.testing.assert_array_equal(true_value, selection) @@ -114,7 +112,7 @@ class TestSelectionFDR: def test_selection_fdr_default(self, set_100_variable_sorted): "test selection of the default" vi = set_100_variable_sorted - selection = vi.selection_fdr(0.2) + selection = vi.fdr_selection(0.2) assert np.all( [ i >= (vi.importances_ - np.sum(selection)) @@ -126,18 +124,14 @@ def test_selection_fdr_default_1(self, set_100_variable_sorted): "test selection of the default" vi = set_100_variable_sorted vi.pvalues_ = np.random.rand(vi.importances_.shape[0]) * 30 - if hasattr(vi, "list_pvalues_"): - vi.list_pvalues_ = [ - np.random.rand(vi.importances_.shape[0]) * 30 for i in range(10) - ] true_value = np.zeros_like(vi.importances_, dtype=bool) # selected any - selection = vi.selection_fdr(0.2) + selection = 
vi.fdr_selection(0.2) np.testing.assert_array_equal(true_value, selection) def test_selection_fdr_bhy(self, set_100_variable_sorted): "test selection with bhy" vi = set_100_variable_sorted - selection = vi.selection_fdr(0.2, fdr_control="bhy") + selection = vi.fdr_selection(0.2, fdr_control="bhy") assert np.all( [ i >= (vi.importances_ - np.sum(selection)) @@ -166,15 +160,15 @@ def test_not_fit(self, seed): ValueError, match="The importances need to be called before calling this method", ): - vi.selection() + vi.importance_selection() def test_selection_k_best(self, set_100_variable_sorted): "test selection k_best wrong" vi = set_100_variable_sorted with pytest.raises(AssertionError, match="k_best needs to be positive"): - vi.selection(k_best=-10) + vi.importance_selection(k_best=-10) with pytest.warns(Warning, match="k=1000 is greater than n_features="): - vi.selection(k_best=1000) + vi.importance_selection(k_best=1000) def test_selection_percentile(self, set_100_variable_sorted): "test selection percentile wrong" @@ -183,41 +177,57 @@ def test_selection_percentile(self, set_100_variable_sorted): AssertionError, match="percentile must be between 0 and 100 \(exclusive\). Got -1.", ): - vi.selection(percentile=-1) + vi.importance_selection(percentile=-1) with pytest.raises( AssertionError, match="percentile must be between 0 and 100 \(exclusive\). Got 102.", ): - vi.selection(percentile=102) + vi.importance_selection(percentile=102) with pytest.raises( AssertionError, match="percentile must be between 0 and 100 \(exclusive\). Got 0.", ): - vi.selection(percentile=0) + vi.importance_selection(percentile=0) with pytest.raises( AssertionError, match="percentile must be between 0 and 100 \(exclusive\). Got 100", ): - vi.selection(percentile=100) + vi.importance_selection(percentile=100) + + def test_selection_pvalue_None(self, set_100_variable_sorted): + "test selection on pvalue without it" + vi = set_100_variable_sorted + vi.pvalues_ = None + with pytest.raises( + AssertionError, + match="The selection on p-value can't be done because the current method does not compute p-values.", + ): + vi.pvalue_selection(threshold_min=-1) def test_selection_threshold(self, set_100_variable_sorted): "test selection threshold wrong" vi = set_100_variable_sorted - if vi.pvalues_ is None: - with pytest.raises( - AssertionError, - match="This method doesn't support a threshold on p-values", - ): - vi.selection(threshold_pvalue=-1) - else: - with pytest.raises( - AssertionError, match="threshold_pvalue needs to be between 0 and 1" - ): - vi.selection(threshold_pvalue=-1) - with pytest.raises( - AssertionError, match="threshold_pvalue needs to be between 0 and 1" - ): - vi.selection(threshold_pvalue=1.1) + with pytest.raises( + AssertionError, match="threshold_min needs to be between 0 and 1" + ): + vi.pvalue_selection(threshold_min=-1) + with pytest.raises( + AssertionError, match="threshold_min needs to be between 0 and 1" + ): + vi.pvalue_selection(threshold_min=1.1) + with pytest.raises( + AssertionError, match="threshold_max needs to be between 0 and 1" + ): + vi.pvalue_selection(threshold_max=-1) + with pytest.raises( + AssertionError, match="threshold_max needs to be between 0 and 1" + ): + vi.pvalue_selection(threshold_max=1.1) + with pytest.raises( + AssertionError, + match="threshold_max needs to be higher than threshold_min", + ): + vi.pvalue_selection(threshold_max=0.5, threshold_min=0.9) @pytest.mark.parametrize( @@ -234,20 +244,23 @@ def test_not_fit(self, seed): ValueError, match="The importances 
need to be called before calling this method", ): - vi.selection_fdr(0.1) + vi.fdr_selection(0.1) + + def test_selection_fdr_pvalue_None(self, set_100_variable_sorted): + "test selection fdr without pvalue" + vi = set_100_variable_sorted + vi.pvalues_ = None + with pytest.raises( + AssertionError, + match="FDR-based selection requires p-values to be computed first. The current method does not support p-values.", + ): + vi.fdr_selection(fdr=0.1) def test_selection_fdr_fdr_control(self, set_100_variable_sorted): "test selection fdr_control wrong" vi = set_100_variable_sorted - if vi.pvalues_ is None: - with pytest.raises( - TypeError, - match="object of type 'NoneType' has no len()", - ): - vi.selection_fdr(fdr=0.1) - else: - with pytest.raises( - AssertionError, - match="only 'bhq' and 'bhy' are supported", - ): - vi.selection_fdr(fdr=0.1, fdr_control="ehb") + with pytest.raises( + AssertionError, + match="only 'bhq' and 'bhy' are supported", + ): + vi.fdr_selection(fdr=0.1, fdr_control="ehb") diff --git a/test/test_distilled_conditional_randomization_test.py b/test/test_distilled_conditional_randomization_test.py index 7c868faa5..e8c65192a 100644 --- a/test/test_distilled_conditional_randomization_test.py +++ b/test/test_distilled_conditional_randomization_test.py @@ -4,7 +4,6 @@ import numpy as np import pytest -from numpy.random import RandomState from sklearn.covariance import LedoitWolf from sklearn.datasets import make_classification, make_regression from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor @@ -33,13 +32,13 @@ def test_dcrt_lasso_screening(generate_regression_dataset): screening_threshold=None, ) pvalue_no_screening = d0crt_no_screening.fit_importance(X, y) - sv_no_screening = d0crt_no_screening.selection(threshold_pvalue=0.05) + sv_no_screening = d0crt_no_screening.pvalue_selection(threshold_max=0.05) d0crt_screening = D0CRT( estimator=LassoCV(n_jobs=1), screening_threshold=10, ) pvalue_screening = d0crt_screening.fit_importance(X, y) - sv_screening = d0crt_screening.selection(threshold_pvalue=0.05) + sv_screening = d0crt_screening.pvalue_selection(threshold_max=0.05) assert np.sum(d0crt_no_screening.importances_ != 0) <= 10 assert np.sum(d0crt_screening.importances_ != 0) <= 10 assert len(sv_no_screening) <= 10 @@ -57,7 +56,7 @@ def test_dcrt_lasso_screening(generate_regression_dataset): ) d0crt_no_screening.fit_importance(X, y) pvalue_no_screening = d0crt_no_screening.importance(X, y) - sv_no_screening = d0crt_no_screening.selection(threshold_pvalue=0.05) + sv_no_screening = d0crt_no_screening.pvalue_selection(threshold_max=0.05) assert len(sv_no_screening) <= 10 assert len(pvalue_no_screening) == 10 assert len(d0crt_no_screening.importances_) == 10 @@ -79,7 +78,7 @@ def test_dcrt_lasso_with_estimed_coefficient(generate_regression_dataset): ) d0crt.fit(X, y) pvalue = d0crt.importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt.importances_) == 10 @@ -97,7 +96,7 @@ def test_dcrt_lasso_with_refit(generate_regression_dataset): screening_threshold=None, ) pvalue = d0crt_refit.fit_importance(X, y) - sv = d0crt_refit.selection(threshold_pvalue=0.05) + sv = d0crt_refit.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt_refit.importances_) == 10 @@ -115,7 +114,7 @@ def test_dcrt_lasso_with_no_cv(generate_regression_dataset): screening_threshold=None, ) pvalue = 
d0crt_use_cv.fit_importance(X, y) - sv = d0crt_use_cv.selection(threshold_pvalue=0.05) + sv = d0crt_use_cv.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt_use_cv.importances_) == 10 @@ -135,7 +134,7 @@ def test_dcrt_lasso_with_covariance(generate_regression_dataset): screening_threshold=None, ) pvalue = d0crt_covariance.fit_importance(X, y) - sv = d0crt_covariance.selection(threshold_pvalue=0.05) + sv = d0crt_covariance.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt_covariance.importances_) == 10 @@ -153,7 +152,7 @@ def test_dcrt_lasso_center(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt.importances_) == 10 @@ -171,7 +170,7 @@ def test_dcrt_lasso_refit(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert len(sv) <= 10 assert len(pvalue) == 10 assert len(d0crt.importances_) == 10 @@ -199,7 +198,7 @@ def test_dcrt_distillation_x_different(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.where(d0crt.importances_ != 0)[0].shape[0] <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -217,7 +216,7 @@ def test_dcrt_distillation_y_different(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.where(d0crt.importances_ != 0)[0].shape[0] <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -237,7 +236,7 @@ def test_dcrt_lasso_fit_with_no_cv(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.sum(d0crt.importances_ != 0) <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -257,7 +256,7 @@ def test_dcrt_RF_regression(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.where(d0crt.importances_ != 0)[0].shape[0] <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -276,7 +275,7 @@ def test_dcrt_RF_classification(): scaled_statistics=True, ) pvalue = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.where(d0crt.importances_ != 0)[0].shape[0] <= 10 assert len(sv) <= 10 assert len(pvalue) == 10 @@ -460,7 +459,7 @@ def test_d0crt_linear(): screening_threshold=90, ) importances = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.mean(importances[important_ids]) > np.mean(importances[~important_ids]) assert np.array_equal(np.where(sv)[0], important_ids) @@ -486,7 +485,7 @@ def test_d0crt_rf(): random_state=0, ) importances = d0crt.fit_importance(X, y) - sv = d0crt.selection(threshold_pvalue=0.05) + sv = d0crt.pvalue_selection(threshold_max=0.05) assert np.mean(importances[important_ids]) > np.mean(importances[~important_ids]) assert np.array_equal(np.where(sv)[0], important_ids) From 6982b55d0dc2892dd3968cd784244f8f87faaf06 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 
2 Oct 2025 18:59:53 +0200
Subject: [PATCH 40/54] improve coverage

---
 src/hidimstat/base_variable_importance.py | 7 +--
 test/test_base_variable_importance.py | 55 +++++++++++++++++++++
 test/test_conditional_feature_importance.py | 2 -
 3 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index 02159c518..74b6daa86 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -244,6 +244,7 @@ def fdr_selection(
             If `pvalues_` are missing or fdr_control is invalid
         """
         self._check_importance()
+        assert 0 < fdr and fdr < 1, "FDR needs to be strictly between 0 and 1"
         assert (
             self.pvalues_ is not None
         ), "FDR-based selection requires p-values to be computed first. The current method does not support p-values."
@@ -264,7 +265,7 @@ def fdr_selection(
             )
             selected_pvalues = self.pvalues_ <= threshold_pvalues
         else:
-            selected_pvalues = np.ones_like(self.pvalues_, dtype=bool)
+            selected_pvalues = np.zeros_like(self.pvalues_, dtype=bool)

         # selection on 1-pvalue
         if alternative_hypothesis is None or alternative_hypothesis:
@@ -278,7 +279,7 @@ def fdr_selection(
                 1 - self.pvalues_
             ) <= threshold_one_minus_pvalues
         else:
-            selected_one_minus_pvalues = np.ones_like(self.pvalues_, dtype=bool)
+            selected_one_minus_pvalues = np.zeros_like(self.pvalues_, dtype=bool)

-        selected = selected_pvalues & selected_one_minus_pvalues
+        selected = selected_pvalues | selected_one_minus_pvalues
         return selected
diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py
index fe0641132..df23d12c1 100644
--- a/test/test_base_variable_importance.py
+++ b/test/test_base_variable_importance.py
@@ -139,6 +139,42 @@ def test_selection_fdr_bhy(self, set_100_variable_sorted):
             ]
         )

+    def test_selection_fdr_alternative_hypothesis(self, set_100_variable_sorted):
+        "test fdr selection with an invalid alternative_hypothesis"
+        vi = set_100_variable_sorted
+        with pytest.raises(
+            AssertionError,
+            match="alternative_hippothesis can have only three values: True, False and None.",
+        ):
+            vi.fdr_selection(fdr=0.1, alternative_hypothesis="alt")
+
+    def test_selection_fdr_pvalue(self, set_100_variable_sorted):
+        "test fdr selection on p-values only"
+        vi = set_100_variable_sorted
+        true_value = np.arange(100) <= 4
+        selection = vi.fdr_selection(fdr=0.9, alternative_hypothesis=False)
+        np.testing.assert_equal(
+            true_value, np.flip(selection[np.argsort(vi.importances_)])
+        )
+
+    def test_selection_fdr_one_minus_pvalue(self, set_100_variable_sorted):
+        "test fdr selection on 1-pvalues only"
+        vi = set_100_variable_sorted
+        true_value = np.arange(100) >= 34
+        selection = vi.fdr_selection(fdr=0.9, alternative_hypothesis=True)
+        np.testing.assert_equal(
+            true_value, np.flip(selection[np.argsort(vi.importances_)])
+        )
+
+    def test_selection_fdr_two_side(self, set_100_variable_sorted):
+        "test two-sided fdr selection"
+        vi = set_100_variable_sorted
+        true_value = np.logical_or(np.arange(100) <= 4, np.arange(100) >= 34)
+        selection = vi.fdr_selection(fdr=0.9, alternative_hypothesis=None)
+        np.testing.assert_equal(
+            true_value, np.flip(selection[np.argsort(vi.importances_)])
+        )
+

 @pytest.mark.parametrize(
     "seed",
@@ -246,6 +282,25 @@ def test_not_fit(self, seed):
         ):
             vi.fdr_selection(0.1)

+    def test_selection_fdr_wrong_fdr(self, set_100_variable_sorted):
+        "test selection fdr with wrong fdr"
+        vi = set_100_variable_sorted
+        with pytest.raises(
+            AssertionError,
+            match="FDR needs to be strictly between 0 and 1",
excluded", + ): + vi.fdr_selection(fdr=0.0) + with pytest.raises( + AssertionError, + match="FDR needs to be between 0 and 1 excluded", + ): + vi.fdr_selection(fdr=1.0) + with pytest.raises( + AssertionError, + match="FDR needs to be between 0 and 1 excluded", + ): + vi.fdr_selection(fdr=-1.0) + def test_selection_fdr_pvalue_None(self, set_100_variable_sorted): "test selection fdr without pvalue" vi = set_100_variable_sorted diff --git a/test/test_conditional_feature_importance.py b/test/test_conditional_feature_importance.py index a2eb15d37..89bbf074e 100644 --- a/test/test_conditional_feature_importance.py +++ b/test/test_conditional_feature_importance.py @@ -531,7 +531,6 @@ def test_incompatible_imputer(self, data_generator): imputation_model_continuous="invalid_imputer", method="predict", ) - cfi.fit(X, y) with pytest.raises(AssertionError, match="Categorial imputation model invalid"): cfi = CFI( @@ -539,7 +538,6 @@ def test_incompatible_imputer(self, data_generator): imputation_model_categorical="invalid_imputer", method="predict", ) - cfi.fit(X, y) def test_invalid_groups_format(self, data_generator): """Test when groups are provided in invalid format""" From 3b89e1e991fd7f5004f355544fed367216b2d892 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 2 Oct 2025 19:15:56 +0200 Subject: [PATCH 41/54] change defautl value --- src/hidimstat/base_variable_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index f296e9a1a..08615fdc5 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -157,7 +157,7 @@ def pvalue_selection( self, k_best=None, percentile=None, - threshold_max=None, + threshold_max=0.05, threshold_min=None, alternative_hypothesis=False, ): @@ -170,7 +170,7 @@ def pvalue_selection( Selects the k features with lowest p-values. percentile : float, default=None Selects features based on a specified percentile of p-values. - threshold_max : float, default=None + threshold_max : float, default=0.05 Selects features with p-values below the specified maximum threshold (0 to 1). threshold_min : float, default=None Selects features with p-values above the specified minimum threshold (0 to 1). 
From 79a58b6a81b2526c5a77c58725710fa925cea48b Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 3 Oct 2025 14:30:22 +0200 Subject: [PATCH 42/54] Update src/hidimstat/base_variable_importance.py Co-authored-by: bthirion --- src/hidimstat/base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 08615fdc5..5fb06be52 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -7,7 +7,7 @@ from hidimstat.statistical_tools.multiple_testing import fdr_threshold -def _selection_multy_criteria( +def _selection_multi_criteria( values, k_best=None, percentile=None, threshold_max=None, threshold_min=None ): """ From 7e5442be20df94221c9828f4e58eb978fa3e70f5 Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 3 Oct 2025 14:51:50 +0200 Subject: [PATCH 43/54] Update src/hidimstat/base_variable_importance.py Co-authored-by: bthirion --- src/hidimstat/base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 5fb06be52..2fea45904 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -49,7 +49,7 @@ def _selection_multi_criteria( threshold_max > threshold_min ), "threshold_max needs to be higher than threshold_min " - # base on SelectKBest of Scikit-Learn + # based on SelectKBest in Scikit-Learn if k_best is not None: mask_k_best = np.zeros_like(values, dtype=bool) From 9da36072299a68d98c61b6ba1b59059676e09b2f Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 3 Oct 2025 14:52:24 +0200 Subject: [PATCH 44/54] Update src/hidimstat/base_variable_importance.py Co-authored-by: bthirion --- src/hidimstat/base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 2fea45904..5df4e53e6 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -59,7 +59,7 @@ def _selection_multi_criteria( else: mask_k_best = np.ones_like(values, dtype=bool) - # base on SelectPercentile of Scikit-Learn + # based on SelectPercentile in Scikit-Learn if percentile is not None: threshold_percentile = np.percentile(values, 100 - percentile) mask_percentile = values > threshold_percentile From ed39b3d68810c528ee48f40f97951e589b44183a Mon Sep 17 00:00:00 2001 From: lionel kusch Date: Fri, 3 Oct 2025 14:52:54 +0200 Subject: [PATCH 45/54] Update src/hidimstat/base_variable_importance.py Co-authored-by: Joseph Paillard --- src/hidimstat/base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 5df4e53e6..6fef023ae 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -115,7 +115,7 @@ def __init__(self): def _check_importance(self): """ - Checks if the importance scores and p-values have been computed. + Checks if the importance scores have been computed. 
""" if self.importances_ is None: raise ValueError( From d86644db6069c230ec4a60bae7a3a9ecb89f02b5 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 3 Oct 2025 14:57:36 +0200 Subject: [PATCH 46/54] update following the comments --- src/hidimstat/base_variable_importance.py | 34 +++++++++++++++++------ 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 6fef023ae..c61be7999 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -8,7 +8,12 @@ def _selection_multi_criteria( - values, k_best=None, percentile=None, threshold_max=None, threshold_min=None + values, + k_best=None, + k_lowest=None, + percentile=None, + threshold_max=None, + threshold_min=None, ): """ Helper function for selecting features based on multiple criteria. @@ -19,6 +24,8 @@ def _selection_multi_criteria( Values to use for feature selection (e.g., importance scores or p-values) k_best : int, default=None Selects the top k features based on values. + k_lowest : int, default=None + Selects the lowest k features based on values. percentile : float, default=None Selects features based on a specified percentile of values. threshold_max : float, default=None @@ -28,7 +35,7 @@ def _selection_multi_criteria( Returns ------- - selections : array-like of shape (n_features,) + selection : array-like of shape (n_features,) Boolean array indicating the selected features. """ if k_best is not None: @@ -59,6 +66,15 @@ def _selection_multi_criteria( else: mask_k_best = np.ones_like(values, dtype=bool) + if k_lowest is not None: + mask_k_lowest = np.zeros_like(values, dtype=bool) + + # Request a stable sort. Mergesort takes more memory (~40MB per + # megafeature on x86-64). + mask_k_lowest[np.argsort(values, kind="mergesort")[:mask_k_lowest]] = 1 + else: + mask_k_lowest = np.ones_like(values, dtype=bool) + # based on SelectPercentile in Scikit-Learn if percentile is not None: threshold_percentile = np.percentile(values, 100 - percentile) @@ -81,8 +97,8 @@ def _selection_multi_criteria( else: mask_threshold_min = np.ones_like(values, dtype=bool) - selections = mask_k_best & mask_percentile & mask_threshold_max & mask_threshold_min - return selections + selection = mask_k_best & mask_percentile & mask_threshold_max & mask_threshold_min + return selection class BaseVariableImportance(BaseEstimator): @@ -145,7 +161,7 @@ def importance_selection( Binary array indicating the selected features. """ self._check_importance() - return _selection_multy_criteria( + return _selection_multi_criteria( self.importances_, k_best=k_best, percentile=percentile, @@ -155,7 +171,7 @@ def importance_selection( def pvalue_selection( self, - k_best=None, + k_lowest=None, percentile=None, threshold_max=0.05, threshold_min=None, @@ -166,7 +182,7 @@ def pvalue_selection( Parameters ---------- - k_best : int, default=None + k_lowest : int, default=None Selects the k features with lowest p-values. percentile : float, default=None Selects features based on a specified percentile of p-values. @@ -197,9 +213,9 @@ def pvalue_selection( assert alternative_hypothesis is None or isinstance( alternative_hypothesis, bool ), "alternative_hippothesis can have only three values: True, False and None." 
-        return _selection_multy_criteria(
+        return _selection_multi_criteria(
             self.pvalues_ if not alternative_hypothesis else 1 - self.pvalues_,
-            k_best=k_best,
+            k_lowest=k_lowest,
             percentile=percentile,
             threshold_max=threshold_max,
             threshold_min=threshold_min,

From 981266001a45510d511eead741c7c438e30cbbf3 Mon Sep 17 00:00:00 2001
From: kusch lionel 
Date: Fri, 3 Oct 2025 15:16:24 +0200
Subject: [PATCH 47/54] fix bug

---
 src/hidimstat/distilled_conditional_randomization_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hidimstat/distilled_conditional_randomization_test.py b/src/hidimstat/distilled_conditional_randomization_test.py
index 621bf2822..c3cd1102d 100644
--- a/src/hidimstat/distilled_conditional_randomization_test.py
+++ b/src/hidimstat/distilled_conditional_randomization_test.py
@@ -664,7 +664,7 @@ def d0crt(
     scaled_statistics=False,
     random_state=None,
     reuse_screening_model=True,
-    k_best=None,
+    k_lowest=None,
     percentile=None,
     threshold_min=None,
     threshold_max=None,
@@ -689,7 +689,7 @@ def d0crt(
     )
     methods.fit_importance(X, y, cv=cv)
     selection = methods.pvalue_selection(
-        k_best=k_best,
+        k_lowest=k_lowest,
         percentile=percentile,
         threshold_min=threshold_min,
         threshold_max=threshold_max,

From b28965ceb88344a82bab79419507c684463e9cf8 Mon Sep 17 00:00:00 2001
From: kusch lionel 
Date: Thu, 9 Oct 2025 14:37:42 +0200
Subject: [PATCH 48/54] selection on one criterion

---
 src/hidimstat/base_variable_importance.py | 65 ++++++++++-------------
 1 file changed, 28 insertions(+), 37 deletions(-)

diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py
index 516d42dd2..420b42abc 100644
--- a/src/hidimstat/base_variable_importance.py
+++ b/src/hidimstat/base_variable_importance.py
@@ -9,7 +9,7 @@
 from hidimstat._utils.exception import InternalError


-def _selection_multi_criteria(
+def _selection_generic(
     values,
     k_best=None,
     k_lowest=None,
@@ -40,6 +40,13 @@
     selection : array-like of shape (n_features,)
         Boolean array indicating the selected features.
     """
+    n_criteria = np.sum(
+        [
+            criteria is not None
+            for criteria in [k_best, k_lowest, percentile, threshold_max, threshold_min]
+        ]
+    )
+    assert n_criteria <= 1, "Only support selection based on one criterion."
     if k_best is not None:
         assert k_best >= 1, "k_best needs to be positive or None"
         if k_best > values.shape[0]:
@@ -47,38 +54,27 @@
             f"k={k_best} is greater than n_features={values.shape[0]}. "
             "All the features will be returned."
         )
-    if percentile is not None:
-        assert (
-            0 < percentile < 100
-        ), "percentile must be between 0 and 100 (exclusive). Got {}.".format(
-            percentile
-        )
-    if threshold_max is not None and threshold_min is not None:
-        assert (
-            threshold_max > threshold_min
-        ), "threshold_max needs to be higher than threshold_min "
-
-    # based on SelectKBest in Scikit-Learn
-    if k_best is not None:
         mask_k_best = np.zeros_like(values, dtype=bool)
+        # based on SelectKBest in Scikit-Learn

         # Request a stable sort. Mergesort takes more memory (~40MB per
         # megafeature on x86-64.
         mask_k_best[np.argsort(values, kind="mergesort")[-k_best:]] = 1
-    else:
-        mask_k_best = np.ones_like(values, dtype=bool)
-
-    if k_lowest is not None:
+        return mask_k_best
+    elif k_lowest is not None:
         mask_k_lowest = np.zeros_like(values, dtype=bool)

         # Request a stable sort. Mergesort takes more memory (~40MB per
         # megafeature on x86-64.
         mask_k_lowest[np.argsort(values, kind="mergesort")[:mask_k_lowest]] = 1
-    else:
-        mask_k_lowest = np.ones_like(values, dtype=bool)
-
-    # based on SelectPercentile in Scikit-Learn
-    if percentile is not None:
+        return mask_k_lowest
+    elif percentile is not None:
+        assert (
+            0 < percentile < 100
+        ), "percentile must be between 0 and 100 (exclusive). Got {}.".format(
+            percentile
+        )
+        # based on SelectPercentile in Scikit-Learn
         threshold_percentile = np.percentile(values, 100 - percentile)
         mask_percentile = values > threshold_percentile
         ties = np.where(values == threshold_percentile)[0]
@@ -86,21 +82,16 @@
             max_feats = int(len(values) * percentile / 100)
             kept_ties = ties[: max_feats - mask_percentile.sum()]
             mask_percentile[kept_ties] = True
-    else:
-        mask_percentile = np.ones_like(values, dtype=bool)
-
-    if threshold_max is not None:
+        return mask_percentile
+    elif threshold_max is not None:
         mask_threshold_max = values < threshold_max
-    else:
-        mask_threshold_max = np.ones_like(values, dtype=bool)
-
-    if threshold_min is not None:
+        return mask_threshold_max
+    elif threshold_min is not None:
         mask_threshold_min = values > threshold_min
+        return mask_threshold_min
     else:
-        mask_threshold_min = np.ones_like(values, dtype=bool)
-
-    selection = mask_k_best & mask_percentile & mask_threshold_max & mask_threshold_min
-    return selection
+        no_mask = np.ones_like(values, dtype=bool)
+        return no_mask


 class BaseVariableImportance(BaseEstimator):
@@ -163,7 +154,7 @@
         Binary array indicating the selected features.
     """
     self._check_importance()
-    return _selection_multi_criteria(
+    return _selection_generic(
         self.importances_,
         k_best=k_best,
         percentile=percentile,
@@ -215,7 +206,7 @@
     assert alternative_hypothesis is None or isinstance(
         alternative_hypothesis, bool
     ), "alternative_hippothesis can have only three values: True, False and None."
-    return _selection_multi_criteria(
+    return _selection_generic(
         self.pvalues_ if not alternative_hypothesis else 1 - self.pvalues_,
         k_lowest=k_lowest,
         percentile=percentile,
         threshold_max=threshold_max,
         threshold_min=threshold_min,

From c7e8d6994d47a3a83c280a60679dfd0d8efe7f66 Mon Sep 17 00:00:00 2001
From: kusch lionel 
Date: Thu, 9 Oct 2025 15:11:39 +0200
Subject: [PATCH 49/54] fix tests

---
 test/test_base_variable_importance.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py
index df23d12c1..7f096daa3 100644
--- a/test/test_base_variable_importance.py
+++ b/test/test_base_variable_importance.py
@@ -260,8 +260,7 @@ def test_selection_threshold(self, set_100_variable_sorted):
         ):
             vi.pvalue_selection(threshold_max=1.1)
         with pytest.raises(
-            AssertionError,
-            match="threshold_max needs to be higher than threshold_min",
+            AssertionError, match="Only support selection based on one criterion."
): vi.pvalue_selection(threshold_max=0.5, threshold_min=0.9) From 529d28aa2cdf331a107d3e57e45b18af27dd5843 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 9 Oct 2025 15:20:52 +0200 Subject: [PATCH 50/54] fix format --- src/hidimstat/base_variable_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index 420b42abc..f1a6fc99d 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -5,8 +5,8 @@ import pandas as pd from sklearn.base import BaseEstimator -from hidimstat.statistical_tools.multiple_testing import fdr_threshold from hidimstat._utils.exception import InternalError +from hidimstat.statistical_tools.multiple_testing import fdr_threshold def _selection_generic( From b633e15b1fbc56a8722bbeaf5d18b8c586a4589e Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Thu, 9 Oct 2025 16:04:55 +0200 Subject: [PATCH 51/54] fix k_lowest --- src/hidimstat/base_variable_importance.py | 9 ++++++- test/test_base_variable_importance.py | 30 ++++++++++++++++++++--- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/hidimstat/base_variable_importance.py b/src/hidimstat/base_variable_importance.py index f1a6fc99d..e7b9a7e25 100644 --- a/src/hidimstat/base_variable_importance.py +++ b/src/hidimstat/base_variable_importance.py @@ -62,11 +62,18 @@ def _selection_generic( mask_k_best[np.argsort(values, kind="mergesort")[-k_best:]] = 1 return mask_k_best elif k_lowest is not None: + assert k_lowest >= 1, "k_lowest needs to be positive or None" + if k_lowest > values.shape[0]: + warnings.warn( + f"k={k_lowest} is greater than n_features={values.shape[0]}. " + "All the features will be returned." + ) mask_k_lowest = np.zeros_like(values, dtype=bool) + # based on SelectKBest in Scikit-Learn # Request a stable sort. Mergesort takes more memory (~40MB per # megafeature on x86-64). 
- mask_k_lowest[np.argsort(values, kind="mergesort")[:mask_k_lowest]] = 1 + mask_k_lowest[np.argsort(values, kind="mergesort")[:k_lowest]] = 1 return mask_k_lowest elif percentile is not None: assert ( diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 7f096daa3..b8b4bca1d 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -53,6 +53,20 @@ def test_selection_k_best_none(self, set_100_variable_sorted): selection = vi.importance_selection(k_best=None) np.testing.assert_array_equal(true_value, selection) + def test_selection_k_lowest(self, set_100_variable_sorted): + "test selection of the k_lowest" + vi = set_100_variable_sorted + true_value = vi.pvalues_ < vi.pvalues_[np.argsort(vi.pvalues_)[5]] + selection = vi.pvalue_selection(k_lowest=5, threshold_max=None) + np.testing.assert_array_equal(true_value, selection) + + def test_selection_k_lowest_none(self, set_100_variable_sorted): + "test selection when there none" + vi = set_100_variable_sorted + true_value = np.ones_like(vi.pvalues_ > 0, dtype=bool) + selection = vi.pvalue_selection(k_lowest=None, threshold_max=None) + np.testing.assert_array_equal(true_value, selection) + def test_selection_percentile(self, set_100_variable_sorted): "test selection bae on percentile" vi = set_100_variable_sorted @@ -206,27 +220,35 @@ def test_selection_k_best(self, set_100_variable_sorted): with pytest.warns(Warning, match="k=1000 is greater than n_features="): vi.importance_selection(k_best=1000) + def test_selection_k_lowest(self, set_100_variable_sorted): + "test selection k_lowest wrong" + vi = set_100_variable_sorted + with pytest.raises(AssertionError, match="k_lowest needs to be positive"): + vi.pvalue_selection(k_lowest=-10, threshold_max=None) + with pytest.warns(Warning, match="k=1000 is greater than n_features="): + vi.pvalue_selection(k_lowest=1000, threshold_max=None) + def test_selection_percentile(self, set_100_variable_sorted): "test selection percentile wrong" vi = set_100_variable_sorted with pytest.raises( AssertionError, - match="percentile must be between 0 and 100 \(exclusive\). Got -1.", + match=r"percentile must be between 0 and 100 \(exclusive\). Got -1.", ): vi.importance_selection(percentile=-1) with pytest.raises( AssertionError, - match="percentile must be between 0 and 100 \(exclusive\). Got 102.", + match=r"percentile must be between 0 and 100 \(exclusive\). Got 102.", ): vi.importance_selection(percentile=102) with pytest.raises( AssertionError, - match="percentile must be between 0 and 100 \(exclusive\). Got 0.", + match=r"percentile must be between 0 and 100 \(exclusive\). Got 0.", ): vi.importance_selection(percentile=0) with pytest.raises( AssertionError, - match="percentile must be between 0 and 100 \(exclusive\). Got 100", + match=r"percentile must be between 0 and 100 \(exclusive\). 
Got 100", ): vi.importance_selection(percentile=100) From 246bfb6eecb76d50818f573e1b96bd6e59ecc571 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 16:35:03 +0200 Subject: [PATCH 52/54] remove randomization in tests --- test/test_base_variable_importance.py | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index b8b4bca1d..8807a7306 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -5,7 +5,7 @@ @pytest.fixture -def set_100_variable_sorted(seed): +def set_100_variable_sorted(): """Create a BaseVariableImportance instance with test data for testing purposes. Parameters @@ -22,6 +22,7 @@ def set_100_variable_sorted(seed): BaseVariableImportance A BaseVariableImportance instance with test data. """ + seed = 0 n_features = 100 rng = np.random.RandomState(seed) vi = BaseVariableImportance() @@ -31,11 +32,6 @@ def set_100_variable_sorted(seed): return vi -@pytest.mark.parametrize( - "seed", - [0, 2], - ids=["default_seed", "another seed"], -) class TestSelection: """Test selection based on importance""" @@ -115,11 +111,6 @@ def test_selection_threshold_max(self, set_100_variable_sorted): np.testing.assert_array_equal(true_value, selection) -@pytest.mark.parametrize( - "seed", - [0], - ids=["default_seed"], -) class TestSelectionFDR: """Test selection based on fdr""" @@ -190,15 +181,10 @@ def test_selection_fdr_two_side(self, set_100_variable_sorted): ) -@pytest.mark.parametrize( - "seed", - [0], - ids=["default_seed"], -) class TestBVIExceptions: """Test class for BVI Exception""" - def test_not_fit(self, seed): + def test_not_fit(self): "test detection unfit" vi = BaseVariableImportance() with pytest.raises( @@ -287,13 +273,8 @@ def test_selection_threshold(self, set_100_variable_sorted): vi.pvalue_selection(threshold_max=0.5, threshold_min=0.9) -@pytest.mark.parametrize( - "seed", - [0], - ids=["default_seed"], -) class TestSelectionFDRExceptions: - def test_not_fit(self, seed): + def test_not_fit(self): "test detection unfit" vi = BaseVariableImportance() From 62f71a4ba66af077acfd6c97a2da8fc64be482ce Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 16:36:42 +0200 Subject: [PATCH 53/54] move all the tests for base importance in one file --- test/test_base_importance.py | 74 --------------------------- test/test_base_variable_importance.py | 74 ++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 76 deletions(-) delete mode 100644 test/test_base_importance.py diff --git a/test/test_base_importance.py b/test/test_base_importance.py deleted file mode 100644 index f50487fbb..000000000 --- a/test/test_base_importance.py +++ /dev/null @@ -1,74 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import pytest - -from hidimstat.base_variable_importance import BaseVariableImportance - - -def test_plot_importance_axis(): - """Test argument axis of plot function""" - n_features = 10 - vi = BaseVariableImportance() - # Make the plot independent of data / randomness to test only the plotting function - vi.importances_ = np.arange(n_features) - ax_1 = vi.plot_importance(ax=None) - assert isinstance(ax_1, plt.Axes) - - _, ax_2 = plt.subplots() - vi.importances_ = np.random.standard_normal((3, n_features)) - ax_2_bis = vi.plot_importance(ax=ax_2) - assert isinstance(ax_2_bis, plt.Axes) - assert ax_2_bis == ax_2 - - -def test_plot_importance_ascending(): - """Test argument ascending of 
plot function""" - n_features = 10 - vi = BaseVariableImportance() - - # Make the plot independent of data / randomness to test only the plotting function - vi.importances_ = np.arange(n_features) - np.random.shuffle(vi.importances_) - - ax_decending = vi.plot_importance(ascending=False) - assert np.all( - ax_decending.containers[0].datavalues == np.flip(np.sort(vi.importances_)) - ) - - ax_ascending = vi.plot_importance(ascending=True) - assert np.all(ax_ascending.containers[0].datavalues == np.sort(vi.importances_)) - - -def test_plot_importance_feature_names(): - """Test argument feature of plot function""" - n_features = 10 - vi = BaseVariableImportance() - - # Make the plot independent of data / randomness to test only the plotting function - vi.importances_ = np.arange(n_features) - np.random.shuffle(vi.importances_) - - features_name = [str(j) for j in np.flip(np.argsort(vi.importances_))] - ax_none = vi.plot_importance(feature_names=None) - assert np.all( - np.array([label.get_text() for label in ax_none.get_yticklabels()]) - == features_name - ) - - features_name = ["features_" + str(j) for j in np.flip(np.sort(vi.importances_))] - ax_setup = vi.plot_importance(feature_names=features_name) - assert np.all( - np.array([label.get_text() for label in ax_setup.get_yticklabels()]) - == np.flip(np.array(features_name)[np.argsort(vi.importances_)]) - ) - - vi.features_groups = {str(j * 2): [] for j in np.flip(np.sort(vi.importances_))} - features_name = [str(j * 2) for j in np.flip(np.sort(vi.importances_))] - ax_none_group = vi.plot_importance(feature_names=None) - assert np.all( - np.array([label.get_text() for label in ax_none_group.get_yticklabels()]) - == np.flip(np.array(features_name)[np.argsort(vi.importances_)]) - ) - - with pytest.raises(ValueError, match="feature_names should be a list"): - ax_none_group = vi.plot_importance(feature_names="ttt") diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 8807a7306..220ac4507 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -1,3 +1,4 @@ +import matplotlib.pyplot as plt import numpy as np import pytest @@ -184,7 +185,7 @@ def test_selection_fdr_two_side(self, set_100_variable_sorted): class TestBVIExceptions: """Test class for BVI Exception""" - def test_not_fit(self): + def test_not_fit(self, seed): "test detection unfit" vi = BaseVariableImportance() with pytest.raises( @@ -274,7 +275,7 @@ def test_selection_threshold(self, set_100_variable_sorted): class TestSelectionFDRExceptions: - def test_not_fit(self): + def test_not_fit(self, seed): "test detection unfit" vi = BaseVariableImportance() @@ -321,3 +322,72 @@ def test_selection_fdr_fdr_control(self, set_100_variable_sorted): match="only 'bhq' and 'bhy' are supported", ): vi.fdr_selection(fdr=0.1, fdr_control="ehb") + + +def test_plot_importance_axis(): + """Test argument axis of plot function""" + n_features = 10 + vi = BaseVariableImportance() + # Make the plot independent of data / randomness to test only the plotting function + vi.importances_ = np.arange(n_features) + ax_1 = vi.plot_importance(ax=None) + assert isinstance(ax_1, plt.Axes) + + _, ax_2 = plt.subplots() + vi.importances_ = np.random.standard_normal((3, n_features)) + ax_2_bis = vi.plot_importance(ax=ax_2) + assert isinstance(ax_2_bis, plt.Axes) + assert ax_2_bis == ax_2 + + +def test_plot_importance_ascending(): + """Test argument ascending of plot function""" + n_features = 10 + vi = BaseVariableImportance() + + # Make 
the plot independent of data / randomness to test only the plotting function + vi.importances_ = np.arange(n_features) + np.random.shuffle(vi.importances_) + + ax_decending = vi.plot_importance(ascending=False) + assert np.all( + ax_decending.containers[0].datavalues == np.flip(np.sort(vi.importances_)) + ) + + ax_ascending = vi.plot_importance(ascending=True) + assert np.all(ax_ascending.containers[0].datavalues == np.sort(vi.importances_)) + + +def test_plot_importance_feature_names(): + """Test argument feature of plot function""" + n_features = 10 + vi = BaseVariableImportance() + + # Make the plot independent of data / randomness to test only the plotting function + vi.importances_ = np.arange(n_features) + np.random.shuffle(vi.importances_) + + features_name = [str(j) for j in np.flip(np.argsort(vi.importances_))] + ax_none = vi.plot_importance(feature_names=None) + assert np.all( + np.array([label.get_text() for label in ax_none.get_yticklabels()]) + == features_name + ) + + features_name = ["features_" + str(j) for j in np.flip(np.sort(vi.importances_))] + ax_setup = vi.plot_importance(feature_names=features_name) + assert np.all( + np.array([label.get_text() for label in ax_setup.get_yticklabels()]) + == np.flip(np.array(features_name)[np.argsort(vi.importances_)]) + ) + + vi.features_groups = {str(j * 2): [] for j in np.flip(np.sort(vi.importances_))} + features_name = [str(j * 2) for j in np.flip(np.sort(vi.importances_))] + ax_none_group = vi.plot_importance(feature_names=None) + assert np.all( + np.array([label.get_text() for label in ax_none_group.get_yticklabels()]) + == np.flip(np.array(features_name)[np.argsort(vi.importances_)]) + ) + + with pytest.raises(ValueError, match="feature_names should be a list"): + ax_none_group = vi.plot_importance(feature_names="ttt") From f10bf0677c6596440d045f6db5b957b8a5ef0698 Mon Sep 17 00:00:00 2001 From: kusch lionel Date: Fri, 10 Oct 2025 16:51:17 +0200 Subject: [PATCH 54/54] fix seed --- test/test_base_variable_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_base_variable_importance.py b/test/test_base_variable_importance.py index 220ac4507..a507f26de 100644 --- a/test/test_base_variable_importance.py +++ b/test/test_base_variable_importance.py @@ -185,7 +185,7 @@ def test_selection_fdr_two_side(self, set_100_variable_sorted): class TestBVIExceptions: """Test class for BVI Exception""" - def test_not_fit(self, seed): + def test_not_fit(self): "test detection unfit" vi = BaseVariableImportance() with pytest.raises( @@ -275,7 +275,7 @@ def test_selection_threshold(self, set_100_variable_sorted): class TestSelectionFDRExceptions: - def test_not_fit(self, seed): + def test_not_fit(self): "test detection unfit" vi = BaseVariableImportance()
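
(Usage sketch, not part of the patch series.) With the full series applied,
`_selection_generic` enforces a single selection criterion per call. A short
sketch of the resulting calling convention, on hand-set arrays with hypothetical
values; in real use `importances_` and `pvalues_` are filled by a concrete
method's `fit_importance`:

    import numpy as np

    from hidimstat.base_variable_importance import BaseVariableImportance

    vi = BaseVariableImportance()
    vi.importances_ = np.array([0.1, 0.4, 0.2, 0.9, 0.3])   # hypothetical values
    vi.pvalues_ = np.array([0.30, 0.02, 0.20, 0.001, 0.04])

    top2 = vi.importance_selection(k_best=2)   # features 1 and 3 (0.4 and 0.9)
    low2 = vi.pvalue_selection(k_lowest=2, threshold_max=None)  # features 1 and 3
    sig = vi.pvalue_selection()   # default threshold_max=0.05: features 1, 3, 4

    # Passing two criteria in one call now raises an AssertionError:
    # vi.importance_selection(k_best=2, percentile=50)

Note that `threshold_max=None` must be passed explicitly when another p-value
criterion is used, since its default is 0.05.

For reference, the two-sided behaviour that PATCH 40 settles on (the union of the
two one-sided selections, with empty masks for the unused side) can be reproduced
standalone. `bh_threshold` below is a simplified stand-in for the library's
`fdr_threshold`, not its actual implementation:

    import numpy as np

    def bh_threshold(pvalues, fdr):
        """Benjamini-Hochberg threshold: the largest ordered p-value p_(k)
        such that p_(k) <= k * fdr / n, or -1 when no feature passes."""
        n = pvalues.size
        ordered = np.sort(pvalues)
        passed = ordered <= fdr * np.arange(1, n + 1) / n
        return ordered[passed][-1] if passed.any() else -1.0

    def fdr_select(pvalues, fdr=0.1, alternative_hypothesis=False):
        """Mirror of fdr_selection after PATCH 40: False keeps small p-values,
        True keeps large ones (tested as 1 - p), None takes the union."""
        selected_low = np.zeros_like(pvalues, dtype=bool)
        selected_high = np.zeros_like(pvalues, dtype=bool)
        if alternative_hypothesis in (None, False):
            selected_low = pvalues <= bh_threshold(pvalues, fdr)
        if alternative_hypothesis in (None, True):
            selected_high = (1 - pvalues) <= bh_threshold(1 - pvalues, fdr)
        return selected_low | selected_high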