Commit 65c8079

MNT synchronize imblearn.pipeline with sklearn.pipeline (#620)

2 parents cb45ad0 + 1d2d73c

File tree: 4 files changed, +188 -32 lines


doc/whats_new/v0.6.rst

Lines changed: 3 additions & 0 deletions
@@ -31,6 +31,9 @@ Maintenance
    :class:`sklearn.utils._testing.SkipTest`.
    :pr:`617` by :user:`Guillaume Lemaitre <glemaitre>`.

+- Synchronize :mod:`imblearn.pipeline` with :mod:`sklearn.pipeline`.
+  :pr:`617` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Deprecation
 ...........


imblearn/pipeline.py

Lines changed: 99 additions & 26 deletions
@@ -15,6 +15,7 @@

 from sklearn import pipeline
 from sklearn.base import clone
+from sklearn.utils import Bunch, _print_elapsed_time
 from sklearn.utils.metaestimators import if_delegate_has_method
 from sklearn.utils.validation import check_memory

@@ -57,10 +58,13 @@ class Pipeline(pipeline.Pipeline):
         inspect estimators within the pipeline. Caching the
         transformers is advantageous when fitting is time consuming.

+    verbose : boolean, optional (default=False)
+        If True, the time elapsed while fitting each step will be printed as it
+        is completed.
+
     Attributes
     ----------
-    named_steps : dict
+    named_steps : bunch object, a dictionary with attribute access
         Read-only attribute to access any step parameter by user given name.
         Keys are step names and values are steps parameters.
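Note: taken together, these two docstring updates track behaviour inherited from scikit-learn: `named_steps` is a `Bunch` (attribute access in addition to key access), and `verbose=True` prints per-step timing. A minimal sketch of both, with illustrative step names and toy data (the timing lines follow the format asserted by the new tests further down):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
pipe = Pipeline(
    [("rus", RandomUnderSampler(random_state=0)),
     ("clf", LogisticRegression())],
    verbose=True,
)
pipe.fit(X, y)
# [Pipeline] ....... (step 1 of 2) Processing rus, total=   0.0s
# [Pipeline] ....... (step 2 of 2) Processing clf, total=   0.0s

# named_steps is a Bunch: attribute and key access return the same step
assert pipe.named_steps.rus is pipe.named_steps["rus"]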
@@ -178,9 +182,23 @@ def _fit(self, X, y=None, **fit_params):
             name: {} for name, step in self.steps if step is not None
         }
         for pname, pval in fit_params.items():
+            if '__' not in pname:
+                raise ValueError(
+                    "Pipeline.fit does not accept the {} parameter. "
+                    "You can pass parameters to specific steps of your "
+                    "pipeline using the stepname__parameter format, e.g. "
+                    "`Pipeline.fit(X, y, logisticregression__sample_weight"
+                    "=sample_weight)`.".format(pname))
             step, param = pname.split("__", 1)
             fit_params_steps[step][param] = pval
-        for step_idx, name, transformer in self._iter(with_final=False):
+        for (step_idx,
+             name,
+             transformer) in self._iter(with_final=False,
+                                        filter_passthrough=False):
+            if (transformer is None or transformer == 'passthrough'):
+                with _print_elapsed_time('Pipeline',
+                                         self._log_message(step_idx)):
+                    continue
             if hasattr(memory, "location"):
                 # joblib >= 0.12
                 if memory.location is None:
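Note: the new guard replaces the bare unpacking error that `pname.split("__", 1)` used to raise when a fit parameter was not addressed to a step. A quick sketch of the failure and the corrected call (toy data; the single step is auto-named `logisticregression` by `make_pipeline`):

from sklearn.linear_model import LogisticRegression

from imblearn.pipeline import make_pipeline

clf = make_pipeline(LogisticRegression())
try:
    # unrouted parameter: now rejected with an explicit message
    clf.fit([[0], [1]], [0, 1], sample_weight=[1, 1])
except ValueError as exc:
    print(exc)  # "Pipeline.fit does not accept the sample_weight parameter..."

# route the parameter to its step with the stepname__parameter syntax
clf.fit([[0], [1]], [0, 1], logisticregression__sample_weight=[1, 1])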
@@ -202,11 +220,17 @@ def _fit(self, X, y=None, **fit_params):
                 cloned_transformer, "fit_transform"
             ):
                 X, fitted_transformer = fit_transform_one_cached(
-                    cloned_transformer, None, X, y, **fit_params_steps[name]
+                    cloned_transformer, X, y, None,
+                    message_clsname='Pipeline',
+                    message=self._log_message(step_idx),
+                    **fit_params_steps[name]
                 )
             elif hasattr(cloned_transformer, "fit_resample"):
                 X, y, fitted_transformer = fit_resample_one_cached(
-                    cloned_transformer, X, y, **fit_params_steps[name]
+                    cloned_transformer, X, y,
+                    message_clsname='Pipeline',
+                    message=self._log_message(step_idx),
+                    **fit_params_steps[name]
                 )
             # Replace the transformer of the step with the fitted
             # transformer. This is necessary when loading the transformer
@@ -245,8 +269,10 @@ def fit(self, X, y=None, **fit_params):

         """
         Xt, yt, fit_params = self._fit(X, y, **fit_params)
-        if self._final_estimator != "passthrough":
-            self._final_estimator.fit(Xt, yt, **fit_params)
+        with _print_elapsed_time('Pipeline',
+                                 self._log_message(len(self.steps) - 1)):
+            if self._final_estimator != "passthrough":
+                self._final_estimator.fit(Xt, yt, **fit_params)
         return self

     def fit_transform(self, X, y=None, **fit_params):
@@ -279,12 +305,14 @@ def fit_transform(self, X, y=None, **fit_params):
         """
         last_step = self._final_estimator
         Xt, yt, fit_params = self._fit(X, y, **fit_params)
-        if last_step == "passthrough":
-            return Xt
-        elif hasattr(last_step, "fit_transform"):
-            return last_step.fit_transform(Xt, yt, **fit_params)
-        else:
-            return last_step.fit(Xt, yt, **fit_params).transform(Xt)
+        with _print_elapsed_time('Pipeline',
+                                 self._log_message(len(self.steps) - 1)):
+            if last_step == "passthrough":
+                return Xt
+            elif hasattr(last_step, "fit_transform"):
+                return last_step.fit_transform(Xt, yt, **fit_params)
+            else:
+                return last_step.fit(Xt, yt, **fit_params).transform(Xt)

     def fit_resample(self, X, y=None, **fit_params):
         """Fit the model and sample with the final estimator
@@ -319,10 +347,12 @@
         """
         last_step = self._final_estimator
         Xt, yt, fit_params = self._fit(X, y, **fit_params)
-        if last_step == "passthrough":
-            return Xt
-        elif hasattr(last_step, "fit_resample"):
-            return last_step.fit_resample(Xt, yt, **fit_params)
+        with _print_elapsed_time('Pipeline',
+                                 self._log_message(len(self.steps) - 1)):
+            if last_step == "passthrough":
+                return Xt
+            elif hasattr(last_step, "fit_resample"):
+                return last_step.fit_resample(Xt, yt, **fit_params)

     @if_delegate_has_method(delegate="_final_estimator")
     def predict(self, X, **predict_params):
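Note: the return value of `fit_resample` is unchanged; only the timing context is new. A small sketch for a pipeline whose final step is a sampler (toy data, illustrative):

from collections import Counter

from sklearn.datasets import make_classification

from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
pipe = make_pipeline(RandomUnderSampler(random_state=0))
X_res, y_res = pipe.fit_resample(X, y)
print(Counter(y_res))  # both classes reduced to the minority count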
@@ -384,7 +414,10 @@ def fit_predict(self, X, y=None, **fit_params):
         y_pred : array-like
         """
         Xt, yt, fit_params = self._fit(X, y, **fit_params)
-        return self.steps[-1][-1].fit_predict(Xt, yt, **fit_params)
+        with _print_elapsed_time('Pipeline',
+                                 self._log_message(len(self.steps) - 1)):
+            y_pred = self.steps[-1][-1].fit_predict(Xt, yt, **fit_params)
+        return y_pred

     @if_delegate_has_method(delegate="_final_estimator")
     def predict_proba(self, X):
@@ -575,22 +608,55 @@ def score(self, X, y=None, sample_weight=None):
             score_params["sample_weight"] = sample_weight
         return self.steps[-1][-1].score(Xt, y, **score_params)

+    @if_delegate_has_method(delegate='_final_estimator')
+    def score_samples(self, X):
+        """Apply transforms, and score_samples of the final estimator.
+
+        Parameters
+        ----------
+        X : iterable
+            Data to predict on. Must fulfill input requirements of first step
+            of the pipeline.
+
+        Returns
+        -------
+        y_score : ndarray, shape (n_samples,)
+        """
+        Xt = X
+        for _, _, transformer in self._iter(with_final=False):
+            if hasattr(transformer, "fit_resample"):
+                pass
+            else:
+                Xt = transformer.transform(Xt)
+        return self.steps[-1][-1].score_samples(Xt)


-def _fit_transform_one(transformer, weight, X, y, **fit_params):
-    if hasattr(transformer, "fit_transform"):
-        res = transformer.fit_transform(X, y, **fit_params)
-    else:
-        res = transformer.fit(X, y, **fit_params).transform(X)
+def _fit_transform_one(transformer,
+                       X,
+                       y,
+                       weight,
+                       message_clsname='',
+                       message=None,
+                       **fit_params):
+    with _print_elapsed_time(message_clsname, message):
+        if hasattr(transformer, "fit_transform"):
+            res = transformer.fit_transform(X, y, **fit_params)
+        else:
+            res = transformer.fit(X, y, **fit_params).transform(X)
     # if we have a weight for this transformer, multiply output
     if weight is None:
         return res, transformer
     return res * weight, transformer


-def _fit_resample_one(sampler, X, y, **fit_params):
-    X_res, y_res = sampler.fit_resample(X, y, **fit_params)
+def _fit_resample_one(sampler,
+                      X,
+                      y,
+                      message_clsname='',
+                      message=None,
+                      **fit_params):
+    with _print_elapsed_time(message_clsname, message):
+        X_res, y_res = sampler.fit_resample(X, y, **fit_params)

-    return X_res, y_res, sampler
+        return X_res, y_res, sampler


 def make_pipeline(*steps, **kwargs):
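Note: the new `score_samples` transforms `X` with every non-sampler step and delegates to the final estimator; samplers are skipped because resampling only applies to the training set, never at prediction time. A sketch mirroring the new test below (estimator choices illustrative):

from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

X, y = load_iris(return_X_y=True)
pipe = Pipeline([
    ("rus", RandomUnderSampler(random_state=0)),
    ("pca", PCA(n_components=2)),
    ("lof", LocalOutlierFactor(novelty=True)),
])
pipe.fit(X, y)  # the sampler resamples here, during fit only
scores = pipe.score_samples(X)  # the sampler is bypassed here
assert scores.shape == (X.shape[0],)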
@@ -614,6 +680,10 @@ def make_pipeline(*steps, **kwargs):
         inspect estimators within the pipeline. Caching the
         transformers is advantageous when fitting is time consuming.

+    verbose : boolean, optional (default=False)
+        If True, the time elapsed while fitting each step will be printed as it
+        is completed.
+
     Returns
     -------
     p : Pipeline
@@ -637,8 +707,11 @@ def make_pipeline(*steps, **kwargs):
             verbose=False)
     """
     memory = kwargs.pop("memory", None)
+    verbose = kwargs.pop('verbose', False)
     if kwargs:
         raise TypeError(
             'Unknown keyword arguments: "{}"'.format(list(kwargs.keys())[0])
         )
-    return Pipeline(pipeline._name_estimators(steps), memory=memory)
+    return Pipeline(
+        pipeline._name_estimators(steps), memory=memory, verbose=verbose
+    )
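Note: because `verbose` is popped from `kwargs` before the unknown-keyword check, `make_pipeline` forwards it without weakening the existing `TypeError` for stray arguments. For example:

from sklearn.linear_model import LogisticRegression

from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

# steps are auto-named "randomundersampler" and "logisticregression"
pipe = make_pipeline(RandomUnderSampler(), LogisticRegression(), verbose=True)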

imblearn/tests/test_pipeline.py

Lines changed: 86 additions & 5 deletions
@@ -5,9 +5,11 @@
 # Christos Aridas
 # License: MIT

-from tempfile import mkdtemp
+import itertools
+import re
 import shutil
 import time
+from tempfile import mkdtemp

 import numpy as np
 import pytest
@@ -29,12 +31,12 @@
 from sklearn.feature_selection import SelectKBest, f_classif
 from sklearn.datasets import load_iris, make_classification
 from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import FeatureUnion

+from imblearn.datasets import make_imbalance
 from imblearn.pipeline import Pipeline, make_pipeline
-from imblearn.under_sampling import (
-    RandomUnderSampler,
-    EditedNearestNeighbours as ENN,
-)
+from imblearn.under_sampling import RandomUnderSampler
+from imblearn.under_sampling import EditedNearestNeighbours as ENN


 JUNK_FOOD_DOCS = (
@@ -1261,3 +1263,82 @@ def test_score_samples_on_pipeline_without_score_samples():
         "'score_samples'",
     ):
         pipe.score_samples(X)
+
+
+def test_pipeline_param_error():
+    clf = make_pipeline(LogisticRegression())
+    with pytest.raises(ValueError, match="Pipeline.fit does not accept "
+                                         "the sample_weight parameter"):
+        clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1])
+
+
+parameter_grid_test_verbose = ((est, pattern, method) for
+                               (est, pattern), method in itertools.product(
+    [
+        (Pipeline([('transf', Transf()), ('clf', FitParamT())]),
+         r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n'
+         r'\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$'),
+        (Pipeline([('transf', Transf()), ('noop', None),
+                   ('clf', FitParamT())]),
+         r'\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n'
+         r'\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n'
+         r'\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$'),
+        (Pipeline([('transf', Transf()), ('noop', 'passthrough'),
+                   ('clf', FitParamT())]),
+         r'\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n'
+         r'\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n'
+         r'\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$'),
+        (Pipeline([('transf', Transf()), ('clf', None)]),
+         r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n'
+         r'\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$'),
+        (Pipeline([('transf', None), ('mult', Mult())]),
+         r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n'
+         r'\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$'),
+        (Pipeline([('transf', 'passthrough'), ('mult', Mult())]),
+         r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n'
+         r'\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$'),
+        (FeatureUnion([('mult1', Mult()), ('mult2', Mult())]),
+         r'\[FeatureUnion\].*\(step 1 of 2\) Processing mult1.* total=.*\n'
+         r'\[FeatureUnion\].*\(step 2 of 2\) Processing mult2.* total=.*\n$'),
+        (FeatureUnion([('mult1', 'drop'), ('mult2', Mult()),
+                       ('mult3', 'drop')]),
+         r'\[FeatureUnion\].*\(step 1 of 1\) Processing mult2.* total=.*\n$')
+    ], ['fit', 'fit_transform', 'fit_predict'])
+    if hasattr(est, method) and not (
+        method == 'fit_transform' and hasattr(est, 'steps') and
+        isinstance(est.steps[-1][1], FitParamT))
+)
+
+
+@pytest.mark.parametrize('est, pattern, method', parameter_grid_test_verbose)
+def test_verbose(est, method, pattern, capsys):
+    func = getattr(est, method)
+
+    X = [[1, 2, 3], [4, 5, 6]]
+    y = [[7], [8]]
+
+    est.set_params(verbose=False)
+    func(X, y)
+    assert not capsys.readouterr().out, 'Got output for verbose=False'
+
+    est.set_params(verbose=True)
+    func(X, y)
+    assert re.match(pattern, capsys.readouterr().out)
+
+
+def test_pipeline_score_samples_pca_lof():
+    X, y = load_iris(return_X_y=True)
+    sampling_strategy = {0: 50, 1: 30, 2: 20}
+    X, y = make_imbalance(X, y, sampling_strategy=sampling_strategy)
+    # Test that the score_samples method is implemented on a pipeline.
+    # Test that the score_samples method on pipeline yields same results as
+    # applying transform and score_samples steps separately.
+    rus = RandomUnderSampler()
+    pca = PCA(svd_solver='full', n_components='mle', whiten=True)
+    lof = LocalOutlierFactor(novelty=True)
+    pipe = Pipeline([('rus', rus), ('pca', pca), ('lof', lof)])
+    pipe.fit(X, y)
+    # Check the shapes
+    assert pipe.score_samples(X).shape == (X.shape[0],)
+    # Check the values
+    lof.fit(pca.fit_transform(X))
+    assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X)))
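Note: outside pytest, the behaviour checked by `test_verbose` can be reproduced with `redirect_stdout` in place of the `capsys` fixture; a standalone sketch against the same output format:

import re
from contextlib import redirect_stdout
from io import StringIO

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(weights=[0.8, 0.2], random_state=0)
pipe = make_pipeline(RandomUnderSampler(), LogisticRegression(), verbose=True)

buf = StringIO()
with redirect_stdout(buf):
    pipe.fit(X, y)
assert re.match(
    r"\[Pipeline\].*\(step 1 of 2\) Processing randomundersampler.* total=.*\n"
    r"\[Pipeline\].*\(step 2 of 2\) Processing logisticregression.* total=.*\n$",
    buf.getvalue(),
)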

setup.cfg

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ addopts =
     --ignore examples
     --ignore maint_tools
     --doctest-modules
-    --disable-pytest-warnings
     -rs

 filterwarnings =
