ENH make RandomUnderSampler accept dask array

glemaitre · glemaitre · commit 95247e6e1fe7 · 2020-11-05T20:14:23.000+01:00
diff --git a/imblearn/dask/__init__.py b/imblearn/dask/__init__.py
diff --git a/imblearn/dask/_support.py b/imblearn/dask/_support.py
@@ -0,0 +1,13 @@
+_REGISTERED_DASK_CONTAINER = []  # dask types recognised by is_dask_container
+
+try:
+    from dask import array, dataframe
+    _REGISTERED_DASK_CONTAINER += [
+        array.Array, dataframe.Series, dataframe.DataFrame,
+    ]
+except ImportError:
+    pass  # dask could not be imported: the registry stays empty
+
+
+def is_dask_container(container):  # True only for registered dask types
+    return isinstance(container, tuple(_REGISTERED_DASK_CONTAINER))
diff --git a/imblearn/dask/tests/__init__.py b/imblearn/dask/tests/__init__.py
diff --git a/imblearn/dask/tests/test_utils.py b/imblearn/dask/tests/test_utils.py
@@ -0,0 +1,47 @@
+import numpy as np
+import pytest
+from dask import array
+from dask_ml.datasets import make_classification
+
+from imblearn.dask.utils import is_multilabel
+from imblearn.dask.utils import type_of_target
+
+
+def test_type_of_target_error():
+    y = np.arange(10)
+    # a NumPy array is not a dask container, so it is expected to be rejected
+    err_msg = "Expected a Dask array, series or dataframe."
+    with pytest.raises(ValueError, match=err_msg):
+        type_of_target(y)
+
+
+@pytest.mark.parametrize(
+    "y, expected_result",
+    [
+        (array.from_array(np.array([0, 1, 0, 1])), False),  # 1d target
+        (array.from_array(np.array([[1, 0], [0, 0]])), True),
+        (array.from_array(np.array([[1], [0], [0]])), False),  # one column
+        (array.from_array(np.array([[1, 0, 0]])), True),
+    ]
+)
+def test_is_multilabel(y, expected_result):
+    assert is_multilabel(y) is expected_result
+
+
+@pytest.mark.parametrize(
+    "y, expected_type_of_target",
+    [
+        (array.from_array(np.array([[1, 0], [0, 0]])), "multilabel-indicator"),
+        (array.from_array(np.array([[1, 0, 0]])), "multilabel-indicator"),
+        (array.from_array(np.array([[[1, 2]]])), "unknown"),  # 3d
+        (array.from_array(np.array([[]])), "unknown"),  # no column
+        (array.from_array(np.array([.1, .2, 3])), "continuous"),
+        (array.from_array(np.array([[.1, .2, 3]])), "continuous-multioutput"),
+        (array.from_array(np.array([[1., .2]])), "continuous-multioutput"),
+        (array.from_array(np.array([1, 2])), "binary"),
+        (array.from_array(np.array(["a", "b"])), "binary"),
+    ]
+)
+def test_type_of_target(y, expected_type_of_target):
+    target_type = type_of_target(y)
+    assert target_type == expected_type_of_target
diff --git a/imblearn/dask/utils.py b/imblearn/dask/utils.py
@@ -0,0 +1,66 @@
+import warnings
+
+from dask import dataframe
+from dask import array
+from sklearn.exceptions import DataConversionWarning
+from sklearn.utils.multiclass import _is_integral_float
+
+
+def is_multilabel(y):
+    """Tell whether `y` is 2d, multi-column and holds at most two labels."""
+    if y.ndim != 2 or not y.shape[1] > 1:
+        return False
+    # the distinct values have to be materialised before being counted
+    labels = array.unique(y).compute()
+    if len(labels) >= 3:
+        return False
+    return y.dtype.kind in 'biu' or _is_integral_float(labels)
+
+
+def type_of_target(y):
+    """Return the target type of the dask container `y` as a string.
+
+    Raise a ValueError when `y` is not a dask array, series or dataframe.
+    """
+    dask_types = (array.Array, dataframe.Series, dataframe.DataFrame)
+    if not isinstance(y, dask_types):
+        raise ValueError("Expected a Dask array, series or dataframe.")
+
+    if is_multilabel(y):
+        return 'multilabel-indicator'
+
+    if y.ndim > 2 or (y.ndim == 2 and y.shape[1] == 0):
+        return 'unknown'  # [[[1, 2]]] or [[]]
+
+    # [[1, 2], [1, 2]] is multioutput; [1, 2, 3] or [[1], [2], [3]] is not
+    suffix = "-multioutput" if y.ndim == 2 and y.shape[1] > 1 else ""
+
+    # check float and contains non-integer float values
+    if y.dtype.kind == 'f' and array.any(y != y.astype(int)):
+        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
+        # NOTE: we don't check for infinite values
+        return 'continuous' + suffix
+
+    labels = array.unique(y).compute()
+    if len(labels) > 2 or (y.ndim >= 2 and len(y[0]) > 1):
+        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
+        return 'multiclass' + suffix
+    return 'binary'  # [1, 2] or [["a"], ["b"]]
+
+
+def column_or_1d(y, *, warn=False):
+    """Ravel a 1d or single-column `y`; raise ValueError for other shapes."""
+    shape = y.shape
+    is_column = len(shape) == 2 and shape[1] == 1
+    if len(shape) != 1 and not is_column:
+        raise ValueError(
+            f"y should be a 1d array. Got an array of shape {shape} instead."
+        )
+    # only a column vector triggers the (optional) conversion warning
+    if is_column and warn:
+        warnings.warn(
+            "A column-vector y was passed when a 1d array was  expected. "
+            "Please change the shape of y to (n_samples, ), for example "
+            "using ravel().", DataConversionWarning, stacklevel=2
+        )
+    return y.ravel()
diff --git a/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py b/imblearn/under_sampling/_prototype_selection/_random_under_sampler.py
@@ -10,6 +10,7 @@
 from sklearn.utils import _safe_indexing
 
 from ..base import BaseUnderSampler
+from ...dask._support import is_dask_container
 from ...utils import check_target_type
 from ...utils import Substitution
 from ...utils._docstring import _random_state_docstring
@@ -80,44 +81,66 @@ def __init__(
         self.replacement = replacement
 
     def _check_X_y(self, X, y):
-        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
-        X, y = self._validate_data(
-            X, y, reset=True, accept_sparse=["csr", "csc"], dtype=None,
-            force_all_finite=False,
+        y, binarize_y, self._uniques = check_target_type(
+            y,
+            indicate_one_vs_all=True,
+            return_unique=True,
         )
+        if not any([is_dask_container(arr) for arr in (X, y)]):
+            X, y = self._validate_data(
+                X,
+                y,
+                reset=True,
+                accept_sparse=["csr", "csc"],
+                dtype=None,
+                force_all_finite=False,
+            )
         return X, y, binarize_y
 
+    @staticmethod
+    def _find_target_class_indices(y, target_class):
+        """Return the flat indices where `y` equals `target_class`."""
+        indices = np.flatnonzero(y == target_class)
+        # a dask `y` gives a lazy result that has to be materialised
+        return indices.compute() if is_dask_container(y) else indices
+
     def _fit_resample(self, X, y):
         random_state = check_random_state(self.random_state)
 
-        idx_under = np.empty((0,), dtype=int)
+        idx_under = []
 
-        for target_class in np.unique(y):
+        for target_class in self._uniques:
+            target_class_indices = self._find_target_class_indices(
+                y, target_class
+            )
             if target_class in self.sampling_strategy_.keys():
                 n_samples = self.sampling_strategy_[target_class]
                 index_target_class = random_state.choice(
-                    range(np.count_nonzero(y == target_class)),
+                    target_class_indices.size,
                     size=n_samples,
                     replace=self.replacement,
                 )
             else:
                 index_target_class = slice(None)
 
-            idx_under = np.concatenate(
-                (
-                    idx_under,
-                    np.flatnonzero(y == target_class)[index_target_class],
-                ),
-                axis=0,
-            )
+            selected_indices = target_class_indices[index_target_class]
+            idx_under.append(selected_indices)
 
-        self.sample_indices_ = idx_under
+        self.sample_indices_ = np.hstack(idx_under)
+        self.sample_indices_.sort()
 
-        return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under)
+        return (
+            _safe_indexing(X, self.sample_indices_),
+            _safe_indexing(y, self.sample_indices_)
+        )
 
     def _more_tags(self):
         return {
-            "X_types": ["2darray", "string"],
+            "X_types": [
+                "2darray",
+                "string",
+                "dask-array",
+            ],
             "sample_indices": True,
             "allow_nan": True,
         }
diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py
@@ -14,10 +14,13 @@
 from sklearn.base import clone
 from sklearn.neighbors._base import KNeighborsMixin
 from sklearn.neighbors import NearestNeighbors
-from sklearn.utils import column_or_1d
-from sklearn.utils.multiclass import type_of_target
 
+from ..dask._support import is_dask_container
 from ..exceptions import raise_isinstance_error
+from .wrapper import _is_multiclass_encoded
+from .wrapper import column_or_1d
+from .wrapper import type_of_target
+from .wrapper import unique
 
 SAMPLING_KIND = (
     "over-sampling",
@@ -99,10 +102,12 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0):
 
 def _count_class_sample(y):
     unique, counts = np.unique(y, return_counts=True)
+    if is_dask_container(unique):
+        unique, counts = unique.compute(), counts.compute()
     return dict(zip(unique, counts))
 
 
-def check_target_type(y, indicate_one_vs_all=False):
+def check_target_type(y, indicate_one_vs_all=False, return_unique=False):
     """Check the target types to be conform to the current samplers.
 
     The current samplers should be compatible with ``'binary'``,
@@ -116,18 +121,24 @@ def check_target_type(y, indicate_one_vs_all=False):
     indicate_one_vs_all : bool, default=False
         Either to indicate if the targets are encoded in a one-vs-all fashion.
 
+    return_unique : bool, default=False
+        Either to return or not the unique values in y.
+
     Returns
     -------
     y : ndarray
         The returned target.
 
     is_one_vs_all : bool, optional
         Indicate if the target was originally encoded in a one-vs-all fashion.
         Only returned if ``indicate_multilabel=True``.
+
+    y_unique : ndarray, optional
+        The unique values in `y`. Only returned if ``return_unique=True``.
     """
     type_y = type_of_target(y)
     if type_y == "multilabel-indicator":
-        if np.any(y.sum(axis=1) > 1):
+        if not _is_multiclass_encoded(y):
             raise ValueError(
                 "Imbalanced-learn currently supports binary, multiclass and "
                 "binarized encoded multiclasss targets. Multilabel and "
@@ -137,7 +148,13 @@ def check_target_type(y, indicate_one_vs_all=False):
     else:
         y = column_or_1d(y)
 
-    return (y, type_y == "multilabel-indicator") if indicate_one_vs_all else y
+    # Bare `y` when no extra is requested (as before), otherwise a tuple.
+    output = [y]
+    if indicate_one_vs_all:
+        output.append(type_y == "multilabel-indicator")
+    if return_unique:
+        output.append(unique(y))
+    return output[0] if len(output) == 1 else tuple(output)
 
 
 def _sampling_strategy_all(y, sampling_type):
diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py
@@ -51,13 +51,16 @@ def _set_checking_parameters(estimator):
 
 
 def _yield_sampler_checks(sampler):
+    tags = sampler._get_tags()
     yield check_target_type
     yield check_samplers_one_label
     yield check_samplers_fit
     yield check_samplers_fit_resample
     yield check_samplers_sampling_strategy_fit_resample
     yield check_samplers_sparse
     yield check_samplers_pandas
+    if "dask-array" in tags["X_types"]:
+        yield check_samplers_dask_array
     yield check_samplers_list
     yield check_samplers_multiclass_ova
     yield check_samplers_preserve_dtype
@@ -290,6 +293,30 @@ def check_samplers_pandas(name, sampler):
     assert_allclose(y_res_s.to_numpy(), y_res)
 
 
+def check_samplers_dask_array(name, sampler):
+    # Check that the samplers resample dask arrays like NumPy arrays
+    dask_array = pytest.importorskip("dask.array")
+    X, y = make_classification(
+        n_samples=1000,
+        n_classes=3,
+        n_informative=4,
+        weights=[0.2, 0.3, 0.5],
+        random_state=0,
+    )
+    X_dask = dask_array.from_array(X, chunks=100)
+    y_dask = dask_array.from_array(y, chunks=100)
+
+    X_res_dask, y_res_dask = sampler.fit_resample(X_dask, y_dask)
+    X_res, y_res = sampler.fit_resample(X, y)
+
+    # check that dask inputs are resampled into dask arrays
+    assert isinstance(X_res_dask, dask_array.Array)
+    assert isinstance(y_res_dask, dask_array.Array)
+
+    assert_allclose(X_res_dask, X_res)
+    assert_allclose(y_res_dask, y_res)
+
+
 def check_samplers_list(name, sampler):
     # Check that the can samplers handle simple lists
     X, y = make_classification(
diff --git a/imblearn/utils/wrapper.py b/imblearn/utils/wrapper.py
@@ -0,0 +1,37 @@
+import numpy as np
+
+from sklearn.utils.multiclass import type_of_target as sklearn_type_of_target
+from sklearn.utils.validation import column_or_1d as sklearn_column_or_1d
+
+from ..dask._support import is_dask_container
+
+
+def type_of_target(y):
+    """Return the target type of `y`; the dask helper is imported lazily."""
+    if not is_dask_container(y):
+        return sklearn_type_of_target(y)
+    from ..dask.utils import type_of_target as dask_type_of_target
+    return dask_type_of_target(y)
+
+
+def _is_multiclass_encoded(y):
+    """Check that every row of the indicator matrix `y` sums to one."""
+    if not is_dask_container(y):
+        return np.all(y.sum(axis=1) == 1)
+    from dask import array
+    return array.all(y.sum(axis=1) == 1).compute()
+
+
+def column_or_1d(y, *, warn=False):
+    """Ravel `y` to 1d; the dask implementation is imported lazily."""
+    if not is_dask_container(y):
+        return sklearn_column_or_1d(y, warn=warn)
+    from ..dask.utils import column_or_1d as dask_column_or_1d
+    return dask_column_or_1d(y, warn=warn)
+
+
+def unique(*args, **kwargs):
+    """Call `np.unique` and compute a dask result into a NumPy array."""
+    # unpack: `np.unique(args, kwargs)` bound them to `ar` / `return_index`
+    output = np.unique(*args, **kwargs)
+    return output.compute() if is_dask_container(output) else output