From e868fbc055148484faaecb0f68f42900c187e5e3 Mon Sep 17 00:00:00 2001
From: zf2209 <55263735+zf2209@users.noreply.github.com>
Date: Tue, 5 May 2020 12:20:40 -0400
Subject: [PATCH] First brute force replacement

---
 setup.py                              |  3 ++-
 sklearn_pandas/__init__.py            |  1 +
 sklearn_pandas/categorical_imputer.py | 16 +++++++++-------
 sklearn_pandas/dataframe_mapper.py    | 13 +++++++++----
 sklearn_pandas/mix_ins.py             | 19 +++++++++++++++++++
 sklearn_pandas/transformers.py        | 19 +++++++++++--------
 tox.ini                               |  1 +
 7 files changed, 52 insertions(+), 20 deletions(-)
 create mode 100644 sklearn_pandas/mix_ins.py

diff --git a/setup.py b/setup.py
index 19dfa73..a52c4d0 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,8 @@ def run(self):
           'scikit-learn>=0.15.0',
           'scipy>=0.14',
           'pandas>=0.11.0',
-          'numpy>=1.6.1'],
+          'numpy>=1.6.1',
+          'koalas>=0.32.0'],
       tests_require=['pytest', 'mock'],
       cmdclass={'test': PyTest},
       )
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index 5143a4a..b45b6d9 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -4,3 +4,4 @@
 from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV  # NOQA
 from .transformers import CategoricalImputer, FunctionTransformer  # NOQA
 from .features_generator import gen_features  # NOQA
+from .mix_ins import DataframeMixin  # NOQA
\ No newline at end of file
diff --git a/sklearn_pandas/categorical_imputer.py b/sklearn_pandas/categorical_imputer.py
index a9f9599..0abd53f 100644
--- a/sklearn_pandas/categorical_imputer.py
+++ b/sklearn_pandas/categorical_imputer.py
@@ -1,24 +1,26 @@
 import pandas as pd
 import numpy as np
-
+import databricks.koalas as ks
 
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
 
+from .mix_ins import DataframeMixin
+
 
-def _get_mask(X, value):
+def _get_mask(X, value, dflib_):
     """
     Compute the boolean mask X == missing_values.
     """
     if value == "NaN" or \
        value is None or \
        (isinstance(value, float) and np.isnan(value)):
-        return pd.isnull(X)
+        return dflib_.isnull(X)
     else:
         return X == value
 
 
-class CategoricalImputer(BaseEstimator, TransformerMixin):
+class CategoricalImputer(BaseEstimator, TransformerMixin, DataframeMixin):
     """
     Impute missing values from a categorical/string np.ndarray or pd.Series
     with the most frequent value on the training data.
@@ -89,8 +91,8 @@ def fit(self, X, y=None):
         -------
         self: CategoricalImputer
         """
-
-        mask = _get_mask(X, self.missing_values)
+        self._set_df_library(X)
+        mask = _get_mask(X, self.missing_values, self.dflib_)
         X = X[~mask]
         if self.strategy == 'most_frequent':
             modes = pd.Series(X).mode()
@@ -128,7 +130,7 @@ def transform(self, X):
         if self.copy:
             X = X.copy()
 
-        mask = _get_mask(X, self.missing_values)
+        mask = _get_mask(X, self.missing_values, self.dflib_)
         X[mask] = self.fill_
 
         return np.asarray(X)
diff --git a/sklearn_pandas/dataframe_mapper.py b/sklearn_pandas/dataframe_mapper.py
index f530521..a3a8c72 100644
--- a/sklearn_pandas/dataframe_mapper.py
+++ b/sklearn_pandas/dataframe_mapper.py
@@ -3,11 +3,13 @@
 
 import pandas as pd
 import numpy as np
+import databricks.koalas as ks
 from scipy import sparse
 from sklearn.base import BaseEstimator, TransformerMixin
 
 from .cross_validation import DataWrapper
 from .pipeline import make_transformer_pipeline, _call_fit, TransformerPipeline
+from .mix_ins import DataframeMixin
 
 PY3 = sys.version_info[0] == 3
 if PY3:
@@ -62,7 +64,7 @@ def add_column_names_to_exception(column_names):
         raise
 
 
-class DataFrameMapper(BaseEstimator, TransformerMixin):
+class DataFrameMapper(BaseEstimator, TransformerMixin, DataframeMixin):
     """
     Map Pandas data frame column subsets to their own
     sklearn transformation.
@@ -172,6 +174,8 @@ def _get_col_subset(self, X, cols, input_df=False):
 
         Returns a numpy array with the data from the selected columns
         """
+        self._set_df_library(X)
+
         if isinstance(cols, string_types):
             return_vector = True
             cols = [cols]
@@ -183,7 +187,7 @@ def _get_col_subset(self, X, cols, input_df=False):
         # Will be dropped on sklearn-pandas 2.0.
         if isinstance(X, list):
             X = [x[cols] for x in X]
-            X = pd.DataFrame(X)
+            X = self.dflib_.DataFrame(X)
 
         elif isinstance(X, DataWrapper):
             X = X.df  # fetch underlying data
@@ -273,7 +277,7 @@ def get_dtypes(self, extracted):
     def get_dtype(self, ex):
         if isinstance(ex, np.ndarray) or sparse.issparse(ex):
             return [ex.dtype] * ex.shape[1]
-        elif isinstance(ex, pd.DataFrame):
+        elif isinstance(ex, pd.DataFrame) or isinstance(ex, ks.DataFrame):
             return list(ex.dtypes)
         else:
             raise TypeError(type(ex))
@@ -347,6 +351,7 @@ def _transform(self, X, y=None, do_fit=False):
         stacked = np.hstack(extracted)
 
         if self.df_out:
+            self._set_df_library(X)
             # if no rows were dropped preserve the original index,
             # otherwise use a new integer one
             no_rows_dropped = len(X) == len(stacked)
@@ -357,7 +362,7 @@ def _transform(self, X, y=None, do_fit=False):
 
             # output different data types, if appropriate
             dtypes = self.get_dtypes(extracted)
-            df_out = pd.DataFrame(
+            df_out = self.dflib_.DataFrame(
                 stacked,
                 columns=self.transformed_names_,
                 index=index)
diff --git a/sklearn_pandas/mix_ins.py b/sklearn_pandas/mix_ins.py
new file mode 100644
index 0000000..8c8703d
--- /dev/null
+++ b/sklearn_pandas/mix_ins.py
@@ -0,0 +1,19 @@
+from typing import Union
+
+import databricks.koalas as ks
+import pandas as pd
+
+
+class DataframeMixin:
+    """Mixin that detects the dataframe library of the input data."""
+
+    def _set_df_library(self, X: Union[pd.DataFrame, ks.DataFrame]):
+        """Detect the dataframe library of X and set ``self.dflib_``
+        to the ``pandas`` or ``koalas`` module accordingly.
+        """
+        if isinstance(X, pd.DataFrame):
+            self.dflib_ = pd
+        elif isinstance(X, ks.DataFrame):
+            self.dflib_ = ks
+        else:
+            raise ValueError("Only pandas and koalas dataframes are supported.")
\ No newline at end of file
diff --git a/sklearn_pandas/transformers.py b/sklearn_pandas/transformers.py
index fbfdf00..3824f04 100644
--- a/sklearn_pandas/transformers.py
+++ b/sklearn_pandas/transformers.py
@@ -1,23 +1,26 @@
 import numpy as np
 import pandas as pd
+import databricks.koalas as ks
 
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
 
+from .mix_ins import DataframeMixin
 
-def _get_mask(X, value):
+
+def _get_mask(X, value, dflib_):
     """
     Compute the boolean mask X == missing_values.
     """
     if value == "NaN" or \
        value is None or \
        (isinstance(value, float) and np.isnan(value)):
-        return pd.isnull(X)
+        return dflib_.isnull(X)
     else:
         return X == value
 
 
-class CategoricalImputer(BaseEstimator, TransformerMixin):
+class CategoricalImputer(BaseEstimator, TransformerMixin, DataframeMixin):
     """
     Impute missing values from a categorical/string np.ndarray or pd.Series
     with the most frequent value on the training data.
@@ -88,11 +91,11 @@ def fit(self, X, y=None):
         -------
         self: CategoricalImputer
         """
-
-        mask = _get_mask(X, self.missing_values)
+        self._set_df_library(X)
+        mask = _get_mask(X, self.missing_values, self.dflib_)
         X = X[~mask]
         if self.strategy == 'most_frequent':
-            modes = pd.Series(X).mode()
+            modes = self.dflib_.Series(X).mode()
         elif self.strategy == 'constant':
             modes = np.array([self.fill_value])
         if modes.shape[0] == 0:
@@ -121,13 +124,13 @@ def transform(self, X):
         np.ndarray
             Data with imputed values.
         """
-
+        self._set_df_library(X)
         check_is_fitted(self, 'fill_')
 
         if self.copy:
             X = X.copy()
 
-        mask = _get_mask(X, self.missing_values)
+        mask = _get_mask(X, self.missing_values, self.dflib_)
         X[mask] = self.fill_
 
         return np.asarray(X)
diff --git a/tox.ini b/tox.ini
index f99cf75..f92b071 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,6 +9,7 @@ deps =
     flake8==2.4.1
    numpy==1.14.3
    scipy==0.18.1
+    koalas==0.32.0
    pandas19: pandas==0.19.2
    pandas22: pandas==0.22.0
    sklearn17: scikit-learn==0.17.1
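Reviewer note (not part of the patch): a minimal sketch of how the new DataframeMixin is expected to dispatch between backends once the patch is applied. The DispatchDemo class and the sample frames below are hypothetical, and the koalas call assumes koalas>=0.32.0 with a running Spark session available.

import pandas as pd
import databricks.koalas as ks  # requires a Spark session at runtime

from sklearn_pandas import DataframeMixin  # exported by __init__.py in this patch


class DispatchDemo(DataframeMixin):
    """Toy class that only exercises the library-detection hook."""


demo = DispatchDemo()

# A pandas DataFrame selects the pandas module.
demo._set_df_library(pd.DataFrame({"a": [1.0, None, 3.0]}))
print(demo.dflib_ is pd)  # True

# A koalas DataFrame selects the koalas module; anything else raises ValueError.
demo._set_df_library(ks.DataFrame({"a": [1.0, None, 3.0]}))
print(demo.dflib_ is ks)  # True

The transformers then reuse the detected module through self.dflib_ (for example, _get_mask(X, value, dflib_) calls dflib_.isnull(X)), so the same imputer code path serves both backends.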