Refactor Series.describe to a new-style (#362)

kozlov-alexey · AlexanderKalistratov · commit 32a72dca6a0a · 2019-12-31T02:04:09.000+03:00
* Refactor Series.describe to a new-style

* Applying review comments
diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py
@@ -480,3 +480,34 @@ def sdc_check_indexes_equal_string_impl(A, B):
             return is_index_equal
 
         return sdc_check_indexes_equal_string_impl
+
+
+@numba.njit
+def _sdc_pandas_format_percentiles(arr):
+    """ Function converting float array of percentiles to a list of strings formatted
+        the same as in pandas.io.formats.format.format_percentiles
+    """
+
+    percentiles_strs = []
+    for percentile in arr:
+        p_as_string = str(percentile * 100)
+
+        trim_index = len(p_as_string) - 1
+        while trim_index >= 0:
+            if p_as_string[trim_index] == '0':
+                trim_index -= 1
+                continue
+            elif p_as_string[trim_index] == '.':
+                break
+
+            trim_index += 1
+            break
+
+        if trim_index < 0:
+            p_as_string_trimmed = '0'
+        else:
+            p_as_string_trimmed = p_as_string[:trim_index]
+
+        percentiles_strs.append(p_as_string_trimmed + '%')
+
+    return percentiles_strs
diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py
@@ -4489,3 +4489,110 @@ def hpat_pandas_series_pct_change_impl(self, periods=1, fill_method='pad', limit
         return pandas.Series(result)
 
     return hpat_pandas_series_pct_change_impl
+
+
+@overload_method(SeriesType, 'describe')
+def hpat_pandas_series_describe(self, percentiles=None, include=None, exclude=None):
+    """
+    Pandas Series method :meth:`pandas.Series.describe` implementation.
+
+    Note: Differs from Pandas in returning statistics as Series of strings when applied to
+        Series of strings or date-time values
+
+    .. only:: developer
+
+       Tests: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_describe*
+
+    Parameters
+    ----------
+    self: :obj:`pandas.Series`
+        Input series
+    percentiles: :obj:`list-like`
+        The percentiles to include in the output. The default is [.25, .5, .75]
+        All should fall between 0 and 1 and no duplicates are allowed.
+    include: 'all', :obj:`list-like` of dtypes or None, default None
+        A white list of data types to include in the result. Ignored for Series.
+    exclude: :obj:`list-like` of dtypes or None, default None
+        A black list of data types to omit from the result. Ignored for Series.
+
+    Returns
+    -------
+    :obj:`pandas.Series`
+        returns :obj:`pandas.Series` object containing summary statistics of the Series
+    """
+
+    ty_checker = TypeChecker('Method describe().')
+    ty_checker.check(self, SeriesType)
+
+    if not (isinstance(percentiles, (types.List, types.Array, types.UniTuple))
+            and isinstance(percentiles.dtype, types.Number)
+            or isinstance(percentiles, (types.Omitted, types.NoneType))
+            or percentiles is None):
+        ty_checker.raise_exc(percentiles, 'list-like', 'percentiles')
+
+    is_percentiles_none = percentiles is None or isinstance(percentiles, (types.Omitted, types.NoneType))
+
+    if isinstance(self.dtype, types.Number):
+        def hpat_pandas_series_describe_numeric_impl(self, percentiles=None, include=None, exclude=None):
+
+            if is_percentiles_none == False:  # noqa
+                percentiles_list = list(percentiles)
+                median_in_percentiles = 0.5 in percentiles_list
+                if not median_in_percentiles:
+                    percentiles_list.append(0.5)
+                sorted_percentiles = sorted(percentiles_list)
+
+                # check percentiles have correct values:
+                arr = numpy.asarray(sorted_percentiles)
+                if len(numpy.unique(arr)) != len(arr):
+                    raise ValueError("percentiles cannot contain duplicates")
+                if numpy.any(arr[(arr < 0) * (arr > 1)]):
+                    raise ValueError("percentiles should all be in the interval [0, 1].")
+
+                # TODO: support proper rounding of percentiles like in pandas.io.formats.format.format_percentiles
+                # requires numpy.round(precision), numpy.isclose to be supported by Numba
+                percentiles_indexes = common_functions._sdc_pandas_format_percentiles(arr)
+            else:
+                sorted_percentiles = [0.25, 0.5, 0.75]
+                percentiles_indexes = ['25%', '50%', '75%']
+
+            index_strings = ['count', 'mean', 'std', 'min']
+            index_strings.extend(percentiles_indexes)
+            index_strings.append('max')
+
+            values = []
+            values.append(numpy.float64(self.count()))
+            values.append(self.mean())
+            values.append(self.std())
+            values.append(self.min())
+            for p in sorted_percentiles:
+                values.append(self.quantile(p))
+            values.append(self.max())
+
+            return pandas.Series(values, index_strings)
+
+        return hpat_pandas_series_describe_numeric_impl
+
+    elif isinstance(self.dtype, types.UnicodeType):
+        def hpat_pandas_series_describe_string_impl(self, percentiles=None, include=None, exclude=None):
+
+            objcounts = self.value_counts()
+            index_strings = ['count', 'unique', 'top', 'freq']
+
+            # use list of strings for the output series, since Numba doesn't support np.arrays with object dtype
+            values = []
+            values.append(str(self.count()))
+            values.append(str(len(self.unique())))
+            values.append(str(objcounts.index[0]))
+            values.append(str(objcounts.iloc[0]))
+
+            return pandas.Series(values, index_strings)
+
+        return hpat_pandas_series_describe_string_impl
+
+    elif isinstance(self.dtype, (types.NPDatetime, types.NPTimedelta)):
+        # TODO: provide specialization for (types.NPDatetime, types.NPTimedelta)
+        # needs dropna for date-time series, conversion to int and tz_convert to be implemented
+        return None
+
+    return None
diff --git a/sdc/hiframes/pd_series_ext.py b/sdc/hiframes/pd_series_ext.py
@@ -773,7 +773,7 @@ def generic_expand_cumulative_series(self, args, kws):
     'resolve_append', 'resolve_combine', 'resolve_corr', 'resolve_cov',
     'resolve_dropna', 'resolve_fillna', 'resolve_head', 'resolve_nlargest',
     'resolve_nsmallest', 'resolve_pct_change', 'resolve_rolling', 'resolve_loc',
-    'resolve_value_counts'
+    'resolve_value_counts', 'resolve_describe', 'resolve_iloc'
 ]
 
 # use ArrayAttribute for attributes not defined in SeriesAttribute
diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py
@@ -5556,6 +5556,132 @@ def test_series_isalnum_str(self):
             S = pd.Series(ser)
             pd.testing.assert_series_equal(cfunc(S), isalnum_usecase(S))
 
+    @skip_sdc_jit('Old-style implementation returns string, but not series')
+    def test_series_describe_numeric(self):
+        def test_impl(A):
+            return A.describe()
+        hpat_func = self.jit(test_impl)
+
+        n = 11
+        S = pd.Series(np.arange(n))
+        pd.testing.assert_series_equal(hpat_func(S), test_impl(S))
+
+    @skip_sdc_jit('Old-style implementation doesn\'t support pecentiles argument')
+    def test_series_describe_numeric_percentiles(self):
+        def test_impl(A, values):
+            return A.describe(percentiles=values)
+        hpat_func = self.jit(test_impl)
+
+        n = 11
+        S = pd.Series(np.arange(n))
+        supported_values = [
+            [0.323, 0.778, 0.1, 0.01, 0.2],
+            [0.001, 0.002],
+            [0.001, 0.5, 0.002],
+            [0.9999, 0.0001],
+            (0.323, 0.778, 0.1, 0.01, 0.2),
+            np.array([0, 1.0]),
+            np.array([0.323, 0.778, 0.1, 0.01, 0.2]),
+            None,
+        ]
+        for percentiles in supported_values:
+            with self.subTest(percentiles=percentiles):
+                pd.testing.assert_series_equal(hpat_func(S, percentiles), test_impl(S, percentiles))
+
+    @skip_sdc_jit('Old-style implementation for string series is not supported')
+    def test_series_describe_str(self):
+        def test_impl(A):
+            return A.describe()
+        hpat_func = self.jit(test_impl)
+
+        S = pd.Series(['a', 'dd', None, 'bbbb', 'dd', '', 'dd', '', 'dd'])
+        # SDC implementation returns series of string, hence conversion of reference result is needed
+        pd.testing.assert_series_equal(hpat_func(S), test_impl(S).astype(str))
+
+    @skip_sdc_jit('Old-style implementation for datetime series is not supported')
+    @skip_numba_jit('Series.describe is not implemented for datatime Series due to Numba limitations\n'
+                    'Requires dropna for pd.Timestamp (depends on Numba isnat) to be implemented')
+    def test_series_describe_dt(self):
+        def test_impl(A):
+            return A.describe()
+        hpat_func = self.jit(test_impl)
+
+        S = pd.Series([pd.Timestamp('1970-12-01 03:02:35'),
+                       pd.NaT,
+                       pd.Timestamp('1970-03-03 12:34:59'),
+                       pd.Timestamp('1970-12-01 03:02:35'),
+                       pd.Timestamp('2012-07-25'),
+                       None])
+        # SDC implementation returns series of string, hence conversion of reference result is needed
+        pd.testing.assert_series_equal(hpat_func(S), test_impl(S).astype(str))
+
+    @skip_sdc_jit('Old-style implementation doesn\'t support pecentiles argument')
+    def test_series_describe_unsupported_percentiles(self):
+        def test_impl(A, values):
+            return A.describe(percentiles=values)
+        hpat_func = self.jit(test_impl)
+
+        n = 11
+        S = pd.Series(np.arange(n))
+        unsupported_values = [0.5, '0.77', True, ('a', 'b'), ['0.5', '0.7'], np.arange(0.1, 0.5, 0.1).astype(str)]
+        for percentiles in unsupported_values:
+            with self.assertRaises(TypingError) as raises:
+                hpat_func(S, percentiles)
+            msg = 'Method describe(). The object percentiles'
+            self.assertIn(msg, str(raises.exception))
+
+    @skip_sdc_jit('Old-style implementation doesn\'t support pecentiles argument')
+    def test_series_describe_invalid_percentiles(self):
+        def test_impl(A, values):
+            return A.describe(percentiles=values)
+        hpat_func = self.jit(test_impl)
+
+        n = 11
+        S = pd.Series(np.arange(n))
+        unsupported_values = [
+            [0.5, 0.7, 1.1],
+            [-0.5, 0.7, 1.1],
+            [0.5, 0.7, 0.2, 0.7]
+        ]
+        for percentiles in unsupported_values:
+            with self.assertRaises(Exception) as context:
+                test_impl(S, percentiles)
+            pandas_exception = context.exception
+
+            self.assertRaises(type(pandas_exception), hpat_func, S, percentiles)
+
+    @skip_numba_jit('BUG: Series.count() impl for String series does count None elements, but it should not')
+    def test_series_count_string_with_none(self):
+        def test_impl(S):
+            return S.count()
+        hpat_func = self.jit(test_impl)
+
+        S = pd.Series(['a', 'dd', None, 'bbbb', 'dd', '', 'dd', '', 'dd'])
+        test_impl(S)
+        self.assertEqual(hpat_func(S), test_impl(S))
+
+    @skip_sdc_jit('BUG: Series.value_counts() impl for String series does count None elements, but it should not')
+    @skip_numba_jit('BUG: Series.value_counts() impl for String series does count None elements, but it should not')
+    def test_series_value_counts_string_with_none(self):
+        def test_impl(S):
+            return S.value_counts()
+        hpat_func = self.jit(test_impl)
+
+        S = pd.Series(['a', 'dd', None, 'bbbb', '', '', 'dd'])
+        pd.testing.assert_series_equal(hpat_func(S), test_impl(S))
+
+    @skip_sdc_jit('Fails occasionally due to use of non-stable sort in Pandas and SDC implementations')
+    @skip_numba_jit('Fails occasionally due to use of non-stable sort in Pandas and SDC implementations')
+    def test_series_value_counts_string_order_in_group(self):
+        def test_impl(S):
+            return S.value_counts()
+        hpat_func = self.jit(test_impl)
+
+        S = pd.Series(['c', 'dd', 'b', 'a', 'dd', 'dd', 'e', 'f', 'g'])
+        pandas_res = test_impl(S)
+        hpat_res = hpat_func(S)
+        pd.testing.assert_series_equal(hpat_res, pandas_res)
+
 
 if __name__ == "__main__":
     unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -773,7 +773,7 @@ def generic_expand_cumulative_series(self, args, kws):`
`773`	`773`	`'resolve_append', 'resolve_combine', 'resolve_corr', 'resolve_cov',`
`774`	`774`	`'resolve_dropna', 'resolve_fillna', 'resolve_head', 'resolve_nlargest',`
`775`	`775`	`'resolve_nsmallest', 'resolve_pct_change', 'resolve_rolling', 'resolve_loc',`
`776`		`- 'resolve_value_counts'`
	`776`	`+ 'resolve_value_counts', 'resolve_describe', 'resolve_iloc'`
`777`	`777`	`]`
`778`	`778`
`779`	`779`	`# use ArrayAttribute for attributes not defined in SeriesAttribute`