Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 32a72dc

Browse files
kozlov-alexey authored and AlexanderKalistratov committed
Refactor Series.describe to a new-style (#362)
* Refactor Series.describe to a new-style * Applying review comments
1 parent b57c9ad commit 32a72dc

File tree

4 files changed

+265
-1
lines changed

4 files changed

+265
-1
lines changed

sdc/datatypes/common_functions.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,3 +480,34 @@ def sdc_check_indexes_equal_string_impl(A, B):
480480
return is_index_equal
481481

482482
return sdc_check_indexes_equal_string_impl
483+
484+
485+
@numba.njit
def _sdc_pandas_format_percentiles(arr):
    """ Function converting float array of percentiles to a list of strings formatted
        the same as in pandas.io.formats.format.format_percentiles

        Each percentile is multiplied by 100 and converted with str(). Trailing zeros of the
        fractional part (and a decimal point left dangling by that) are dropped and '%' is
        appended, e.g. '50.0' -> '50%', '12.5' -> '12.5%', '0.0' -> '0%'.

        Zeros are trimmed only when the string contains a decimal point and no exponent, so
        significant zeros in strings such as '100' or '1e-10' are kept intact.

    Parameters
    ----------
    arr: iterable of numbers
        Percentiles expressed as fractions (0.5 stands for 50%)

    Returns
    -------
    list of str
        One formatted label per input element, in the input order
    """

    percentiles_strs = []
    for percentile in arr:
        p_as_string = str(percentile * 100)

        # Scan with explicit indexing: only string operations already relied upon by this
        # function (len, getitem, ==, slicing) are used, to stay within Numba's string support.
        has_point = False
        has_exponent = False
        for pos in range(len(p_as_string)):
            char = p_as_string[pos]
            if char == '.':
                has_point = True
            elif char == 'e' or char == 'E':
                has_exponent = True

        # trim_index is the length of the prefix that is kept
        trim_index = len(p_as_string)
        if has_point and not has_exponent:
            # drop trailing zeros of the fractional part ...
            while trim_index > 0 and p_as_string[trim_index - 1] == '0':
                trim_index -= 1
            # ... and the decimal point if nothing is left after it
            if trim_index > 0 and p_as_string[trim_index - 1] == '.':
                trim_index -= 1

        if trim_index == 0:
            # everything was trimmed away, the value is zero
            p_as_string_trimmed = '0'
        else:
            p_as_string_trimmed = p_as_string[:trim_index]

        percentiles_strs.append(p_as_string_trimmed + '%')

    return percentiles_strs

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4489,3 +4489,110 @@ def hpat_pandas_series_pct_change_impl(self, periods=1, fill_method='pad', limit
44894489
return pandas.Series(result)
44904490

44914491
return hpat_pandas_series_pct_change_impl
4492+
4493+
4494+
@overload_method(SeriesType, 'describe')
def hpat_pandas_series_describe(self, percentiles=None, include=None, exclude=None):
    """
    Pandas Series method :meth:`pandas.Series.describe` implementation.

    Note: Differs from Pandas in returning statistics as Series of strings when applied to
          Series of strings or date-time values

    .. only:: developer

       Tests: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_describe*

    Parameters
    ----------
    self: :obj:`pandas.Series`
        Input series
    percentiles: :obj:`list-like`
        The percentiles to include in the output. The default is [.25, .5, .75]
        All should fall between 0 and 1 and no duplicates are allowed.
        The median (0.5) is always added to the output if it is not listed.
    include: 'all', :obj:`list-like` of dtypes or None, default None
        A white list of data types to include in the result. Ignored for Series.
    exclude: :obj:`list-like` of dtypes or None, default None
        A black list of data types to omit from the result. Ignored for Series.

    Returns
    -------
    :obj:`pandas.Series`
        returns :obj:`pandas.Series` object containing summary statistics of the Series

    Raises
    ------
    ValueError
        At run time, if percentiles contain duplicates or values outside of [0, 1]

    An unsupported type of percentiles is reported at compile time via the type checker.
    No implementation is provided (None is returned) for date-time series and for any other
    dtype that is neither numeric nor string.
    """

    ty_checker = TypeChecker('Method describe().')
    ty_checker.check(self, SeriesType)

    # accepted: list / array / homogeneous tuple of numbers, or an omitted / None argument
    if not (isinstance(percentiles, (types.List, types.Array, types.UniTuple))
            and isinstance(percentiles.dtype, types.Number)
            or isinstance(percentiles, (types.Omitted, types.NoneType))
            or percentiles is None):
        ty_checker.raise_exc(percentiles, 'list-like', 'percentiles')

    # evaluated once at typing time and captured by the implementation below
    is_percentiles_none = percentiles is None or isinstance(percentiles, (types.Omitted, types.NoneType))

    if isinstance(self.dtype, types.Number):
        def hpat_pandas_series_describe_numeric_impl(self, percentiles=None, include=None, exclude=None):

            # NOTE(review): the explicit comparison with False (hence noqa) is kept on purpose,
            # presumably to let Numba resolve this branch at compile time - confirm before changing
            if is_percentiles_none == False:  # noqa
                percentiles_list = list(percentiles)
                median_in_percentiles = 0.5 in percentiles_list
                if not median_in_percentiles:
                    percentiles_list.append(0.5)
                sorted_percentiles = sorted(percentiles_list)

                # check percentiles have correct values:
                arr = numpy.asarray(sorted_percentiles)
                if len(numpy.unique(arr)) != len(arr):
                    raise ValueError("percentiles cannot contain duplicates")
                # a value is invalid if it is below 0 OR above 1; the two conditions can never
                # hold for the same element, so they must not be combined with a logical AND
                if numpy.any(arr < 0) or numpy.any(arr > 1):
                    raise ValueError("percentiles should all be in the interval [0, 1].")

                # TODO: support proper rounding of percentiles like in pandas.io.formats.format.format_percentiles
                # requires numpy.round(precision), numpy.isclose to be supported by Numba
                percentiles_indexes = common_functions._sdc_pandas_format_percentiles(arr)
            else:
                sorted_percentiles = [0.25, 0.5, 0.75]
                percentiles_indexes = ['25%', '50%', '75%']

            index_strings = ['count', 'mean', 'std', 'min']
            index_strings.extend(percentiles_indexes)
            index_strings.append('max')

            # values are appended in the same order as the labels in index_strings
            values = []
            values.append(numpy.float64(self.count()))
            values.append(self.mean())
            values.append(self.std())
            values.append(self.min())
            for p in sorted_percentiles:
                values.append(self.quantile(p))
            values.append(self.max())

            return pandas.Series(values, index_strings)

        return hpat_pandas_series_describe_numeric_impl

    elif isinstance(self.dtype, types.UnicodeType):
        def hpat_pandas_series_describe_string_impl(self, percentiles=None, include=None, exclude=None):

            objcounts = self.value_counts()
            index_strings = ['count', 'unique', 'top', 'freq']

            # use list of strings for the output series, since Numba doesn't support np.arrays with object dtype
            values = []
            values.append(str(self.count()))
            values.append(str(len(self.unique())))
            # the first entry of value_counts() is taken as the top value and its frequency
            values.append(str(objcounts.index[0]))
            values.append(str(objcounts.iloc[0]))

            return pandas.Series(values, index_strings)

        return hpat_pandas_series_describe_string_impl

    elif isinstance(self.dtype, (types.NPDatetime, types.NPTimedelta)):
        # TODO: provide specialization for (types.NPDatetime, types.NPTimedelta)
        # needs dropna for date-time series, conversion to int and tz_convert to be implemented
        return None

    return None

sdc/hiframes/pd_series_ext.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -773,7 +773,7 @@ def generic_expand_cumulative_series(self, args, kws):
773773
'resolve_append', 'resolve_combine', 'resolve_corr', 'resolve_cov',
774774
'resolve_dropna', 'resolve_fillna', 'resolve_head', 'resolve_nlargest',
775775
'resolve_nsmallest', 'resolve_pct_change', 'resolve_rolling', 'resolve_loc',
776-
'resolve_value_counts'
776+
'resolve_value_counts', 'resolve_describe', 'resolve_iloc'
777777
]
778778

779779
# use ArrayAttribute for attributes not defined in SeriesAttribute

sdc/tests/test_series.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5556,6 +5556,132 @@ def test_series_isalnum_str(self):
55565556
S = pd.Series(ser)
55575557
pd.testing.assert_series_equal(cfunc(S), isalnum_usecase(S))
55585558

5559+
@skip_sdc_jit('Old-style implementation returns string, but not series')
def test_series_describe_numeric(self):
    # describe() with default arguments on an integer series: compiled result vs pandas
    def test_impl(A):
        return A.describe()
    hpat_func = self.jit(test_impl)

    series = pd.Series(np.arange(11))
    actual = hpat_func(series)
    expected = test_impl(series)
    pd.testing.assert_series_equal(actual, expected)
5568+
5569+
@skip_sdc_jit('Old-style implementation doesn\'t support pecentiles argument')
def test_series_describe_numeric_percentiles(self):
    # describe(percentiles=...) for lists, a tuple, arrays and None: compiled result vs pandas
    def test_impl(A, values):
        return A.describe(percentiles=values)
    hpat_func = self.jit(test_impl)

    series = pd.Series(np.arange(11))
    supported_values = [
        [0.323, 0.778, 0.1, 0.01, 0.2],
        [0.001, 0.002],
        [0.001, 0.5, 0.002],
        [0.9999, 0.0001],
        (0.323, 0.778, 0.1, 0.01, 0.2),
        np.array([0, 1.0]),
        np.array([0.323, 0.778, 0.1, 0.01, 0.2]),
        None,
    ]
    for case in supported_values:
        with self.subTest(percentiles=case):
            actual = hpat_func(series, case)
            expected = test_impl(series, case)
            pd.testing.assert_series_equal(actual, expected)
5590+
5591+
@skip_sdc_jit('Old-style implementation for string series is not supported')
def test_series_describe_str(self):
    # describe() on a string series containing None and empty strings
    def test_impl(A):
        return A.describe()
    hpat_func = self.jit(test_impl)

    series = pd.Series(['a', 'dd', None, 'bbbb', 'dd', '', 'dd', '', 'dd'])
    actual = hpat_func(series)
    # SDC implementation returns series of string, hence conversion of reference result is needed
    expected = test_impl(series).astype(str)
    pd.testing.assert_series_equal(actual, expected)
5600+
5601+
@skip_sdc_jit('Old-style implementation for datetime series is not supported')
@skip_numba_jit('Series.describe is not implemented for datatime Series due to Numba limitations\n'
                'Requires dropna for pd.Timestamp (depends on Numba isnat) to be implemented')
def test_series_describe_dt(self):
    # describe() on a series of timestamps with missing values (NaT and None)
    def test_impl(A):
        return A.describe()
    hpat_func = self.jit(test_impl)

    timestamps = [
        pd.Timestamp('1970-12-01 03:02:35'),
        pd.NaT,
        pd.Timestamp('1970-03-03 12:34:59'),
        pd.Timestamp('1970-12-01 03:02:35'),
        pd.Timestamp('2012-07-25'),
        None,
    ]
    series = pd.Series(timestamps)
    actual = hpat_func(series)
    # SDC implementation returns series of string, hence conversion of reference result is needed
    expected = test_impl(series).astype(str)
    pd.testing.assert_series_equal(actual, expected)
5617+
5618+
@skip_sdc_jit('Old-style implementation doesn\'t support pecentiles argument')
def test_series_describe_unsupported_percentiles(self):
    # percentiles of an unsupported type must be rejected with a TypingError
    def test_impl(A, values):
        return A.describe(percentiles=values)
    hpat_func = self.jit(test_impl)

    series = pd.Series(np.arange(11))
    expected_msg = 'Method describe(). The object percentiles'
    unsupported_values = [
        0.5,
        '0.77',
        True,
        ('a', 'b'),
        ['0.5', '0.7'],
        np.arange(0.1, 0.5, 0.1).astype(str),
    ]
    for bad_value in unsupported_values:
        with self.assertRaises(TypingError) as raises:
            hpat_func(series, bad_value)
        self.assertIn(expected_msg, str(raises.exception))
5632+
5633+
@skip_sdc_jit('Old-style implementation doesn\'t support pecentiles argument')
def test_series_describe_invalid_percentiles(self):
    # out-of-range or duplicated percentiles: compiled code must raise the same
    # exception type as pandas does
    def test_impl(A, values):
        return A.describe(percentiles=values)
    hpat_func = self.jit(test_impl)

    series = pd.Series(np.arange(11))
    unsupported_values = [
        [0.5, 0.7, 1.1],
        [-0.5, 0.7, 1.1],
        [0.5, 0.7, 0.2, 0.7]
    ]
    for bad_percentiles in unsupported_values:
        # record what pandas raises for this input ...
        with self.assertRaises(Exception) as context:
            test_impl(series, bad_percentiles)
        expected_type = type(context.exception)

        # ... and require the same exception type from the compiled function
        with self.assertRaises(expected_type):
            hpat_func(series, bad_percentiles)
5652+
5653+
@skip_numba_jit('BUG: Series.count() impl for String series does count None elements, but it should not')
def test_series_count_string_with_none(self):
    # count() on a string series must not count None elements (reference: pandas)
    def test_impl(S):
        return S.count()
    hpat_func = self.jit(test_impl)

    S = pd.Series(['a', 'dd', None, 'bbbb', 'dd', '', 'dd', '', 'dd'])
    # the reference is computed once, inside the assertion; no separate warm-up call is needed
    self.assertEqual(hpat_func(S), test_impl(S))
5662+
5663+
@skip_sdc_jit('BUG: Series.value_counts() impl for String series does count None elements, but it should not')
@skip_numba_jit('BUG: Series.value_counts() impl for String series does count None elements, but it should not')
def test_series_value_counts_string_with_none(self):
    # value_counts() on a string series containing None and empty strings
    def test_impl(S):
        return S.value_counts()
    hpat_func = self.jit(test_impl)

    series = pd.Series(['a', 'dd', None, 'bbbb', '', '', 'dd'])
    actual = hpat_func(series)
    expected = test_impl(series)
    pd.testing.assert_series_equal(actual, expected)
5672+
5673+
@skip_sdc_jit('Fails occasionally due to use of non-stable sort in Pandas and SDC implementations')
@skip_numba_jit('Fails occasionally due to use of non-stable sort in Pandas and SDC implementations')
def test_series_value_counts_string_order_in_group(self):
    # value_counts() where many values share the same count, so the order inside a
    # group of equal counts matters for the comparison
    def test_impl(S):
        return S.value_counts()
    hpat_func = self.jit(test_impl)

    series = pd.Series(['c', 'dd', 'b', 'a', 'dd', 'dd', 'e', 'f', 'g'])
    expected = test_impl(series)
    actual = hpat_func(series)
    pd.testing.assert_series_equal(actual, expected)
5684+
55595685

55605686
if __name__ == "__main__":
55615687
unittest.main()

0 commit comments

Comments
 (0)