Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit b5bbbaa

Browse files
authored
Overload df.rolling.cov, add perf.test (#547)
* Overload df.rolling.cov, add perf.test
* Add missing perf.tests for df.rolling
* Fix issue with name of the method in exception msg
* Minor fixes for df.rolling.cov
1 parent c8f6cbf commit b5bbbaa

File tree

5 files changed

+344
-118
lines changed

5 files changed

+344
-118
lines changed

sdc/datatypes/common_functions.py

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,14 @@
4141
from numba import numpy_support
4242

4343
import sdc
44-
from sdc.str_arr_type import string_array_type
45-
from sdc.str_arr_ext import (num_total_chars, append_string_array_to,
46-
str_arr_is_na, pre_alloc_string_array, str_arr_set_na,
47-
cp_str_list_to_array)
44+
from sdc.hiframes.pd_series_type import SeriesType
45+
from sdc.str_arr_ext import (
46+
append_string_array_to, cp_str_list_to_array, num_total_chars,
47+
pre_alloc_string_array, str_arr_is_na, str_arr_set_na, string_array_type
48+
)
4849
from sdc.utilities.utils import sdc_overload, sdc_register_jitable
49-
from sdc.utilities.sdc_typing_utils import find_common_dtype_from_numpy_dtypes
50+
from sdc.utilities.sdc_typing_utils import (find_common_dtype_from_numpy_dtypes,
51+
TypeChecker)
5052

5153

5254
def hpat_arrays_append(A, B):
@@ -537,3 +539,55 @@ def _sdc_pandas_series_check_axis_impl(axis):
537539
return _sdc_pandas_series_check_axis_impl
538540

539541
return None
542+
543+
544+
def _sdc_pandas_series_align(series, other, size='max', finiteness=False):
545+
"""
546+
Align series and other series by
547+
size where size of output series is max/min size of input series
548+
finiteness where all the infinite and matched finite values are replaced with nans, e.g.
549+
series: [1., inf, inf, -1., 0.] -> [1., nan, nan, -1., 0.]
550+
other: [1., -1., 0., 0.1, -0.1] -> [1., nan, nan, 0.1, -0.1]
551+
"""
552+
pass
553+
554+
555+
@sdc_overload(_sdc_pandas_series_align, jit_options={'parallel': False})
def _sdc_pandas_series_align_overload(series, other, size='max', finiteness=False):
    """
    Overload of _sdc_pandas_series_align() for two Series.

    Checks the argument types and returns the implementation to be compiled.
    The implementation returns a pair of new Series (aligned `series`,
    aligned `other`) of equal length that keep the names of the inputs.

    Parameters
    ----------
    series, other: SeriesType
        Series to align.
    size: str
        'max' or 'min': the output length is the max/min of the input lengths.
        Any other value makes the implementation raise ValueError.
    finiteness: bool
        If True, a position is kept only when both inputs are finite there;
        otherwise both outputs hold nan at that position.
    """
    ty_checker = TypeChecker('Function sdc.common_functions._sdc_pandas_series_align().')
    ty_checker.check(series, SeriesType)
    ty_checker.check(other, SeriesType)

    str_types = (str, types.StringLiteral, types.UnicodeType, types.Omitted)
    if not isinstance(size, str_types):
        ty_checker.raise_exc(size, 'str', 'size')

    if not isinstance(finiteness, (bool, types.Boolean, types.Omitted)):
        ty_checker.raise_exc(finiteness, 'bool', 'finiteness')

    def _sdc_pandas_series_align_impl(series, other, size='max', finiteness=False):
        if size != 'max' and size != 'min':
            raise ValueError("Function sdc.common_functions._sdc_pandas_series_align(). "
                             "The object size\n expected: 'max' or 'min'")

        arr, other_arr = series._data, other._data
        arr_len, other_arr_len = len(arr), len(other_arr)
        min_length = min(arr_len, other_arr_len)
        length = max(arr_len, other_arr_len) if size == 'max' else min_length

        # Both outputs start as all-nan, so positions past the shorter input and
        # positions rejected by the finiteness check need no explicit write.
        aligned_arr = numpy.full(length, numpy.nan)
        aligned_other_arr = numpy.full(length, numpy.nan)

        for i in numba.prange(min_length):
            if not finiteness or (numpy.isfinite(arr[i]) and numpy.isfinite(other_arr[i])):
                aligned_arr[i] = arr[i]
                aligned_other_arr[i] = other_arr[i]

        aligned = pandas.Series(aligned_arr, name=series._name)
        aligned_other = pandas.Series(aligned_other_arr, name=other._name)

        return aligned, aligned_other

    return _sdc_pandas_series_align_impl

sdc/datatypes/hpat_pandas_dataframe_rolling_functions.py

Lines changed: 95 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def df_rolling_method_other_df_codegen(method_name, self, other, args=None, kws=
120120
' else:',
121121
' _pairwise = pairwise',
122122
' if _pairwise:',
123-
' raise ValueError("Method rolling.corr(). The object pairwise\\n expected: False, None")'
123+
f' raise ValueError("Method rolling.{method_name}(). The object pairwise\\n expected: False, None")'
124124
]
125125

126126
data_length = 'len(get_dataframe_data(self._data, 0))' if data_columns else '0'
@@ -139,7 +139,7 @@ def df_rolling_method_other_df_codegen(method_name, self, other, args=None, kws=
139139
f' series_{col} = pandas.Series(data_{col})',
140140
f' {other_series} = pandas.Series(other_data_{col})',
141141
f' rolling_{col} = series_{col}.rolling({rolling_params})',
142-
f' result_{col} = rolling_{col}.corr({method_params})',
142+
f' result_{col} = rolling_{col}.{method_name}({method_params})',
143143
f' {res_data} = result_{col}._data[:length]'
144144
]
145145
else:
@@ -182,32 +182,41 @@ def df_rolling_method_main_codegen(method_params, df_columns, method_name):
182182
return func_lines
183183

184184

185-
def gen_df_rolling_method_other_none_codegen(rewrite_name=None):
    """Generate df.rolling method code generator based on name of the method

    Parameters
    ----------
    rewrite_name: str, optional
        If given, it is used instead of `method_name` in the name of the
        generated implementation and in its error message. The per-column
        call is still generated with `method_name`.

    Returns
    -------
    A function (method_name, self, args=None, kws=None) -> (func_text, global_vars),
    where func_text is the source of the implementation and global_vars are
    the globals required to exec it.
    """
    def df_rolling_method_other_none_codegen(method_name, self, args=None, kws=None):
        _method_name = rewrite_name or method_name
        args = args or []
        kwargs = kws or {}

        impl_params = ['self'] + args + params2list(kwargs)
        impl_params_as_str = ', '.join(impl_params)

        impl_name = f'_df_rolling_{_method_name}_other_none_impl'
        func_lines = [f'def {impl_name}({impl_params_as_str}):']

        # NOTE(review): leading whitespace inside these literals is the indentation
        # of the generated code; it must match what df_rolling_method_main_codegen
        # emits for the same function body - confirm.
        if 'pairwise' in kwargs:
            func_lines += [
                '  if pairwise is None:',
                '    _pairwise = True',
                '  else:',
                '    _pairwise = pairwise',
                '  if _pairwise:',
                f'    raise ValueError("Method rolling.{_method_name}(). The object pairwise\\n expected: False")'
            ]
        # `other` is not forwarded to the per-column method call
        method_params = args + [f'{k}={k}' for k in kwargs if k != 'other']
        func_lines += df_rolling_method_main_codegen(method_params, self.data.columns, method_name)
        func_text = '\n'.join(func_lines)

        global_vars = {'pandas': pandas, 'get_dataframe_data': get_dataframe_data}

        return func_text, global_vars

    return df_rolling_method_other_none_codegen


df_rolling_method_other_none_codegen = gen_df_rolling_method_other_none_codegen()
df_rolling_cov_other_none_codegen = gen_df_rolling_method_other_none_codegen('cov')
211220

212221

213222
def df_rolling_method_codegen(method_name, self, args=None, kws=None):
@@ -249,6 +258,16 @@ def gen_df_rolling_method_other_none_impl(method_name, self, args=None, kws=None
249258
return _impl
250259

251260

261+
def gen_df_rolling_cov_other_none_impl(method_name, self, args=None, kws=None):
    """Build the df.rolling.cov implementation for the case when `other` is None

    The source of the implementation is produced by
    df_rolling_cov_other_none_codegen and executed to obtain the function.

    Parameters
    ----------
    method_name: name of the per-column method, forwarded to the code generator
    self: forwarded to the code generator
    args: positional parameter names of the implementation, forwarded as is
    kws: keyword parameters of the implementation, forwarded as is

    Returns
    -------
    The function named '_df_rolling_cov_other_none_impl' defined by the generated source.
    """
    func_text, global_vars = df_rolling_cov_other_none_codegen(method_name, self,
                                                               args=args, kws=kws)
    loc_vars = {}
    # func_text comes from this module's own code generator
    exec(func_text, global_vars, loc_vars)
    _impl = loc_vars['_df_rolling_cov_other_none_impl']

    return _impl
269+
270+
252271
def gen_df_rolling_method_impl(method_name, self, args=None, kws=None):
253272
func_text, global_vars = df_rolling_method_codegen(method_name, self,
254273
args=args, kws=kws)
@@ -308,6 +327,37 @@ def sdc_pandas_dataframe_rolling_count(self):
308327
return gen_df_rolling_method_impl('count', self)
309328

310329

330+
@sdc_overload_method(DataFrameRollingType, 'cov')
def sdc_pandas_dataframe_rolling_cov(self, other=None, pairwise=None, ddof=1):
    # No docstring here: __doc__ of this function is assigned from
    # sdc_pandas_dataframe_rolling_docstring_tmpl further down in the module.

    ty_checker = TypeChecker('Method rolling.cov().')
    ty_checker.check(self, DataFrameRollingType)

    accepted_other = (Omitted, NoneType, DataFrameType, SeriesType)
    if not isinstance(other, accepted_other) and other is not None:
        ty_checker.raise_exc(other, 'DataFrame, Series', 'other')

    accepted_pairwise = (bool, Boolean, Omitted, NoneType)
    if not isinstance(pairwise, accepted_pairwise) and pairwise is not None:
        ty_checker.raise_exc(pairwise, 'bool', 'pairwise')

    if not isinstance(ddof, (int, Integer, Omitted)):
        ty_checker.raise_exc(ddof, 'int', 'ddof')

    none_other = isinstance(other, (Omitted, NoneType)) or other is None
    # parameter names of the generated implementation mapped to their defaults as source text
    kws = {'other': 'None', 'pairwise': 'None', 'ddof': '1'}

    if none_other:
        # method _df_cov in comparison to method cov doesn't align input data
        # by replacing infinite and matched finite values with nans
        return gen_df_rolling_cov_other_none_impl('_df_cov', self, kws=kws)

    if isinstance(other, DataFrameType):
        return gen_df_rolling_method_other_df_impl('cov', self, other, kws=kws)

    # the remaining accepted type of `other` is a Series
    return gen_df_rolling_method_impl('cov', self, kws=kws)
359+
360+
311361
@sdc_overload_method(DataFrameRollingType, 'kurt')
312362
def sdc_pandas_dataframe_rolling_kurt(self):
313363

@@ -457,6 +507,28 @@ def sdc_pandas_dataframe_rolling_var(self, ddof=1):
457507
'extra_params': ''
458508
})
459509

510+
# Documentation of df.rolling.cov built from the shared docstring template.
# NOTE(review): indentation inside the triple-quoted literals is reconstructed;
# confirm it matches the layout expected by sdc_pandas_dataframe_rolling_docstring_tmpl.
sdc_pandas_dataframe_rolling_cov.__doc__ = sdc_pandas_dataframe_rolling_docstring_tmpl.format(**{
    'method_name': 'cov',
    'example_caption': 'Calculate rolling covariance.',
    'limitations_block':
    """
    Limitations
    -----------
    DataFrame elements cannot be max/min float/integer. Otherwise SDC and Pandas results are different.
    Different size of `self` and `other` can produce result different from the result of Pandas
    due to different float rounding in Python and SDC.
    """,
    'extra_params':
    """
    other: :obj:`Series` or :obj:`DataFrame`
        Other Series or DataFrame.
    pairwise: :obj:`bool`
        Calculate pairwise combinations of columns within a DataFrame.
    ddof: :obj:`int`
        Delta Degrees of Freedom.
    """
})
531+
460532
sdc_pandas_dataframe_rolling_kurt.__doc__ = sdc_pandas_dataframe_rolling_docstring_tmpl.format(**{
461533
'method_name': 'kurt',
462534
'example_caption': 'Calculate unbiased rolling kurtosis.',

sdc/datatypes/hpat_pandas_series_rolling_functions.py

Lines changed: 36 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,10 @@
3434
from numba.types import (float64, Boolean, Integer, NoneType, Number,
3535
Omitted, StringLiteral, UnicodeType)
3636

37-
from sdc.utilities.sdc_typing_utils import TypeChecker
37+
from sdc.datatypes.common_functions import _sdc_pandas_series_align
3838
from sdc.datatypes.hpat_pandas_series_rolling_types import SeriesRollingType
39+
from sdc.hiframes.pd_series_type import SeriesType
40+
from sdc.utilities.sdc_typing_utils import TypeChecker
3941
from sdc.utilities.utils import sdc_overload_method, sdc_register_jitable
4042

4143

@@ -111,15 +113,6 @@ def arr_nonnan_count(arr):
111113
return len(arr) - numpy.isnan(arr).sum()
112114

113115

114-
@sdc_register_jitable
115-
def arr_cov(x, y, ddof):
116-
"""Calculate covariance of values 1D arrays x and y of the same size"""
117-
if len(x) == 0:
118-
return numpy.nan
119-
120-
return numpy.cov(x, y, ddof=ddof)[0, 1]
121-
122-
123116
@sdc_register_jitable
124117
def _moment(arr, moment):
125118
mn = numpy.mean(arr)
@@ -451,16 +444,15 @@ def hpat_pandas_rolling_series_count_impl(self):
451444
return hpat_pandas_rolling_series_count_impl
452445

453446

454-
@sdc_rolling_overload(SeriesRollingType, 'cov')
455-
def hpat_pandas_series_rolling_cov(self, other=None, pairwise=None, ddof=1):
456-
447+
def _hpat_pandas_series_rolling_cov_check_types(self, other=None,
448+
pairwise=None, ddof=1):
449+
"""Check types of parameters of series.rolling.cov()"""
457450
ty_checker = TypeChecker('Method rolling.cov().')
458451
ty_checker.check(self, SeriesRollingType)
459452

460-
# TODO: check `other` is Series after a circular import of SeriesType fixed
461-
# accepted_other = (bool, Omitted, NoneType, SeriesType)
462-
# if not isinstance(other, accepted_other) and other is not None:
463-
# ty_checker.raise_exc(other, 'Series', 'other')
453+
accepted_other = (bool, Omitted, NoneType, SeriesType)
454+
if not isinstance(other, accepted_other) and other is not None:
455+
ty_checker.raise_exc(other, 'Series', 'other')
464456

465457
accepted_pairwise = (bool, Boolean, Omitted, NoneType)
466458
if not isinstance(pairwise, accepted_pairwise) and pairwise is not None:
@@ -469,50 +461,48 @@ def hpat_pandas_series_rolling_cov(self, other=None, pairwise=None, ddof=1):
469461
if not isinstance(ddof, (int, Integer, Omitted)):
470462
ty_checker.raise_exc(ddof, 'int', 'ddof')
471463

464+
465+
def _gen_hpat_pandas_rolling_series_cov_impl(other, align_finiteness=False):
    """Generate series.rolling.cov() implementation based on series alignment

    Parameters
    ----------
    other:
        The `other` argument as seen by the overload; used only to decide
        whether the covariance is computed against the series itself.
    align_finiteness: bool
        Passed as `finiteness` to _sdc_pandas_series_align().

    Returns
    -------
    The implementation (self, other=None, pairwise=None, ddof=1).
    """
    nan_other = isinstance(other, (Omitted, NoneType)) or other is None

    def _impl(self, other=None, pairwise=None, ddof=1):
        win = self._window
        minp = self._min_periods

        main_series = self._data
        # NOTE(review): the explicit comparison with True is kept as in the
        # original (marked noqa there) - presumably intentional, confirm.
        if nan_other == True:  # noqa
            other_series = main_series
        else:
            other_series = other

        main_aligned, other_aligned = _sdc_pandas_series_align(main_series, other_series,
                                                               finiteness=align_finiteness)
        # number of observations per window counted over the sum of the aligned series
        count = (main_aligned + other_aligned).rolling(win).count()
        bias_adj = count / (count - ddof)

        def mean(series):
            return series.rolling(win, min_periods=minp).mean()

        # cov = (mean(x * y) - mean(x) * mean(y)) * count / (count - ddof)
        return (mean(main_aligned * other_aligned) - mean(main_aligned) * mean(other_aligned)) * bias_adj

    return _impl
507490

508-
for i in prange(win, length):
509-
main_arr_range = main_arr[i + 1 - win:i + 1]
510-
other_arr_range = other_arr[i + 1 - win:i + 1]
511-
output_arr[i] = calc_cov(main_arr_range, other_arr_range, ddof, minp)
512491

513-
return pandas.Series(output_arr)
492+
@sdc_rolling_overload(SeriesRollingType, 'cov')
def hpat_pandas_series_rolling_cov(self, other=None, pairwise=None, ddof=1):
    # Validate argument types, then build the implementation that aligns
    # the input series by finiteness before computing the covariance.
    _hpat_pandas_series_rolling_cov_check_types(self, other=other,
                                                pairwise=pairwise, ddof=ddof)

    return _gen_hpat_pandas_rolling_series_cov_impl(other, align_finiteness=True)
498+
499+
500+
@sdc_rolling_overload(SeriesRollingType, '_df_cov')
def hpat_pandas_series_rolling_df_cov(self, other=None, pairwise=None, ddof=1):
    # Variant of series.rolling.cov() registered under the name '_df_cov':
    # the implementation is generated without alignment by finiteness.
    # It has its own function name so that this definition does not rebind
    # the module-level name hpat_pandas_series_rolling_cov of the 'cov' overload.
    _hpat_pandas_series_rolling_cov_check_types(self, other=other,
                                                pairwise=pairwise, ddof=ddof)

    return _gen_hpat_pandas_rolling_series_cov_impl(other)
516506

517507

518508
@sdc_rolling_overload(SeriesRollingType, 'kurt')

0 commit comments

Comments
 (0)