Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 923b23a

Browse files
Rubtsowa authored and
AlexanderKalistratov committed
add reduce (#366)
* add reduce * add reduce * change * change hpat->sdc, change input parameters * change * add example for check reduce * division into 2 functions * add selection of parameters * comment string in __init__ * import overload for DF * unskip test * correction allocation params * correction default parameters * unskipped test, added input parameters for series * delete print, skip with SDC_CONFIG_PIPELINE=1, not work with arguments * comment string in __init__ * commented function * fixed style issues * change
1 parent 90e241a commit 923b23a

File tree

5 files changed

+88
-34
lines changed

5 files changed

+88
-34
lines changed

sdc/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,10 @@
6161
"""
6262

6363
# sdc.config.numba_compiler_define_nopython_pipeline_orig = \
64-
# numba.compiler.DefaultPassBuilder.define_nopython_pipeline
64+
# numba.compiler.DefaultPassBuilder.define_nopython_pipeline
6565
# numba.compiler.DefaultPassBuilder.define_nopython_pipeline = \
66-
# sdc.datatypes.hpat_pandas_dataframe_pass.sdc_nopython_pipeline_lite_register
66+
# sdc.datatypes.hpat_pandas_dataframe_pass.sdc_nopython_pipeline_lite_register
67+
6768

6869
def _init_extension():
6970
'''Register Pandas classes and functions with Numba.

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 79 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -31,65 +31,116 @@
3131

3232
import operator
3333
import pandas
34+
import copy
35+
import numpy
36+
37+
import sdc
3438

3539
from numba import types
3640
from numba.extending import (overload, overload_method, overload_attribute)
41+
from sdc.hiframes.pd_dataframe_ext import DataFrameType
3742
from numba.errors import TypingError
43+
import sdc.datatypes.hpat_pandas_dataframe_types
44+
45+
from sdc.datatypes.hpat_pandas_series_functions import TypeChecker
46+
47+
48+
# Example func_text for func_name='count' columns=('A', 'B'):
49+
#
50+
# def _df_count_impl(df, axis=0, level=None, numeric_only=False):
51+
# series_A = init_series(get_dataframe_data(df, 0))
52+
# result_A = series_A.count(level=level)
53+
# series_B = init_series(get_dataframe_data(df, 1))
54+
# result_B = series_B.count(level=level)
55+
# return pandas.Series([result_A, result_B], ['A', 'B'])
56+
57+
58+
def _dataframe_reduce_columns_codegen(func_name, func_params, series_params, columns):
59+
result_name_list = []
60+
joined = ', '.join(func_params)
61+
func_lines = [f'def _df_{func_name}_impl({joined}):']
62+
for i, c in enumerate(columns):
63+
result_c = f'result_{c}'
64+
func_lines += [f' series_{c} = init_series(get_dataframe_data({func_params[0]}, {i}))',
65+
f' {result_c} = series_{c}.{func_name}({series_params})']
66+
result_name_list.append(result_c)
67+
all_results = ', '.join(result_name_list)
68+
all_columns = ', '.join([f"'{c}'" for c in columns])
69+
70+
func_lines += [f' return pandas.Series([{all_results}], [{all_columns}])']
71+
func_text = '\n'.join(func_lines)
72+
73+
global_vars = {'pandas': pandas, 'np': numpy,
74+
'init_series': sdc.hiframes.api.init_series,
75+
'get_dataframe_data': sdc.hiframes.pd_dataframe_ext.get_dataframe_data}
76+
77+
return func_text, global_vars
3878

39-
from sdc.datatypes.hpat_pandas_dataframe_types import DataFrameType
40-
from sdc.utils import sdc_overload_method
4179

80+
def sdc_pandas_dataframe_reduce_columns(df, func_name, params, ser_params):
81+
all_params = ['df']
82+
ser_par = []
4283

43-
@sdc_overload_method(DataFrameType, 'count')
44-
def sdc_pandas_dataframe_count(self, axis=0, level=None, numeric_only=False):
84+
for key, value in params.items():
85+
all_params.append('{}={}'.format(key, value))
86+
for key, value in ser_params.items():
87+
ser_par.append('{}={}'.format(key, value))
88+
89+
s_par = '{}'.format(', '.join(ser_par[:]))
90+
91+
df_func_name = f'_df_{func_name}_impl'
92+
93+
func_text, global_vars = _dataframe_reduce_columns_codegen(func_name, all_params, s_par, df.columns)
94+
95+
loc_vars = {}
96+
exec(func_text, global_vars, loc_vars)
97+
_reduce_impl = loc_vars[df_func_name]
98+
99+
return _reduce_impl
100+
101+
102+
@overload_method(DataFrameType, 'count')
103+
def count_overload(df, axis=0, level=None, numeric_only=False):
45104
"""
46105
Pandas DataFrame method :meth:`pandas.DataFrame.count` implementation.
47106
48107
.. only:: developer
49108
50-
Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count
109+
Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count
110+
Test: python -m sdc.runtests sdc.tests.test_dataframe.TestDataFrame.test_count1
51111
52112
Parameters
53113
-----------
54114
self: :class:`pandas.DataFrame`
55-
input arg
115+
input arg
56116
axis:
57-
*unsupported*
117+
*unsupported*
58118
level:
59-
*unsupported*
119+
*unsupported*
60120
numeric_only:
61-
*unsupported*
121+
*unsupported*
62122
63123
Returns
64124
-------
65125
:obj:`pandas.Series` or `pandas.DataFrame`
66-
returns: For each column/row the number of non-NA/null entries. If level is specified returns a DataFrame.
126+
for each column/row the number of non-NA/null entries. If level is specified returns a DataFrame.
67127
"""
68128

69-
_func_name = 'Method pandas.dataframe.count().'
129+
name = 'count'
70130

71-
if not isinstance(self, DataFrameType):
72-
raise TypingError('{} The object must be a pandas.dataframe. Given: {}'.format(_func_name, self))
131+
ty_checker = TypeChecker('Method {}().'.format(name))
132+
ty_checker.check(df, DataFrameType)
73133

74134
if not (isinstance(axis, types.Omitted) or axis == 0):
75-
raise TypingError("{} 'axis' unsupported. Given: {}".format(_func_name, axis))
135+
ty_checker.raise_exc(axis, 'unsupported', 'axis')
76136

77137
if not (isinstance(level, types.Omitted) or level is None):
78-
raise TypingError("{} 'level' unsupported. Given: {}".format(_func_name, axis))
138+
ty_checker.raise_exc(level, 'unsupported', 'level')
79139

80140
if not (isinstance(numeric_only, types.Omitted) or numeric_only is False):
81-
raise TypingError("{} 'numeric_only' unsupported. Given: {}".format(_func_name, axis))
82-
83-
def sdc_pandas_dataframe_count_impl(self, axis=0, level=None, numeric_only=False):
84-
result_data = []
85-
result_index = []
86-
87-
for dataframe_item in self._data:
88-
item_count = dataframe_item.count()
89-
item_name = dataframe_item._name
90-
result_data.append(item_count)
91-
result_index.append(item_name)
141+
ty_checker.raise_exc(numeric_only, 'unsupported', 'numeric_only')
92142

93-
return pandas.Series(data=result_data, index=result_index)
143+
params = {'axis': 0, 'level': None, 'numeric_only': False}
144+
ser_par = {'level': 'level'}
94145

95-
return sdc_pandas_dataframe_count_impl
146+
return sdc_pandas_dataframe_reduce_columns(df, name, params, ser_par)

sdc/datatypes/hpat_pandas_dataframe_types.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def iterator_type(self):
110110
return DataFrameTypeIterator(self)
111111

112112

113-
if config_pipeline_hpat_default is 0:
113+
if not config_pipeline_hpat_default:
114114
@register_model(DataFrameType)
115115
class DataFrameTypeModel(StructModel):
116116
"""
@@ -163,7 +163,7 @@ def _hpat_pandas_dataframe_init_codegen(context, builder, signature, args):
163163
return sig, _hpat_pandas_dataframe_init_codegen
164164

165165

166-
if config_pipeline_hpat_default is 0:
166+
if not config_pipeline_hpat_default:
167167
@overload(pandas.DataFrame)
168168
def hpat_pandas_dataframe(data=None, index=None, columns=None, dtype=None, copy=False):
169169
"""

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1628,3 +1628,7 @@ def _impl(df, path_or_buf=None, sep=',', na_rep='', float_format=None,
16281628
date_format, doublequote, escapechar, decimal)
16291629

16301630
return _impl
1631+
1632+
1633+
if not sdc.config.config_pipeline_hpat_default:
1634+
from sdc.datatypes.hpat_pandas_dataframe_functions import *

sdc/tests/test_dataframe.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -929,7 +929,6 @@ def test_impl(n):
929929
n = 11
930930
pd.testing.assert_series_equal(hpat_func(n), test_impl(n))
931931

932-
@skip_numba_jit
933932
def test_count(self):
934933
def test_impl(n):
935934
df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)})
@@ -939,7 +938,6 @@ def test_impl(n):
939938
n = 11
940939
pd.testing.assert_series_equal(hpat_func(n), test_impl(n))
941940

942-
@skip_numba_jit
943941
def test_count1(self):
944942
# TODO: non-numeric columns should be ignored automatically
945943
def test_impl(n):

0 commit comments

Comments
 (0)