Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit bb625dd

Browse files
Optimize getitem operations by checking for same indexes (#800)
1 parent 542c8f5 commit bb625dd

File tree

7 files changed

+94
-30
lines changed

7 files changed

+94
-30
lines changed

sdc/datatypes/common_functions.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ def _sdc_asarray(data):
561561
pass
562562

563563

564-
@sdc_overload(_sdc_asarray, jit_options={'parallel': True})
564+
@sdc_overload(_sdc_asarray)
565565
def _sdc_asarray_overload(data):
566566

567567
# TODO: extend with other types
@@ -673,14 +673,21 @@ def sdc_reindex_series(arr, index, name, by_index):
673673
pass
674674

675675

676-
@sdc_overload(sdc_reindex_series, jit_options={'parallel': True})
676+
@sdc_overload(sdc_reindex_series)
677677
def sdc_reindex_series_overload(arr, index, name, by_index):
678678
""" Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """
679679

680+
same_index_types = index is by_index
680681
data_dtype, index_dtype = arr.dtype, index.dtype
681682
data_is_str_arr = isinstance(arr.dtype, types.UnicodeType)
682683

683684
def sdc_reindex_series_impl(arr, index, name, by_index):
685+
686+
# if index types are the same, reindexing can be skipped when both indexes are the same object
687+
if same_index_types == True: # noqa
688+
if index is by_index:
689+
return pandas.Series(data=arr, index=index, name=name)
690+
684691
if data_is_str_arr == True: # noqa
685692
_res_data = [''] * len(by_index)
686693
res_data_nan_mask = numpy.zeros(len(by_index), dtype=types.bool_)
@@ -722,5 +729,3 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
722729
return pandas.Series(data=res_data, index=by_index, name=name)
723730

724731
return sdc_reindex_series_impl
725-
726-
return None

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
| Also, it contains Numba internal operators which are required for DataFrame type handling
3030
'''
3131

32-
3332
import numba
3433
import numpy
3534
import operator
@@ -66,6 +65,7 @@
6665
from sdc.datatypes.common_functions import _sdc_take, sdc_reindex_series
6766
from sdc.utilities.prange_utils import parallel_chunks
6867

68+
6969
@sdc_overload_attribute(DataFrameType, 'index')
7070
def hpat_pandas_dataframe_index(df):
7171
"""
@@ -105,6 +105,7 @@ def hpat_pandas_df_index_none_impl(df):
105105

106106
return hpat_pandas_df_index_none_impl
107107
else:
108+
108109
def hpat_pandas_df_index_impl(df):
109110
return df._index
110111

@@ -404,7 +405,6 @@ def sdc_pandas_dataframe_append_impl(df, other, _func_name, ignore_index, indexe
404405

405406
return sdc_pandas_dataframe_append_impl(df, other, _func_name, ignore_index, indexes_comparable, args)
406407

407-
408408
# Example func_text for func_name='count' columns=('A', 'B'):
409409
#
410410
# def _df_count_impl(df, axis=0, level=None, numeric_only=False):
@@ -1534,9 +1534,9 @@ def df_getitem_bool_series_idx_main_codelines(self, idx):
15341534
else:
15351535
func_lines = [f' length = {df_length_expr(self)}',
15361536
f' self_index = self.index',
1537-
f' idx_reindexed = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)',
1538-
f' res_index = getitem_by_mask(self_index, idx_reindexed._data)',
1539-
f' selected_pos = getitem_by_mask(range(length), idx_reindexed._data)']
1537+
f' reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)',
1538+
f' res_index = getitem_by_mask(self_index, reindexed_idx._data)',
1539+
f' selected_pos = getitem_by_mask(numpy.arange(length), reindexed_idx._data)']
15401540

15411541
results = []
15421542
for i, col in enumerate(self.columns):
@@ -1835,6 +1835,7 @@ def _df_getitem_str_literal_idx_impl(self, idx):
18351835
return _df_getitem_str_literal_idx_impl
18361836

18371837
if isinstance(idx, types.UnicodeType):
1838+
18381839
def _df_getitem_unicode_idx_impl(self, idx):
18391840
# http://numba.pydata.org/numba-doc/dev/developer/literal.html#specifying-for-literal-typing
18401841
# literally raises special exception to call getitem with literal idx value got from unicode
@@ -1886,6 +1887,7 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx):
18861887
if isinstance(idx, types.Tuple) and isinstance(idx[1], types.Literal):
18871888
col = idx[1].literal_value
18881889
if -1 < col < len(self.dataframe.columns):
1890+
18891891
def df_getitem_iat_tuple_impl(self, idx):
18901892
row, _ = idx
18911893
if -1 < row < len(self._dataframe.index):
@@ -2335,6 +2337,7 @@ def df_set_column_overload(self, key, value):
23352337
return gen_df_replace_column_impl(self, key)
23362338

23372339
if isinstance(key, types.UnicodeType):
2340+
23382341
def _df_set_column_unicode_key_impl(self, key, value):
23392342
# http://numba.pydata.org/numba-doc/dev/developer/literal.html#specifying-for-literal-typing
23402343
# literally raises special exception to call df._set_column with literal idx value got from unicode

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -420,29 +420,27 @@ def hpat_pandas_series_getitem_idx_list_impl(self, idx):
420420
' Given: self.index={}, idx.index={}'
421421
raise TypingError(msg.format(_func_name, self.index, idx.index))
422422

423-
def hpat_pandas_series_getitem_idx_bool_indexer_impl(self, idx):
423+
def _series_getitem_idx_bool_indexer_impl(self, idx):
424424

425425
if none_indexes == True: # noqa
426426
if len(self) > len(idx):
427427
msg = "Unalignable boolean Series provided as indexer " + \
428428
"(index of the boolean Series and of the indexed object do not match)."
429429
raise IndexingError(msg)
430430

431-
return pandas.Series(
432-
data=numpy_like.getitem_by_mask(self._data, idx._data),
433-
index=numpy_like.getitem_by_mask(range(len(self)), idx._data),
434-
name=self._name
435-
)
431+
self_index = range(len(self))
432+
reindexed_idx = idx
436433
else:
437434
self_index = self.index
438-
idx_reindexed = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)
439-
return pandas.Series(
440-
data=numpy_like.getitem_by_mask(self._data, idx_reindexed._data),
441-
index=numpy_like.getitem_by_mask(self_index, idx_reindexed._data),
442-
name=self._name
443-
)
435+
reindexed_idx = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)
436+
437+
return pandas.Series(
438+
data=numpy_like.getitem_by_mask(self._data, reindexed_idx._data),
439+
index=numpy_like.getitem_by_mask(self_index, reindexed_idx._data),
440+
name=self._name
441+
)
444442

445-
return hpat_pandas_series_getitem_idx_bool_indexer_impl
443+
return _series_getitem_idx_bool_indexer_impl
446444

447445
# idx is Series and it's index is None, idx.dtype is not Boolean
448446
if (isinstance(idx, SeriesType) and index_is_none

sdc/str_arr_ext.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1543,3 +1543,14 @@ def _sdc_str_arr_operator_mul_impl(self, other):
15431543
return res_arr
15441544

15451545
return _sdc_str_arr_operator_mul_impl
1546+
1547+
1548+
@lower_builtin(operator.is_, StringArrayType, StringArrayType)
1549+
def sdc_str_arr_operator_is(context, builder, sig, args):
1550+
1551+
# meminfo ptr uniquely identifies each StringArray allocation
1552+
a = context.make_helper(builder, string_array_type, args[0])
1553+
b = context.make_helper(builder, string_array_type, args[1])
1554+
ma = builder.ptrtoint(a.meminfo, cgutils.intp_t)
1555+
mb = builder.ptrtoint(b.meminfo, cgutils.intp_t)
1556+
return builder.icmp_signed('==', ma, mb)

sdc/tests/test_series.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6984,6 +6984,31 @@ def test_impl(A, idx):
69846984
sdc_exception = context.exception
69856985
self.assertIn(str(sdc_exception), str(pandas_exception))
69866986

6987+
@skip_sdc_jit('Not implemented in old-pipeline')
6988+
def test_series_getitem_idx_bool_series3(self):
6989+
""" Verifies Series.getitem by a Boolean Series mask whose index is the same object as the index of the Series """
6990+
def test_impl(A, mask, index):
6991+
S = pd.Series(A, index)
6992+
idx = pd.Series(mask, S.index)
6993+
return S[idx]
6994+
hpat_func = self.jit(test_impl)
6995+
6996+
n = 11
6997+
np.random.seed(0)
6998+
6999+
idxs_to_test = [
7000+
np.arange(n),
7001+
np.arange(n, dtype='float'),
7002+
gen_strlist(n, 2, 'abcd123 ')
7003+
]
7004+
series_data = np.arange(n)
7005+
mask = np.random.choice([True, False], n)
7006+
for index in idxs_to_test:
7007+
with self.subTest(series_index=index):
7008+
result = hpat_func(series_data, mask, index)
7009+
result_ref = test_impl(series_data, mask, index)
7010+
pd.testing.assert_series_equal(result, result_ref)
7011+
69877012
@skip_sdc_jit('Not implemented in old-pipeline')
69887013
def test_series_getitem_idx_bool_series_reindex(self):
69897014
""" Verifies Series.getitem with reindexing by mask indicated by a Boolean Series

sdc/tests/tests_perf/data_generator.py

Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,12 @@ def gen_series_fixed_str(data_num, data_length, input_data, data_width):
2828
return results
2929

3030

31-
def gen_arr_from_input(input_data, data_length, random=True):
31+
def gen_arr_from_input(data_length, input_data, random=True, repeat=True, seed=None):
32+
if seed is not None:
33+
np.random.seed(seed)
34+
3235
if random:
33-
return np.random.choice(input_data, data_length)
36+
return np.random.choice(input_data, data_length, replace=repeat)
3437
else:
3538
return np.asarray(multiply_oneds_data(input_data, data_length))
3639

@@ -50,7 +53,7 @@ def gen_arr_of_dtype(data_length, dtype='float', random=True, limits=None, nuniq
5053

5154
# prefer generation based on input data if it's provided
5255
if input_data is not None:
53-
return gen_arr_from_input(input_data, data_length, random=random)
56+
return gen_arr_from_input(data_length, input_data, random=random)
5457

5558
if dtype == 'float':
5659
return np.random.ranf(data_length)
@@ -67,6 +70,21 @@ def gen_arr_of_dtype(data_length, dtype='float', random=True, limits=None, nuniq
6770
return None
6871

6972

73+
def gen_unique_values(data_length, dtype='int', seed=None):
74+
"""
75+
data_length: result length of array of unique values,
76+
dtype: dtype of generated array,
77+
seed: seed to initialize random state
78+
"""
79+
80+
if dtype in ('float', 'int'):
81+
values = np.arange(data_length, dtype=dtype)
82+
if dtype == 'str':
83+
values = gen_strlist(data_length)
84+
85+
return gen_arr_from_input(data_length, values, repeat=False, seed=seed)
86+
87+
7088
def gen_series(data_length, dtype='float', random=True, limits=None, nunique=1000, input_data=None, seed=None):
7189
"""
7290
data_length: result series length,
@@ -82,7 +100,7 @@ def gen_series(data_length, dtype='float', random=True, limits=None, nunique=100
82100

83101
# prefer generation based on input data if it's provided
84102
if input_data is not None:
85-
series_data = gen_arr_from_input(input_data, data_length, random=random)
103+
series_data = gen_arr_from_input(data_length, input_data, random=random)
86104
else:
87105
series_data = gen_arr_of_dtype(data_length, dtype=dtype, limits=limits, nunique=nunique)
88106

@@ -98,13 +116,15 @@ def gen_df(data_length,
98116
limits=None,
99117
nunique=1000,
100118
input_data=None,
119+
index_gen=None,
101120
seed=None):
102121
"""
103122
data_length: result series length,
104123
dtype: dtype of generated series,
105124
limits: a tuple of (min, max) limits for numeric series,
106125
nunique: number of unique values in generated series,
107126
input_data: 2D sequence of values used for generation of dataframe columns,
127+
index_gen: callable that will generate index of needed size,
108128
seed: seed to initialize random state
109129
"""
110130

@@ -116,10 +136,10 @@ def gen_df(data_length,
116136
for i in range(columns):
117137
# prefer generation based on input data if it's provided
118138
if (input_data is not None and i < len(input_data)):
119-
col_data = gen_arr_from_input(input_data[i], data_length, random=random)
139+
col_data = gen_arr_from_input(data_length, input_data[i], random=random)
120140
else:
121141
col_data = gen_arr_of_dtype(data_length, dtype=dtype, limits=limits, nunique=nunique)
122142
all_data.append(col_data)
123143

124-
# TODO: support index generation
125-
return pd.DataFrame(dict(zip(col_names, all_data)))
144+
index_data = index_gen(data_length) if index_gen is not None else None
145+
return pd.DataFrame(dict(zip(col_names, all_data)), index=index_data)

sdc/tests/tests_perf/test_perf_df.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from sdc.tests.test_utils import test_global_input_data_float64
3737
from .generator import generate_test_cases
3838
from .generator import TestCase as TC
39-
from .data_generator import gen_df, gen_series, gen_arr_of_dtype
39+
from .data_generator import gen_df, gen_series, gen_arr_of_dtype, gen_unique_values
4040

4141

4242
# python -m sdc.runtests sdc.tests.tests_perf.test_perf_df.TestDataFrameMethods.test_df_{method_name}
@@ -88,6 +88,8 @@ def _test_case(self, pyfunc, name, total_data_length, input_data=None, data_num=
8888
TC(name='getitem_idx_bool_array', size=[10 ** 7], call_expr='df[idx]', usecase_params='df, idx',
8989
data_gens=(gen_df, partial(gen_arr_of_dtype, dtype='bool', random=False)),
9090
input_data=[None, [True, False, False, True, False, True]]),
91+
TC(name='getitem_filter_by_value', size=[10 ** 7], call_expr='df[df.A > 0]', usecase_params='df',
92+
data_gens=(partial(gen_df, index_gen=gen_unique_values), )),
9193
]
9294

9395
generate_test_cases(cases, TestDataFrameMethods, 'df')

0 commit comments

Comments
 (0)