Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit f31650f

Browse files
Fixing perf issue with getitem by Boolean mask (#790)
1 parent 4e2cd36 commit f31650f

File tree

4 files changed

+64
-13
lines changed

4 files changed

+64
-13
lines changed

sdc/datatypes/common_functions.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,24 @@ def _sdc_take_str_arr_impl(data, indexes):
629629

630630
return _sdc_take_str_arr_impl
631631

632+
elif (isinstance(data, types.RangeType) and isinstance(data.dtype, types.Integer)):
633+
arr_dtype = data.dtype
634+
635+
def _sdc_take_array_impl(data, indexes):
    """Gather elements of an integer range at the given positions.

    Parameters
    ----------
    data : range
        Source range; the element at position ``k`` is
        ``data.start + data.step * k``.
    indexes : sequence of int
        Positions to take. Each must satisfy ``0 <= position < len(data)``.

    Returns
    -------
    numpy array
        Array of ``len(indexes)`` elements with dtype ``arr_dtype`` (the
        range's integer dtype captured from the enclosing scope).

    Raises
    ------
    IndexError
        If at least one position lies outside the range.
    """
    res_size = len(indexes)
    data_size = len(data)
    index_errors = 0
    res_arr = numpy.empty(res_size, dtype=arr_dtype)
    for i in numba.prange(res_size):
        position = indexes[i]
        # Validate the position itself instead of comparing the computed
        # value with data.stop: that comparison lets negative positions
        # through and rejects valid positions of a range with negative step.
        if position < 0 or position >= data_size:
            index_errors += 1
        res_arr[i] = data.start + data.step * position
    # Errors are only counted inside the loop and reported once afterwards.
    if index_errors:
        raise IndexError("_sdc_take: index out-of-bounds")
    return res_arr
647+
648+
return _sdc_take_array_impl
649+
632650
return None
633651

634652

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1454,13 +1454,16 @@ def df_length_expr(self):
14541454
return '0'
14551455

14561456

1457-
def df_index_expr(self, length_expr=None):
1457+
def df_index_expr(self, length_expr=None, as_range=False):
14581458
"""Generate expression to get or create index of DF"""
14591459
if isinstance(self.index, types.NoneType):
14601460
if length_expr is None:
14611461
length_expr = df_length_expr(self)
14621462

1463-
return f'numpy.arange({length_expr})'
1463+
if as_range:
1464+
return f'range({length_expr})'
1465+
else:
1466+
return f'numpy.arange({length_expr})'
14641467

14651468
return 'self._index'
14661469

@@ -1507,12 +1510,13 @@ def df_getitem_bool_series_idx_main_codelines(self, idx):
15071510
# optimization for default indexes in df and idx when index alignment is trivial
15081511
if (isinstance(self.index, types.NoneType) and isinstance(idx.index, types.NoneType)):
15091512
func_lines = [f' length = {df_length_expr(self)}',
1513+
f' self_index = {df_index_expr(self, as_range=True)}',
15101514
f' if length > len(idx):',
15111515
f' msg = "Unalignable boolean Series provided as indexer " + \\',
15121516
f' "(index of the boolean Series and of the indexed object do not match)."',
15131517
f' raise IndexingError(msg)',
15141518
f' # do not trim idx._data to length as getitem_by_mask handles such case',
1515-
f' res_index = getitem_by_mask(self.index, idx._data)',
1519+
f' res_index = getitem_by_mask(self_index, idx._data)',
15161520
f' # df index is default, same as positions so it can be used in take']
15171521
results = []
15181522
for i, col in enumerate(self.columns):
@@ -1532,7 +1536,7 @@ def df_getitem_bool_series_idx_main_codelines(self, idx):
15321536
f' self_index = self.index',
15331537
f' idx_reindexed = sdc_reindex_series(idx._data, idx.index, idx._name, self_index)',
15341538
f' res_index = getitem_by_mask(self_index, idx_reindexed._data)',
1535-
f' selected_pos = getitem_by_mask(numpy.arange(length), idx_reindexed._data)']
1539+
f' selected_pos = getitem_by_mask(range(length), idx_reindexed._data)']
15361540

15371541
results = []
15381542
for i, col in enumerate(self.columns):
@@ -1553,11 +1557,13 @@ def df_getitem_bool_series_idx_main_codelines(self, idx):
15531557

15541558
def df_getitem_bool_array_idx_main_codelines(self, idx):
15551559
"""Generate main code lines for df.getitem"""
1560+
15561561
func_lines = [f' length = {df_length_expr(self)}',
15571562
f' if length != len(idx):',
15581563
f' raise ValueError("Item wrong length.")',
1559-
f' taken_pos = getitem_by_mask(numpy.arange(length), idx)',
1560-
f' res_index = sdc_take(self.index, taken_pos)']
1564+
f' self_index = {df_index_expr(self, as_range=True)}',
1565+
f' taken_pos = getitem_by_mask(self_index, idx)',
1566+
f' res_index = sdc_take(self_index, taken_pos)']
15611567
results = []
15621568
for i, col in enumerate(self.columns):
15631569
res_data = f'res_data_{i}'
@@ -1635,12 +1641,13 @@ def df_getitem_bool_series_idx_codegen(self, idx):
16351641
Example of generated implementation with provided index:
16361642
def _df_getitem_bool_series_idx_impl(self, idx):
16371643
length = len(self._data[0])
1644+
self_index = range(len(self._data[0]))
16381645
if length > len(idx):
16391646
msg = "Unalignable boolean Series provided as indexer " + \
16401647
"(index of the boolean Series and of the indexed object do not match)."
16411648
raise IndexingError(msg)
16421649
# do not trim idx._data to length as getitem_by_mask handles such case
1643-
res_index = getitem_by_mask(self.index, idx._data)
1650+
res_index = getitem_by_mask(self_index, idx._data)
16441651
# df index is default, same as positions so it can be used in take
16451652
data_0 = self._data[0]
16461653
res_data_0 = sdc_take(data_0, res_index)
@@ -1667,8 +1674,9 @@ def _df_getitem_bool_array_idx_impl(self, idx):
16671674
length = len(self._data[0])
16681675
if length != len(idx):
16691676
raise ValueError("Item wrong length.")
1670-
taken_pos = getitem_by_mask(numpy.arange(length), idx)
1671-
res_index = sdc_take(self.index, taken_pos)
1677+
self_index = range(len(self._data[0]))
1678+
taken_pos = getitem_by_mask(self_index, idx)
1679+
res_index = sdc_take(self_index, taken_pos)
16721680
data_0 = self._data[0]
16731681
res_data_0 = sdc_take(data_0, taken_pos)
16741682
data_1 = self._data[1]

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,7 @@ def hpat_pandas_series_getitem_idx_bool_indexer_impl(self, idx):
430430

431431
return pandas.Series(
432432
data=numpy_like.getitem_by_mask(self._data, idx._data),
433-
index=numpy_like.getitem_by_mask(self.index, idx._data),
433+
index=numpy_like.getitem_by_mask(range(len(self)), idx._data),
434434
name=self._name
435435
)
436436
else:

sdc/functions/numpy_like.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -900,9 +900,30 @@ def getitem_by_mask(arr, idx):
900900

901901
@sdc_overload(getitem_by_mask)
902902
def getitem_by_mask_overload(arr, idx):
903-
dtype = arr.dtype
903+
"""
904+
Creates a new array from arr by selecting elements indicated by Boolean mask idx.
905+
906+
Parameters
907+
-----------
908+
arr: :obj:`Array` or :obj:`Range`
909+
Input array or range
910+
idx: :obj:`Array` of dtype :class:`bool`
911+
Boolean mask
912+
913+
Returns
914+
-------
915+
:obj:`Array` of the same dtype as arr
916+
Array with only elements indicated by mask left
917+
918+
"""
919+
920+
is_range = isinstance(arr, types.RangeType) and isinstance(arr.dtype, types.Integer)
904921
is_str_arr = arr == string_array_type
922+
if not (isinstance(arr, types.Array) or is_str_arr or is_range):
923+
return
905924

925+
res_dtype = arr.dtype
926+
is_str_arr = arr == string_array_type
906927
def getitem_by_mask_impl(arr, idx):
907928
chunks = parallel_chunks(len(arr))
908929
arr_len = numpy.empty(len(chunks), dtype=numpy.int64)
@@ -921,15 +942,19 @@ def getitem_by_mask_impl(arr, idx):
921942
result_data = [''] * length
922943
result_nan_mask = numpy.empty(shape=length, dtype=types.bool_)
923944
else:
924-
result_data = numpy.empty(shape=length, dtype=dtype)
945+
result_data = numpy.empty(shape=length, dtype=res_dtype)
925946
for i in prange(len(chunks)):
926947
chunk = chunks[i]
927948
new_start = int(sum(arr_len[0:i]))
928949
current_pos = new_start
929950

930951
for j in range(chunk.start, chunk.stop):
931952
if idx[j]:
932-
result_data[current_pos] = arr[j]
953+
if is_range == True: # noqa
954+
value = arr.start + arr.step * j
955+
else:
956+
value = arr[j]
957+
result_data[current_pos] = value
933958
if is_str_arr == True: # noqa
934959
result_nan_mask[current_pos] = isna(arr, j)
935960
current_pos += 1

0 commit comments

Comments
 (0)