Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 6315c52

Browse files
authored
Parallel isin (#603)
1 parent e061965 commit 6315c52

File tree

2 files changed

+32
-7
lines changed

2 files changed

+32
-7
lines changed

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,14 @@
6363
from sdc.str_arr_type import (StringArrayType, string_array_type)
6464
from sdc.str_arr_ext import (str_arr_is_na, str_arr_set_na, num_total_chars,
6565
pre_alloc_string_array, cp_str_list_to_array,
66-
create_str_arr_from_list, str_arr_set_na_by_mask)
66+
create_str_arr_from_list, str_arr_set_na_by_mask,
67+
str_list_to_array)
6768
from sdc.utilities.utils import to_array, sdc_overload, sdc_overload_method, sdc_overload_attribute
6869
from sdc import sdc_autogenerated
6970
from sdc.functions import numpy_like
7071
from sdc.hiframes.api import isna
7172
from sdc.datatypes.hpat_pandas_groupby_functions import init_series_groupby
73+
from sdc.utilities.prange_utils import parallel_chunks
7274

7375
from .pandas_series_functions import apply
7476
from .pandas_series_functions import map as _map
@@ -2048,7 +2050,7 @@ def hpat_pandas_series_isin(self, values):
20482050
Pandas Series method :meth:`pandas.Series.isin` implementation.
20492051
20502052
.. only:: developer
2051-
Test: python -m sdc.runtests sdc.tests.test_series.TestSeries.test_series_isin_list1
2053+
Test: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_isin*
20522054
"""
20532055

20542056
_func_name = 'Method isin().'
@@ -2059,10 +2061,31 @@ def hpat_pandas_series_isin(self, values):
20592061
if not isinstance(values, (types.Set, types.List)):
20602062
ty_checker.raise_exc(values, 'set or list', 'values')
20612063

2062-
def hpat_pandas_series_isin_impl(self, values):
2063-
# TODO: replace with below line when Numba supports np.isin in nopython mode
2064-
# return pandas.Series(np.isin(self._data, values))
2065-
return pandas.Series(data=[(x in values) for x in self._data], index=self._index, name=self._name)
2064+
if isinstance(values.dtype, (types.UnicodeType, types.StringLiteral)):
2065+
def hpat_pandas_series_isin_impl(self, values):
2066+
# TODO: replace with below line when Numba supports np.isin in nopython mode
2067+
# return pandas.Series(np.isin(self._data, values), index=self._index, name=self._name)
2068+
2069+
values = str_list_to_array(list(values))
2070+
values = set(values)
2071+
data_len = len(self._data)
2072+
result = numpy.empty(data_len, dtype=numpy.bool_)
2073+
for i in prange(data_len):
2074+
result[i] = self._data[i] in values
2075+
2076+
return pandas.Series(data=result, index=self._index, name=self._name)
2077+
else:
2078+
def hpat_pandas_series_isin_impl(self, values):
2079+
# TODO: replace with below line when Numba supports np.isin in nopython mode
2080+
# return pandas.Series(np.isin(self._data, values), index=self._index, name=self._name)
2081+
2082+
values = set(values)
2083+
data_len = len(self._data)
2084+
result = numpy.empty(data_len, dtype=numpy.bool_)
2085+
for i in prange(data_len):
2086+
result[i] = self._data[i] in values
2087+
2088+
return pandas.Series(data=result, index=self._index, name=self._name)
20662089

20672090
return hpat_pandas_series_isin_impl
20682091

sdc/tests/tests_perf/test_perf_series.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,9 @@ def _test_case(self, pyfunc, name, total_data_length, input_data=None, data_num=
107107
TC(name='idxmin', size=[10 ** 8], check_skipna=True),
108108
TC(name='iloc', size=[10 ** 7], call_expr='data.iloc[100000]', usecase_params='data'),
109109
TC(name='index', size=[10 ** 7], call_expr='data.index', usecase_params='data'),
110-
TC(name='isin', size=[10 ** 7], call_expr='data.isin([0])', usecase_params='data'),
110+
TC(name='isin', size=[10 ** 7], params='values=[0]'),
111+
TC(name='isin', size=[10 ** 7], call_expr='data.isin(["a", "q", "c", "q", "d", "q", "e"])', usecase_params='data',
112+
input_data=[['a', 'b', 'q', 'w', 'c', 'd', 'e', 'r']]),
111113
TC(name='isna', size=[10 ** 7]),
112114
TC(name='isnull', size=[10 ** 7]),
113115
TC(name='le', size=[10 ** 7], params='other', data_num=2),

0 commit comments

Comments
 (0)