Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 5d201ab

Browse files
authored
Series str contains (#793)
1 parent da52b52 commit 5d201ab

File tree

5 files changed

+165
-4
lines changed

5 files changed

+165
-4
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2019-2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
import pandas as pd
28+
from numba import njit
29+
30+
31+
@njit
def series_str_contains():
    """Build a small string Series and test each element for the substring 'o'."""
    words = pd.Series(['dog', 'foo', 'bar'])
    has_o = words.str.contains('o')

    return has_o  # Expect series of True, True, False


print(series_str_contains())

sdc/datatypes/hpat_pandas_stringmethods_functions.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ def hpat_pandas_stringmethods_upper_impl(self):
8989
from sdc.utilities.utils import sdc_overload_method, sdc_register_jitable
9090
from sdc.hiframes.api import get_nan_mask
9191
from sdc.str_arr_ext import str_arr_set_na_by_mask, create_str_arr_from_list
92+
from sdc.datatypes.common_functions import SDCLimitation
9293

9394

9495
@sdc_overload_method(StringMethodsType, 'center')
@@ -151,6 +152,87 @@ def hpat_pandas_stringmethods_center_impl(self, width, fillchar=' '):
151152
return hpat_pandas_stringmethods_center_impl
152153

153154

155+
@sdc_overload_method(StringMethodsType, 'contains')
def hpat_pandas_stringmethods_contains(self, pat, case=True, flags=0, na=None, regex=True):
    """
    Intel Scalable Dataframe Compiler User Guide
    ********************************************
    Pandas API: pandas.Series.str.contains

    Limitations
    -----------
    - Series elements are expected to be Unicode strings. Elements cannot be `NaNs`.
    - Parameter ``na`` is supported only with default value ``None``.
    - Parameter ``flags`` is supported only with default value ``0``.
    - Parameter ``regex`` is supported only with default value ``True``.
    - ``pat`` is matched as a literal substring; regular-expression syntax in ``pat`` is not interpreted.
    - With ``case=False`` both ``pat`` and every element are lower-cased before comparison.

    Examples
    --------
    .. literalinclude:: ../../../examples/series/str/series_str_contains.py
       :language: python
       :lines: 27-
       :caption: Tests if string element contains a pattern.
       :name: ex_series_str_contains

    .. command-output:: python ./series/str/series_str_contains.py
       :cwd: ../../../examples

    .. seealso::
        :ref:`Series.str.startswith <pandas.Series.str.startswith>`
            Same as endswith, but tests the start of string.
        :ref:`Series.str.endswith <pandas.Series.str.endswith>`
            Same as startswith, but tests the end of string.

    Intel Scalable Dataframe Compiler Developer Guide
    *************************************************

    Pandas Series method :meth:`pandas.core.strings.StringMethods.contains()` implementation.

    .. only:: developer

        Test: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_contains
    """

    # Typing-time validation: reject argument types this overload cannot handle.
    ty_checker = TypeChecker('Method contains().')
    ty_checker.check(self, StringMethodsType)

    if not isinstance(pat, (StringLiteral, UnicodeType)):
        ty_checker.raise_exc(pat, 'str', 'pat')

    if not isinstance(na, (Omitted, NoneType)) and na is not None:
        ty_checker.raise_exc(na, 'none', 'na')

    if not isinstance(case, (Boolean, Omitted)) and case is not True:
        ty_checker.raise_exc(case, 'bool', 'case')

    if not isinstance(flags, (Omitted, Integer)) and flags != 0:
        ty_checker.raise_exc(flags, 'int64', 'flags')

    if not isinstance(regex, (Omitted, Boolean)) and regex is not True:
        ty_checker.raise_exc(regex, 'bool', 'regex')

    def hpat_pandas_stringmethods_contains_impl(self, pat, case=True, flags=0, na=None, regex=True):
        # Run-time validation: values of the right type that are still unsupported.
        if flags != 0:
            raise SDCLimitation("Method contains(). Unsupported parameter. Given 'flags' != 0")

        if not regex:
            raise SDCLimitation("Method contains(). Unsupported parameter. Given 'regex' is False")

        # Normalise the pattern once, outside the loop.
        if not case:
            _pat = pat.lower()
        else:
            _pat = pat

        len_data = len(self._data)
        res_list = numpy.empty(len_data, numba.types.boolean)
        for idx in numba.prange(len_data):
            item = self._data._data[idx]
            if case:
                res_list[idx] = _pat in item
            else:
                # Case-insensitive search must normalise the element too;
                # lower-casing only the pattern would miss e.g. 'DOG' for 'og'.
                res_list[idx] = _pat in item.lower()

        return pandas.Series(res_list, self._data._index, name=self._data._name)

    return hpat_pandas_stringmethods_contains_impl
234+
235+
154236
@sdc_overload_method(StringMethodsType, 'endswith')
155237
def hpat_pandas_stringmethods_endswith(self, pat, na=None):
156238
"""

sdc/hiframes/pd_series_ext.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -498,7 +498,7 @@ def resolve_value_counts(self, ary, args, kws):
498498
'lower', 'lstrip', 'rjust', 'rstrip', 'startswith', 'strip', 'zfill',
499499
'isspace', 'islower', 'isalpha', 'isalnum', 'istitle', 'isnumeric',
500500
'isdigit', 'isdecimal', 'isupper', 'capitalize', 'title', 'swapcase',
501-
'casefold',
501+
'casefold', 'contains'
502502
]
503503
"""
504504
Functions which are used from Numba directly by calling from StringMethodsType
@@ -519,9 +519,9 @@ def resolve_value_counts(self, ary, args, kws):
519519
class SeriesStrMethodAttribute(AttributeTemplate):
520520
key = StringMethodsType
521521

522-
@bound_function("strmethod.contains")
523-
def resolve_contains(self, ary, args, kws):
524-
return signature(SeriesType(types.bool_), *args)
522+
# @bound_function("strmethod.contains")
523+
# def resolve_contains(self, ary, args, kws):
524+
# return signature(SeriesType(types.bool_), *args)
525525

526526
# @bound_function("strmethod.len")
527527
# def resolve_len(self, ary, args, kws):

sdc/tests/test_series.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from sdc.tests.gen_test_data import ParquetGenerator
5757

5858
from sdc.tests.test_utils import test_global_input_data_unicode_kind1
59+
from sdc.datatypes.common_functions import SDCLimitation
5960

6061

6162
_cov_corr_series = [(pd.Series(x), pd.Series(y)) for x, y in [
@@ -297,6 +298,10 @@ def rstrip_usecase(series, to_strip=None):
297298
return series.str.rstrip(to_strip)
298299

299300

301+
def contains_usecase(series, pat, case=True, flags=0, na=None, regex=True):
    # Thin wrapper around Series.str.contains; arguments are forwarded positionally.
    matches = series.str.contains(pat, case, flags, na, regex)
    return matches
303+
304+
300305
class TestSeries(
301306
TestSeries_apply,
302307
TestSeries_map,
@@ -6093,6 +6098,41 @@ def test_series_isupper_str(self):
60936098
s = pd.Series(data)
60946099
pd.testing.assert_series_equal(cfunc(s), isupper_usecase(s))
60956100

6101+
def test_series_contains(self):
    # Compare the jitted str.contains with pandas for several patterns,
    # both case-sensitive and case-insensitive.
    sdc_func = self.jit(contains_usecase)
    series = pd.Series(['Mouse', 'dog', 'house and parrot', '23'])
    patterns = ['og', 'Og', 'OG', 'o']
    case_flags = [True, False]
    for pat in patterns:
        for case in case_flags:
            with self.subTest(pat=pat, case=case):
                actual = sdc_func(series, pat, case)
                expected = contains_usecase(series, pat, case)
                pd.testing.assert_series_equal(actual, expected)
6108+
6109+
def test_series_contains_with_na_flags_regex(self):
    # Passing the default values of flags/na/regex explicitly must behave like pandas.
    sdc_func = self.jit(contains_usecase)
    series = pd.Series(['Mouse', 'dog', 'house and parrot', '23'])
    pat = 'og'
    actual = sdc_func(series, pat, flags=0, na=None, regex=True)
    expected = contains_usecase(series, pat, flags=0, na=None, regex=True)
    pd.testing.assert_series_equal(actual, expected)
6115+
6116+
def test_series_contains_unsupported(self):
    # Each unsupported argument must raise the documented exception with the documented message.
    sdc_func = self.jit(contains_usecase)
    series = pd.Series(['Mouse', 'dog', 'house and parrot', '23'])
    pat = 'og'

    # (expected exception type, keyword arguments, expected message fragment)
    unsupported_cases = [
        (SDCLimitation, {'flags': 1},
         "Method contains(). Unsupported parameter. Given 'flags' != 0"),
        (TypingError, {'na': 0},
         'Method contains(). The object na\n given: int64\n expected: none'),
        (SDCLimitation, {'regex': False},
         "Method contains(). Unsupported parameter. Given 'regex' is False"),
    ]

    for exc_type, kwargs, msg in unsupported_cases:
        with self.assertRaises(exc_type) as raises:
            sdc_func(series, pat, **kwargs)
        self.assertIn(msg, str(raises.exception))
6135+
60966136
@skip_sdc_jit('Old-style implementation returns string, but not series')
60976137
def test_series_describe_numeric(self):
60986138
def test_impl(A):

sdc/tests/tests_perf/test_perf_series_str.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def _test_case(self, pyfunc, name, total_data_length, input_data=None, data_num=
7777
TC(name='capitalize', size=[10 ** 4, 10 ** 5]),
7878
TC(name='casefold', size=[10 ** 4, 10 ** 5]),
7979
TC(name='center', params='1', size=[10 ** 4, 10 ** 5], input_data=test_global_input_data_unicode_kind1),
80+
TC(name='contains', params='"a"', size=[10 ** 4, 10 ** 5]),
8081
TC(name='endswith', params='"e"', size=[10 ** 4, 10 ** 5]),
8182
TC(name='find', params='"e"', size=[10 ** 4, 10 ** 5]),
8283
TC(name='isalnum', size=[10 ** 4, 10 ** 5]),

0 commit comments

Comments
 (0)