Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit abc9fbb

Browse files
Add PositionalIndex and EmptyIndex types and align indexes API (#951)
* Add PositionalIndex and EmptyIndex types and align indexes API Motivation: remove types.None as the representation of the default pd.RangeIndex created when index=None is used in DF and Series ctors, which requires most Series and DF implementations to branch based on index type. Instead, all Series and DF functions should use a common index API aligned with the pandas one. Changed in this PR: - types.None index is removed; - Added EmptyIndexType (instead of types.None index) to represent an empty pandas index with dtype='object', i.e. pd.Index([]); - Added PositionalIndexType as a replacement for types.None index for non-empty DFs and Series; - Changed unboxing of RangeIndex objects to either RangeIndexType or PositionalIndexType depending on whether it's the default range or not; - Changed fix_df_index and dataframe_constructor.py to create EmptyIndexType or another index type depending on whether the tuple of columns is empty; - Moved implementations for index types from common and numpy_like functions to specific index files; - Updated operators and other Series method implementations to avoid branching on index types and to use the index objects API instead; - Reorganized index tests and added tests verifying specific functions (e.g. series reindexing) for all types of indexes. * Resolving minor issues and fixing tests * Fixing types.None index in read_csv * Fixing PEP remarks * Revert the change in Series.astype that broke float to str conversion
1 parent defe2aa commit abc9fbb

36 files changed

+3696
-1904
lines changed

sdc/datatypes/common_functions.py

Lines changed: 108 additions & 213 deletions
Large diffs are not rendered by default.

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 173 additions & 215 deletions
Large diffs are not rendered by default.

sdc/datatypes/hpat_pandas_groupby_functions.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,13 @@
4141
from numba.core.typing import signature
4242
from numba import literally
4343

44-
from sdc.datatypes.common_functions import sdc_arrays_argsort, _sdc_asarray, _sdc_take
44+
from sdc.datatypes.common_functions import sdc_arrays_argsort, _sdc_asarray
4545
from sdc.datatypes.hpat_pandas_groupby_types import DataFrameGroupByType, SeriesGroupByType
4646
from sdc.utilities.sdc_typing_utils import TypeChecker, kwsparams2list, sigparams2list
4747
from sdc.utilities.utils import (sdc_overload, sdc_overload_method, sdc_register_jitable)
4848
from sdc.hiframes.pd_series_type import SeriesType
4949
from sdc.str_ext import string_type
50+
from sdc.functions.numpy_like import take as nplike_take
5051

5152

5253
performance_limitation = "This function may reveal slower performance than Pandas* on user system.\
@@ -218,15 +219,15 @@ def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, column_loc,
218219
f' column_data_{i} = {df}._data[{type_id}][{col_id}]',
219220
f' for j in numpy.arange(res_index_len):',
220221
f' idx = argsorted_index[j] if {groupby_param_sort} else j',
221-
f' group_arr_{i} = _sdc_take(column_data_{i}, list({groupby_dict}[group_keys[idx]]))',
222+
f' group_arr_{i} = sdc_take(column_data_{i}, list({groupby_dict}[group_keys[idx]]))',
222223
f' group_series_{i} = pandas.Series(group_arr_{i})',
223224
f' result_data_{i}[j] = group_series_{i}.{func_name}({extra_impl_params})',
224225
]
225226

226227
data = ', '.join(f'\'{column_names[i]}\': result_data_{i}' for i in range(len(columns)))
227228
func_lines.extend(['\n'.join([
228229
f' if {groupby_param_sort}:',
229-
f' res_index = _sdc_take(group_keys, argsorted_index)',
230+
f' res_index = sdc_take(group_keys, argsorted_index)',
230231
f' else:',
231232
f' res_index = group_keys',
232233
f' return pandas.DataFrame({{{data}}}, index=res_index)'
@@ -236,7 +237,7 @@ def _sdc_pandas_groupby_generic_func_codegen(func_name, columns, column_loc,
236237
global_vars = {'pandas': pandas,
237238
'numpy': numpy,
238239
'_sdc_asarray': _sdc_asarray,
239-
'_sdc_take': _sdc_take,
240+
'sdc_take': nplike_take,
240241
'sdc_arrays_argsort': sdc_arrays_argsort}
241242

242243
return func_text, global_vars
@@ -262,11 +263,11 @@ def _sdc_pandas_series_groupby_generic_func_codegen(func_name, func_params, defa
262263
f' result_data = numpy.empty(res_index_len, dtype=res_dtype)',
263264
f' for j in numpy.arange(res_index_len):',
264265
f' idx = argsorted_index[j] if {groupby_param_sort} else j',
265-
f' group_arr = _sdc_take({series}._data, list({groupby_dict}[group_keys[idx]]))',
266+
f' group_arr = sdc_take({series}._data, list({groupby_dict}[group_keys[idx]]))',
266267
f' group_series = pandas.Series(group_arr)',
267268
f' result_data[j] = group_series.{func_name}({extra_impl_params})',
268269
f' if {groupby_param_sort}:',
269-
f' res_index = _sdc_take(group_keys, argsorted_index)',
270+
f' res_index = sdc_take(group_keys, argsorted_index)',
270271
f' else:',
271272
f' res_index = group_keys',
272273
f' return pandas.Series(data=result_data, index=res_index, name={series}._name)'
@@ -276,7 +277,7 @@ def _sdc_pandas_series_groupby_generic_func_codegen(func_name, func_params, defa
276277
global_vars = {'pandas': pandas,
277278
'numpy': numpy,
278279
'_sdc_asarray': _sdc_asarray,
279-
'_sdc_take': _sdc_take,
280+
'sdc_take': nplike_take,
280281
'sdc_arrays_argsort': sdc_arrays_argsort}
281282

282283
return func_text, global_vars

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 115 additions & 178 deletions
Large diffs are not rendered by default.

sdc/datatypes/hpat_pandas_stringmethods_functions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def hpat_pandas_stringmethods_upper_impl(self):
8989
from sdc.utilities.utils import sdc_overload_method, sdc_register_jitable
9090
from sdc.hiframes.api import get_nan_mask
9191
from sdc.str_arr_ext import str_arr_set_na_by_mask, create_str_arr_from_list
92-
from sdc.datatypes.common_functions import SDCLimitation
92+
from sdc.utilities.sdc_typing_utils import SDCLimitation
9393

9494

9595
@sdc_overload_method(StringMethodsType, 'center')

sdc/datatypes/indexes/__init__.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
# modules depend on each other, e.g. positional_index_type
28+
# needs range_index_type to be imported first, so the import order below matters
29+
from .range_index_type import RangeIndexType
30+
from .positional_index_type import PositionalIndexType
31+
from .empty_index_type import EmptyIndexType
32+
from .int64_index_type import Int64IndexType
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# -*- coding: utf-8 -*-
2+
# *****************************************************************************
3+
# Copyright (c) 2020, Intel Corporation All rights reserved.
4+
#
5+
# Redistribution and use in source and binary forms, with or without
6+
# modification, are permitted provided that the following conditions are met:
7+
#
8+
# Redistributions of source code must retain the above copyright notice,
9+
# this list of conditions and the following disclaimer.
10+
#
11+
# Redistributions in binary form must reproduce the above copyright notice,
12+
# this list of conditions and the following disclaimer in the documentation
13+
# and/or other materials provided with the distribution.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
# *****************************************************************************
27+
28+
from numba import types
29+
from numba.extending import (
30+
models,
31+
register_model,
32+
make_attribute_wrapper
33+
)
34+
35+
36+
class EmptyIndexType(types.Type):
37+
38+
# this index represents special case of pd.Index([]) with dtype='object'
39+
# for overload typing functions assume it has following dtype
40+
dtype = types.pyobject
41+
42+
def __init__(self, is_named=False):
43+
self.is_named = is_named
44+
super(EmptyIndexType, self).__init__(
45+
name='EmptyIndexType({})'.format(is_named))
46+
47+
48+
@register_model(EmptyIndexType)
49+
class EmptyIndexModel(models.StructModel):
50+
def __init__(self, dmm, fe_type):
51+
52+
name_type = types.unicode_type if fe_type.is_named else types.none
53+
members = [
54+
('name', name_type),
55+
]
56+
models.StructModel.__init__(self, dmm, fe_type, members)
57+
58+
59+
# FIXME_Numba#3372: add into numba.types to allow returning from objmode
60+
types.EmptyIndexType = EmptyIndexType
61+
62+
63+
make_attribute_wrapper(EmptyIndexType, 'name', '_name')
File renamed without changes.
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# -*- coding: utf-8 -*-
2+
# *****************************************************************************
3+
# Copyright (c) 2020, Intel Corporation All rights reserved.
4+
#
5+
# Redistribution and use in source and binary forms, with or without
6+
# modification, are permitted provided that the following conditions are met:
7+
#
8+
# Redistributions of source code must retain the above copyright notice,
9+
# this list of conditions and the following disclaimer.
10+
#
11+
# Redistributions in binary form must reproduce the above copyright notice,
12+
# this list of conditions and the following disclaimer in the documentation
13+
# and/or other materials provided with the distribution.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
# *****************************************************************************
27+
28+
from numba import types
29+
from numba.extending import (
30+
models,
31+
register_model,
32+
make_attribute_wrapper
33+
)
34+
35+
from sdc.datatypes.indexes import RangeIndexType
36+
37+
38+
class PositionalIndexType(types.IterableType):
39+
dtype = types.int64
40+
41+
def __init__(self, is_named=False):
42+
self.data = RangeIndexType(is_named)
43+
self.is_named = is_named
44+
super(PositionalIndexType, self).__init__(
45+
name='PositionalIndexType({})'.format(is_named))
46+
47+
@property
48+
def iterator_type(self):
49+
res = self.data.iterator_type
50+
return res
51+
52+
53+
@register_model(PositionalIndexType)
54+
class PositionalIndexModel(models.StructModel):
55+
def __init__(self, dmm, fe_type):
56+
57+
members = [
58+
('data', fe_type.data),
59+
]
60+
models.StructModel.__init__(self, dmm, fe_type, members)
61+
62+
63+
# FIXME_Numba#3372: add into numba.types to allow returning from objmode
64+
types.PositionalIndexType = PositionalIndexType
65+
66+
67+
make_attribute_wrapper(PositionalIndexType, 'data', '_data')
File renamed without changes.

0 commit comments

Comments
 (0)