Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 4c85598

Browse files
Adding support of pd.RangeIndex for Series and DFs (#862)
* Adding support of pd.RangeIndex for Series and DFs This PR: 1. Modifies boxing/unboxing of Series and DFs to handle pd.RangeIndex, 2. Adds fix_df_index to transform values of index argument of Series and DF ctor calls, which fixes RewriteDataFrame ctor now handling index=None as argument, 3. Adds iteration, operators (is, eq, ne) support for RangeIndexType, 4. Renames and refactors sdc_check_indexes_equal to numpy_like.array_equal, 5. Adds specializations for RangeIndexType in all Series/DF methods, such as operators, getitem, setitem and indexing related functions (sdc_join_series_indexes, sdc_reindex_series, etc).
1 parent 4cf1deb commit 4c85598

18 files changed

+1408
-571
lines changed

sdc/datatypes/common_functions.py

Lines changed: 73 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@
4545
import sdc
4646
from sdc.hiframes.api import isna
4747
from sdc.hiframes.pd_series_type import SeriesType
48-
from sdc.str_arr_type import string_array_type
48+
from sdc.functions import numpy_like
49+
from sdc.str_arr_type import string_array_type, StringArrayType
50+
from sdc.datatypes.range_index_type import RangeIndexType
4951
from sdc.str_arr_ext import (num_total_chars, append_string_array_to,
5052
str_arr_is_na, pre_alloc_string_array, str_arr_set_na, string_array_type,
5153
cp_str_list_to_array, create_str_arr_from_list, get_utf8_size,
@@ -69,15 +71,18 @@ def hpat_arrays_append(A, B):
6971
def hpat_arrays_append_overload(A, B):
7072
"""Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A"""
7173

72-
if isinstance(A, types.Array):
73-
if isinstance(B, types.Array):
74+
A_is_range_index = isinstance(A, RangeIndexType)
75+
B_is_range_index = isinstance(B, RangeIndexType)
76+
if isinstance(A, (types.Array, RangeIndexType)):
77+
if isinstance(B, (types.Array, RangeIndexType)):
7478
def _append_single_numeric_impl(A, B):
75-
return numpy.concatenate((A, B,))
79+
_A = A.values if A_is_range_index == True else A # noqa
80+
_B = B.values if B_is_range_index == True else B # noqa
81+
return numpy.concatenate((_A, _B,))
7682

7783
return _append_single_numeric_impl
78-
elif isinstance(B, (types.UniTuple, types.List)):
79-
# TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
80-
# to resolve common dtype of heterogeneous sequence of arrays
84+
elif isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, (types.Array, RangeIndexType)):
85+
B_dtype_is_range_index = isinstance(B.dtype, RangeIndexType)
8186
numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], [])
8287

8388
# TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
@@ -87,11 +92,13 @@ def _append_list_numeric_impl(A, B):
8792
new_data = numpy.empty(total_length, numba_common_dtype)
8893

8994
stop = len(A)
90-
new_data[:stop] = A
95+
_A = numpy.array(A) if A_is_range_index == True else A # noqa
96+
new_data[:stop] = _A
9197
for arr in B:
98+
_arr = numpy.array(arr) if B_dtype_is_range_index == True else arr # noqa
9299
start = stop
93-
stop = start + len(arr)
94-
new_data[start:stop] = arr
100+
stop = start + len(_arr)
101+
new_data[start:stop] = _arr
95102
return new_data
96103

97104
return _append_list_numeric_impl
@@ -210,9 +217,41 @@ def sdc_join_series_indexes(left, right):
210217
def sdc_join_series_indexes_overload(left, right):
211218
"""Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm"""
212219

213-
# TODO: eliminate code duplication by merging implementations for numeric and StringArray
214-
# requires equivalents of numpy.arsort and _hpat_ensure_array_capacity for StringArrays
215-
if (isinstance(left, types.Array) and isinstance(right, types.Array)):
220+
# check that both operands are of types used for representing Pandas indexes
221+
if not (isinstance(left, (types.Array, StringArrayType, RangeIndexType))
222+
and isinstance(right, (types.Array, StringArrayType, RangeIndexType))):
223+
return None
224+
225+
convert_left = isinstance(left, RangeIndexType)
226+
convert_right = isinstance(right, RangeIndexType)
227+
228+
def _convert_to_arrays_impl(left, right):
229+
_left = left.values if convert_left == True else left # noqa
230+
_right = right.values if convert_right == True else right # noqa
231+
return sdc_join_series_indexes(_left, _right)
232+
233+
if isinstance(left, RangeIndexType) and isinstance(right, RangeIndexType):
234+
235+
def sdc_join_range_indexes_impl(left, right):
236+
if (left is right or numpy_like.array_equal(left, right)):
237+
joined = left.values
238+
lidx = numpy.arange(len(joined))
239+
ridx = lidx
240+
return joined, lidx, ridx
241+
else:
242+
return sdc_join_series_indexes(left.values, right.values)
243+
244+
return sdc_join_range_indexes_impl
245+
246+
elif isinstance(left, RangeIndexType) and isinstance(right, types.Array):
247+
return _convert_to_arrays_impl
248+
249+
elif isinstance(left, types.Array) and isinstance(right, RangeIndexType):
250+
return _convert_to_arrays_impl
251+
252+
# TODO: remove code duplication below and merge numeric and StringArray impls into one
253+
# needs equivalents of numpy.argsort and _hpat_ensure_array_capacity for StringArrays
254+
elif isinstance(left, types.Array) and isinstance(right, types.Array):
216255

217256
numba_common_dtype = find_common_dtype_from_numpy_dtypes([left.dtype, right.dtype], [])
218257
if isinstance(numba_common_dtype, types.Number):
@@ -321,8 +360,6 @@ def sdc_join_series_indexes_impl(left, right):
321360
return sdc_join_series_indexes_impl
322361

323362
else:
324-
# TODO: support joining indexes with common dtype=object - requires Numba
325-
# support of such numpy arrays in nopython mode, for now just return None
326363
return None
327364

328365
elif (left == string_array_type and right == string_array_type):
@@ -440,34 +477,6 @@ def sdc_join_series_indexes_impl(left, right):
440477
return None
441478

442479

443-
def sdc_check_indexes_equal(left, right):
444-
pass
445-
446-
447-
@sdc_overload(sdc_check_indexes_equal, jit_options={'parallel': False})
448-
def sdc_check_indexes_equal_overload(A, B):
449-
"""Function for checking arrays A and B of the same type are equal"""
450-
451-
if isinstance(A, types.Array):
452-
def sdc_check_indexes_equal_numeric_impl(A, B):
453-
return numpy.array_equal(A, B)
454-
return sdc_check_indexes_equal_numeric_impl
455-
456-
elif A == string_array_type:
457-
def sdc_check_indexes_equal_string_impl(A, B):
458-
# TODO: replace with StringArrays comparison
459-
is_index_equal = (len(A) == len(B)
460-
and num_total_chars(A) == num_total_chars(B))
461-
for i in numpy.arange(len(A)):
462-
if (A[i] != B[i]
463-
or str_arr_is_na(A, i) is not str_arr_is_na(B, i)):
464-
return False
465-
466-
return is_index_equal
467-
468-
return sdc_check_indexes_equal_string_impl
469-
470-
471480
@numba.njit
472481
def _sdc_pandas_format_percentiles(arr):
473482
""" Function converting float array of percentiles to a list of strings formatted
@@ -606,9 +615,16 @@ def _sdc_take(data, indexes):
606615
pass
607616

608617

609-
@sdc_overload(_sdc_take, jit_options={'parallel': True})
618+
@sdc_overload(_sdc_take)
610619
def _sdc_take_overload(data, indexes):
611-
if isinstance(indexes.dtype, types.ListType) and isinstance(data, (types.Array, types.List)):
620+
621+
if not isinstance(data, (types.Array, StringArrayType, RangeIndexType)):
622+
return None
623+
if not (isinstance(indexes, (types.Array, types.List))
624+
and isinstance(indexes.dtype, (types.Integer, types.ListType))):
625+
return None
626+
627+
if isinstance(indexes.dtype, types.ListType) and isinstance(data, (types.Array, types.List, RangeIndexType)):
612628
arr_dtype = data.dtype
613629

614630
def _sdc_take_list_impl(data, indexes):
@@ -661,7 +677,7 @@ def _sdc_take_list_str_impl(data, indexes):
661677

662678
return _sdc_take_list_str_impl
663679

664-
elif isinstance(data, types.Array):
680+
elif isinstance(data, (types.Array, RangeIndexType)):
665681
arr_dtype = data.dtype
666682

667683
def _sdc_take_array_impl(data, indexes):
@@ -673,7 +689,7 @@ def _sdc_take_array_impl(data, indexes):
673689

674690
return _sdc_take_array_impl
675691

676-
elif data == string_array_type:
692+
elif isinstance(data, StringArrayType):
677693
def _sdc_take_str_arr_impl(data, indexes):
678694
res_size = len(indexes)
679695
nan_mask = numpy.zeros(res_size, dtype=numpy.bool_)
@@ -693,24 +709,6 @@ def _sdc_take_str_arr_impl(data, indexes):
693709

694710
return _sdc_take_str_arr_impl
695711

696-
elif (isinstance(data, types.RangeType) and isinstance(data.dtype, types.Integer)):
697-
arr_dtype = data.dtype
698-
699-
def _sdc_take_array_impl(data, indexes):
700-
res_size = len(indexes)
701-
index_errors = 0
702-
res_arr = numpy.empty(res_size, dtype=arr_dtype)
703-
for i in numba.prange(res_size):
704-
value = data.start + data.step * indexes[i]
705-
if value >= data.stop:
706-
index_errors += 1
707-
res_arr[i] = value
708-
if index_errors:
709-
raise IndexError("_sdc_take: index out-of-bounds")
710-
return res_arr
711-
712-
return _sdc_take_array_impl
713-
714712
return None
715713

716714

@@ -741,16 +739,19 @@ def sdc_reindex_series(arr, index, name, by_index):
741739
def sdc_reindex_series_overload(arr, index, name, by_index):
742740
""" Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """
743741

744-
same_index_types = index is by_index
742+
range_indexes = isinstance(index, RangeIndexType) and isinstance(by_index, RangeIndexType)
745743
data_dtype, index_dtype = arr.dtype, index.dtype
746744
data_is_str_arr = isinstance(arr.dtype, types.UnicodeType)
747745

748746
def sdc_reindex_series_impl(arr, index, name, by_index):
749747

750-
# if index types are the same, we may not reindex if indexes are the same
751-
if same_index_types == True: # noqa
752-
if index is by_index:
753-
return pandas.Series(data=arr, index=index, name=name)
748+
# no reindexing is needed if indexes are equal
749+
if range_indexes == True: # noqa
750+
equal_indexes = numpy_like.array_equal(index, by_index)
751+
else:
752+
equal_indexes = False
753+
if (index is by_index or equal_indexes):
754+
return pandas.Series(data=arr, index=by_index, name=name)
754755

755756
if data_is_str_arr == True: # noqa
756757
_res_data = [''] * len(by_index)
@@ -771,7 +772,8 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
771772
map_index_to_position[value] = i
772773

773774
index_mismatch = 0
774-
for i in numba.prange(len(by_index)):
775+
# FIXME: TypingError in parfor step (wrong promotion to float64?) if prange is used
776+
for i in numpy.arange(len(by_index)):
775777
if by_index[i] in map_index_to_position:
776778
pos_in_self = map_index_to_position[by_index[i]]
777779
_res_data[i] = arr[pos_in_self]

0 commit comments

Comments
 (0)