4545import sdc
4646from sdc .hiframes .api import isna
4747from sdc .hiframes .pd_series_type import SeriesType
48- from sdc .str_arr_type import string_array_type
48+ from sdc .functions import numpy_like
49+ from sdc .str_arr_type import string_array_type , StringArrayType
50+ from sdc .datatypes .range_index_type import RangeIndexType
4951from sdc .str_arr_ext import (num_total_chars , append_string_array_to ,
5052 str_arr_is_na , pre_alloc_string_array , str_arr_set_na , string_array_type ,
5153 cp_str_list_to_array , create_str_arr_from_list , get_utf8_size ,
@@ -69,15 +71,18 @@ def hpat_arrays_append(A, B):
6971def hpat_arrays_append_overload (A , B ):
7072 """Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A"""
7173
72- if isinstance (A , types .Array ):
73- if isinstance (B , types .Array ):
74+ A_is_range_index = isinstance (A , RangeIndexType )
75+ B_is_range_index = isinstance (B , RangeIndexType )
76+ if isinstance (A , (types .Array , RangeIndexType )):
77+ if isinstance (B , (types .Array , RangeIndexType )):
7478 def _append_single_numeric_impl (A , B ):
75- return numpy .concatenate ((A , B ,))
79+ _A = A .values if A_is_range_index == True else A # noqa
80+ _B = B .values if B_is_range_index == True else B # noqa
81+ return numpy .concatenate ((_A , _B ,))
7682
7783 return _append_single_numeric_impl
78- elif isinstance (B , (types .UniTuple , types .List )):
79- # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
80- # to resolve common dtype of heterogeneous sequence of arrays
84+ elif isinstance (B , (types .UniTuple , types .List )) and isinstance (B .dtype , (types .Array , RangeIndexType )):
85+ B_dtype_is_range_index = isinstance (B .dtype , RangeIndexType )
8186 numba_common_dtype = find_common_dtype_from_numpy_dtypes ([A .dtype , B .dtype .dtype ], [])
8287
8388 # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
@@ -87,11 +92,13 @@ def _append_list_numeric_impl(A, B):
8792 new_data = numpy .empty (total_length , numba_common_dtype )
8893
8994 stop = len (A )
90- new_data [:stop ] = A
95+ _A = numpy .array (A ) if A_is_range_index == True else A # noqa
96+ new_data [:stop ] = _A
9197 for arr in B :
98+ _arr = numpy .array (arr ) if B_dtype_is_range_index == True else arr # noqa
9299 start = stop
93- stop = start + len (arr )
94- new_data [start :stop ] = arr
100+ stop = start + len (_arr )
101+ new_data [start :stop ] = _arr
95102 return new_data
96103
97104 return _append_list_numeric_impl
@@ -210,9 +217,41 @@ def sdc_join_series_indexes(left, right):
210217def sdc_join_series_indexes_overload (left , right ):
211218 """Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm"""
212219
213- # TODO: eliminate code duplication by merging implementations for numeric and StringArray
214- # requires equivalents of numpy.arsort and _hpat_ensure_array_capacity for StringArrays
215- if (isinstance (left , types .Array ) and isinstance (right , types .Array )):
220+ # check that both operands are of types used for representing Pandas indexes
221+ if not (isinstance (left , (types .Array , StringArrayType , RangeIndexType ))
222+ and isinstance (right , (types .Array , StringArrayType , RangeIndexType ))):
223+ return None
224+
225+ convert_left = isinstance (left , RangeIndexType )
226+ convert_right = isinstance (right , RangeIndexType )
227+
228+ def _convert_to_arrays_impl (left , right ):
229+ _left = left .values if convert_left == True else left # noqa
230+ _right = right .values if convert_right == True else right # noqa
231+ return sdc_join_series_indexes (_left , _right )
232+
233+ if isinstance (left , RangeIndexType ) and isinstance (right , RangeIndexType ):
234+
235+ def sdc_join_range_indexes_impl (left , right ):
236+ if (left is right or numpy_like .array_equal (left , right )):
237+ joined = left .values
238+ lidx = numpy .arange (len (joined ))
239+ ridx = lidx
240+ return joined , lidx , ridx
241+ else :
242+ return sdc_join_series_indexes (left .values , right .values )
243+
244+ return sdc_join_range_indexes_impl
245+
246+ elif isinstance (left , RangeIndexType ) and isinstance (right , types .Array ):
247+ return _convert_to_arrays_impl
248+
249+ elif isinstance (left , types .Array ) and isinstance (right , RangeIndexType ):
250+ return _convert_to_arrays_impl
251+
252+ # TODO: remove code duplication below and merge numeric and StringArray impls into one
253+ # needs equivalents of numpy.arsort and _hpat_ensure_array_capacity for StringArrays
254+ elif isinstance (left , types .Array ) and isinstance (right , types .Array ):
216255
217256 numba_common_dtype = find_common_dtype_from_numpy_dtypes ([left .dtype , right .dtype ], [])
218257 if isinstance (numba_common_dtype , types .Number ):
@@ -321,8 +360,6 @@ def sdc_join_series_indexes_impl(left, right):
321360 return sdc_join_series_indexes_impl
322361
323362 else :
324- # TODO: support joining indexes with common dtype=object - requires Numba
325- # support of such numpy arrays in nopython mode, for now just return None
326363 return None
327364
328365 elif (left == string_array_type and right == string_array_type ):
@@ -440,34 +477,6 @@ def sdc_join_series_indexes_impl(left, right):
440477 return None
441478
442479
443- def sdc_check_indexes_equal (left , right ):
444- pass
445-
446-
447- @sdc_overload (sdc_check_indexes_equal , jit_options = {'parallel' : False })
448- def sdc_check_indexes_equal_overload (A , B ):
449- """Function for checking arrays A and B of the same type are equal"""
450-
451- if isinstance (A , types .Array ):
452- def sdc_check_indexes_equal_numeric_impl (A , B ):
453- return numpy .array_equal (A , B )
454- return sdc_check_indexes_equal_numeric_impl
455-
456- elif A == string_array_type :
457- def sdc_check_indexes_equal_string_impl (A , B ):
458- # TODO: replace with StringArrays comparison
459- is_index_equal = (len (A ) == len (B )
460- and num_total_chars (A ) == num_total_chars (B ))
461- for i in numpy .arange (len (A )):
462- if (A [i ] != B [i ]
463- or str_arr_is_na (A , i ) is not str_arr_is_na (B , i )):
464- return False
465-
466- return is_index_equal
467-
468- return sdc_check_indexes_equal_string_impl
469-
470-
471480@numba .njit
472481def _sdc_pandas_format_percentiles (arr ):
473482 """ Function converting float array of percentiles to a list of strings formatted
@@ -606,9 +615,16 @@ def _sdc_take(data, indexes):
606615 pass
607616
608617
609- @sdc_overload (_sdc_take , jit_options = { 'parallel' : True } )
618+ @sdc_overload (_sdc_take )
610619def _sdc_take_overload (data , indexes ):
611- if isinstance (indexes .dtype , types .ListType ) and isinstance (data , (types .Array , types .List )):
620+
621+ if not isinstance (data , (types .Array , StringArrayType , RangeIndexType )):
622+ return None
623+ if not (isinstance (indexes , (types .Array , types .List ))
624+ and isinstance (indexes .dtype , (types .Integer , types .ListType ))):
625+ return None
626+
627+ if isinstance (indexes .dtype , types .ListType ) and isinstance (data , (types .Array , types .List , RangeIndexType )):
612628 arr_dtype = data .dtype
613629
614630 def _sdc_take_list_impl (data , indexes ):
@@ -661,7 +677,7 @@ def _sdc_take_list_str_impl(data, indexes):
661677
662678 return _sdc_take_list_str_impl
663679
664- elif isinstance (data , types .Array ):
680+ elif isinstance (data , ( types .Array , RangeIndexType ) ):
665681 arr_dtype = data .dtype
666682
667683 def _sdc_take_array_impl (data , indexes ):
@@ -673,7 +689,7 @@ def _sdc_take_array_impl(data, indexes):
673689
674690 return _sdc_take_array_impl
675691
676- elif data == string_array_type :
692+ elif isinstance ( data , StringArrayType ) :
677693 def _sdc_take_str_arr_impl (data , indexes ):
678694 res_size = len (indexes )
679695 nan_mask = numpy .zeros (res_size , dtype = numpy .bool_ )
@@ -693,24 +709,6 @@ def _sdc_take_str_arr_impl(data, indexes):
693709
694710 return _sdc_take_str_arr_impl
695711
696- elif (isinstance (data , types .RangeType ) and isinstance (data .dtype , types .Integer )):
697- arr_dtype = data .dtype
698-
699- def _sdc_take_array_impl (data , indexes ):
700- res_size = len (indexes )
701- index_errors = 0
702- res_arr = numpy .empty (res_size , dtype = arr_dtype )
703- for i in numba .prange (res_size ):
704- value = data .start + data .step * indexes [i ]
705- if value >= data .stop :
706- index_errors += 1
707- res_arr [i ] = value
708- if index_errors :
709- raise IndexError ("_sdc_take: index out-of-bounds" )
710- return res_arr
711-
712- return _sdc_take_array_impl
713-
714712 return None
715713
716714
@@ -741,16 +739,19 @@ def sdc_reindex_series(arr, index, name, by_index):
741739def sdc_reindex_series_overload (arr , index , name , by_index ):
742740 """ Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """
743741
744- same_index_types = index is by_index
742+ range_indexes = isinstance ( index , RangeIndexType ) and isinstance ( by_index , RangeIndexType )
745743 data_dtype , index_dtype = arr .dtype , index .dtype
746744 data_is_str_arr = isinstance (arr .dtype , types .UnicodeType )
747745
748746 def sdc_reindex_series_impl (arr , index , name , by_index ):
749747
750- # if index types are the same, we may not reindex if indexes are the same
751- if same_index_types == True : # noqa
752- if index is by_index :
753- return pandas .Series (data = arr , index = index , name = name )
748+ # no reindexing is needed if indexes are equal
749+ if range_indexes == True : # noqa
750+ equal_indexes = numpy_like .array_equal (index , by_index )
751+ else :
752+ equal_indexes = False
753+ if (index is by_index or equal_indexes ):
754+ return pandas .Series (data = arr , index = by_index , name = name )
754755
755756 if data_is_str_arr == True : # noqa
756757 _res_data = ['' ] * len (by_index )
@@ -771,7 +772,8 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
771772 map_index_to_position [value ] = i
772773
773774 index_mismatch = 0
774- for i in numba .prange (len (by_index )):
775+ # FIXME: TypingError in parfor step (wrong promotion to float64?) if prange is used
776+ for i in numpy .arange (len (by_index )):
775777 if by_index [i ] in map_index_to_position :
776778 pos_in_self = map_index_to_position [by_index [i ]]
777779 _res_data [i ] = arr [pos_in_self ]
0 commit comments