Df.loc impl (#788)

1e-to · web-flow · commit 669443c3d974 · 2020-04-21T21:32:29.000+03:00
I'm ok with this for now. We'll return to it later
diff --git a/examples/dataframe/dataframe_loc.py b/examples/dataframe/dataframe_loc.py
@@ -0,0 +1,45 @@
+# *****************************************************************************
+# Copyright (c) 2020, Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     Redistributions of source code must retain the above copyright notice,
+#     this list of conditions and the following disclaimer.
+#
+#     Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions and the following disclaimer in the documentation
+#     and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+"""
+   Expected result:
+         A  B  C
+    2  3.0  6  2
+"""
+
+import pandas as pd
+from numba import njit
+
+
+@njit
+def dataframe_loc():
+    # Build a four-row DataFrame without an explicit index and select by label with .loc.
+    df = pd.DataFrame({'A': [1.0, 2.0, 3.0, 1.0], 'B': [4, 5, 6, 7], 'C': [4, 5, 2, 1]})
+
+    # Per the expected output in the module docstring, label 2 selects the third row
+    # and the result is printed as a one-row DataFrame.
+    return df.loc[2]
+
+
+# Run the jitted example and print the selected row.
+print(dataframe_loc())
diff --git a/sdc/datatypes/common_functions.py b/sdc/datatypes/common_functions.py
@@ -596,8 +596,60 @@ def _sdc_take(data, indexes):
 
 @sdc_overload(_sdc_take, jit_options={'parallel': True})
 def _sdc_take_overload(data, indexes):
+    if isinstance(indexes.dtype, types.ListType) and isinstance(data, (types.Array, types.List)):
+        arr_dtype = data.dtype
+
+        def _sdc_take_list_impl(data, indexes):
+            # indexes is a list of index lists; the result holds data[...] for every
+            # position of indexes[0], then indexes[1], and so on, in order.
+            res_size = 0
+            for i in numba.prange(len(indexes)):
+                res_size += len(indexes[i])
+            res_arr = numpy.empty(res_size, dtype=arr_dtype)
+            for i in numba.prange(len(indexes)):
+                # The output offset of sub-list i is the total length of the sub-lists
+                # before it. Iterate range(i) directly: slicing indexes[0:i] only to
+                # take its length built a throwaway copy on every iteration.
+                start = 0
+                for k in range(i):
+                    start += len(indexes[k])
+                current_pos = start
+                for j in range(len(indexes[i])):
+                    res_arr[current_pos] = data[indexes[i][j]]
+                    current_pos += 1
+            return res_arr
+
+        return _sdc_take_list_impl
+
+    elif isinstance(indexes.dtype, types.ListType) and data == string_array_type:
+        def _sdc_take_list_str_impl(data, indexes):
+            # String-array variant of the list-of-lists take: the result holds
+            # data[...] for every position of indexes[0], then indexes[1], and so on.
+            res_size = 0
+            for i in numba.prange(len(indexes)):
+                res_size += len(indexes[i])
+            nan_mask = numpy.zeros(res_size, dtype=numpy.bool_)
+            num_total_bytes = 0
+            # First pass: sum the UTF-8 size of every taken string (needed to
+            # pre-allocate the result) and remember which output slots are NA.
+            for i in numba.prange(len(indexes)):
+                # The output offset of sub-list i is the total length of the sub-lists
+                # before it. Iterate range(i) directly: slicing indexes[0:i] only to
+                # take its length built a throwaway copy on every iteration.
+                start = 0
+                for k in range(i):
+                    start += len(indexes[k])
+                current_pos = start
+                for j in range(len(indexes[i])):
+                    num_total_bytes += get_utf8_size(data[indexes[i][j]])
+                    if isna(data, indexes[i][j]):
+                        nan_mask[current_pos] = True
+                    current_pos += 1
+            res_arr = pre_alloc_string_array(res_size, num_total_bytes)
+            # Second pass: copy the strings and re-apply the NA flags recorded above.
+            for i in numba.prange(len(indexes)):
+                start = 0
+                for k in range(i):
+                    start += len(indexes[k])
+                current_pos = start
+                for j in range(len(indexes[i])):
+                    res_arr[current_pos] = data[indexes[i][j]]
+                    if nan_mask[current_pos]:
+                        str_arr_set_na(res_arr, current_pos)
+                    current_pos += 1
+
+            return res_arr
+
+        return _sdc_take_list_str_impl
 
-    if isinstance(data, types.Array):
+    elif isinstance(data, types.Array):
         arr_dtype = data.dtype
 
         def _sdc_take_array_impl(data, indexes):
diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -64,6 +64,7 @@
 from sdc.functions.numpy_like import getitem_by_mask
 from sdc.datatypes.common_functions import _sdc_take, sdc_reindex_series
 from sdc.utilities.prange_utils import parallel_chunks
+from sdc.functions.numpy_like import find_idx
 
 
 @sdc_overload_attribute(DataFrameType, 'index')
@@ -1876,6 +1877,58 @@ def _df_getitem_unicode_idx_impl(self, idx):
     ty_checker.raise_exc(idx, expected_types, 'idx')
 
 
+def df_getitem_single_label_loc_codegen(self, idx):
+    """
+    Build the source text of the DataFrame.loc[<single label>] implementation.
+
+    ``self`` is the DataFrame type whose ``index`` and ``columns`` drive the generated
+    code; ``idx`` is accepted for signature compatibility and is not inspected here.
+    Returns a ``(func_text, global_vars)`` pair: the generated function source and
+    the globals it must be executed with.
+
+    Example of generated implementation:
+        def _df_getitem_single_label_loc_impl(self, idx):
+            idx_list = find_idx(self._dataframe._index, idx)
+            data_0 = _sdc_take(self._dataframe._data[0], idx_list)
+            res_data_0 = pandas.Series(data_0)
+            data_1 = _sdc_take(self._dataframe._data[1], idx_list)
+            res_data_1 = pandas.Series(data_1)
+            if len(idx_list) < 1:
+                raise KeyError('Index is not in the DataFrame')
+            new_index = _sdc_take(self._dataframe._index, idx_list)
+            return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=new_index)
+    """
+    if isinstance(self.index, types.NoneType):
+        # Default index: the label is used directly as the row position.
+        # NOTE(review): idx_list always has length 1 on this path, so the emptiness
+        # check emitted below can never fire for it - confirm that out-of-range
+        # labels are rejected elsewhere.
+        fill_list_text = '  idx_list =  numpy.array([idx])'
+        new_index_text = '  new_index = numpy.array([idx])'
+    else:
+        # Explicit index: look up every position whose index value equals the label.
+        fill_list_text = '  idx_list = find_idx(self._dataframe._index, idx)'
+        new_index_text = '  new_index = _sdc_take(self._dataframe._index, idx_list)'
+
+    func_lines = ['def _df_getitem_single_label_loc_impl(self, idx):',
+                  fill_list_text]
+
+    # One take + Series wrap per column; remember (column name, series variable).
+    results = []
+    for i, col in enumerate(self.columns):
+        data_name = f'data_{i}'
+        series_name = f'res_data_{i}'
+        func_lines += [f'  {data_name} = _sdc_take(self._dataframe._data[{i}], idx_list)',
+                       f'  {series_name} = pandas.Series({data_name})']
+        results.append((col, series_name))
+
+    func_lines += ['  if len(idx_list) < 1:',
+                   "    raise KeyError('Index is not in the DataFrame')"]
+
+    df_items = ', '.join(f'"{col}": {series_name}' for col, series_name in results)
+    func_lines += [new_index_text,
+                   f'  return pandas.DataFrame({{{df_items}}}, index=new_index)']
+
+    func_text = '\n'.join(func_lines)
+    global_vars = {'pandas': pandas, 'numpy': numpy,
+                   'numba': numba,
+                   '_sdc_take': _sdc_take,
+                   'find_idx': find_idx,
+                   'KeyError': KeyError}
+
+    return func_text, global_vars
+
+
 def df_getitem_int_iloc_codegen(self, idx):
     """
     Example of generated implementation:
@@ -2010,6 +2063,9 @@ def _df_getitem_list_bool_iloc_impl(self, idx):
     return func_text, global_vars
 
 
+# Implementation generator for DataFrame.loc[<single label>]: pairs the codegen
+# function with the name of the function defined in its generated source.
+gen_df_getitem_loc_single_label_impl = gen_impl_generator(
+    df_getitem_single_label_loc_codegen, '_df_getitem_single_label_loc_impl')
+
 gen_df_getitem_iloc_int_impl = gen_impl_generator(
     df_getitem_int_iloc_codegen, '_df_getitem_int_iloc_impl')
 
@@ -2030,6 +2086,13 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx):
 
     accessor = self.accessor.literal_value
 
+    if accessor == 'loc':
+        if isinstance(idx, (types.Integer, types.UnicodeType, types.StringLiteral)):
+            return gen_df_getitem_loc_single_label_impl(self.dataframe, idx)
+
+        ty_checker = TypeChecker('Attribute loc().')
+        ty_checker.raise_exc(idx, 'int or str', 'idx')
+
     if accessor == 'iat':
         if isinstance(idx, types.Tuple) and isinstance(idx[1], types.Literal):
             col = idx[1].literal_value
@@ -2181,6 +2244,57 @@ def sdc_pandas_dataframe_iat_impl(self):
     return sdc_pandas_dataframe_iat_impl
 
 
+@sdc_overload_attribute(DataFrameType, 'loc')
+def sdc_pandas_dataframe_loc(self):
+    """
+    Intel Scalable Dataframe Compiler User Guide
+    ********************************************
+
+    Pandas API: pandas.DataFrame.loc
+
+    Limitations
+    -----------
+    - ``loc`` always returns a DataFrame, even when a single row matches.
+    - Parameter ``idx`` must be a single label (int or str), e.g. :obj:`df.loc['A']`.
+
+    Examples
+    --------
+    .. literalinclude:: ../../../examples/dataframe/dataframe_loc.py
+       :language: python
+       :lines: 34-
+       :caption: Access a group of rows and columns by label(s) or a boolean array.
+       :name: ex_dataframe_loc
+
+    .. command-output:: python ./dataframe/dataframe_loc.py
+       :cwd: ../../../examples
+
+    .. seealso::
+        :ref:`DataFrame.at <pandas.DataFrame.at>`
+            Access a single value for a row/column label pair.
+        :ref:`DataFrame.iloc <pandas.DataFrame.iloc>`
+            Access group of rows and columns by integer position(s).
+        :ref:`DataFrame.xs <pandas.DataFrame.xs>`
+            Returns a cross-section (row(s) or column(s)) from the Series/DataFrame.
+        :ref:`Series.loc <pandas.Series.loc>`
+            Access group of values using labels.
+
+    Intel Scalable Dataframe Compiler Developer Guide
+    *************************************************
+    Pandas DataFrame method :meth:`pandas.DataFrame.loc` implementation.
+
+    .. only:: developer
+        Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_df_loc*
+    """
+
+    ty_checker = TypeChecker('Attribute loc().')
+    ty_checker.check(self, DataFrameType)
+
+    # The attribute itself only builds an accessor tagged 'loc'; the actual row
+    # selection happens when that accessor is indexed (see the accessor getitem
+    # overload, which dispatches on the accessor name).
+    def sdc_pandas_dataframe_loc_impl(self):
+        return sdc.datatypes.hpat_pandas_dataframe_getitem_types.dataframe_getitem_accessor_init(self, 'loc')
+
+    return sdc_pandas_dataframe_loc_impl
+
+
 @sdc_overload_method(DataFrameType, 'pct_change')
 def pct_change_overload(df, periods=1, fill_method='pad', limit=None, freq=None):
     """
diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py
@@ -41,6 +41,7 @@
 from numba import types, jit, prange, numpy_support, literally
 from numba.errors import TypingError
 from numba.targets.arraymath import get_isnan
+from numba.typed import List
 
 import sdc
 from sdc.utilities.sdc_typing_utils import TypeChecker
@@ -710,6 +711,26 @@ def dropna_impl(arr, idx, name):
     return dropna_impl
 
 
+def find_idx(arr, idx):
+    """Stub that returns None in plain Python; the compiled behavior comes from the overload registered for it."""
+    pass
+
+
+@sdc_overload(find_idx)
+def find_idx_overload(arr, idx):
+    """
+    Overload of find_idx: collect the positions ``j`` for which ``arr[j] == idx``.
+
+    The implementation returns a list with one typed list of int64 positions per
+    chunk produced by ``parallel_chunks(len(arr))``; the result is not flattened.
+    """
+    def find_idx_impl(arr, idx):
+        chunks = parallel_chunks(len(arr))
+        # One typed list per chunk, so each prange iteration appends only to its own list.
+        new_arr = [List.empty_list(types.int64) for i in range(len(chunks))]
+        for i in prange(len(chunks)):
+            chunk = chunks[i]
+            for j in range(chunk.start, chunk.stop):
+                if arr[j] == idx:
+                    new_arr[i].append(j)
+
+        return new_arr
+
+    return find_idx_impl
+
+
 def nanmean(a):
     pass
 
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
@@ -1197,6 +1197,40 @@ def test_impl(df):
         msg = 'Index is out of bounds for axis'
         self.assertIn(msg, str(raises.exception))
 
+    def test_df_loc(self):
+        # df.loc[label] on an explicit integer index in which the label occurs twice.
+        def test_impl(df):
+            return df.loc[4]
+
+        columns = {"A": [3.2, 4.4, 7.0, 3.3, 1.0],
+                   "B": [3, 4, 1, 0, 222],
+                   "C": [3.1, 8.4, 7.1, 3.2, 1]}
+        df = pd.DataFrame(columns, index=[3, 4, 1, 4, 0])
+        jitted = sdc.jit(test_impl)
+
+        actual = jitted(df)
+        expected = test_impl(df)
+        pd.testing.assert_frame_equal(actual, expected)
+
+    @unittest.skip("SDC Dataframe.loc[] always return Dataframe")
+    def test_df_loc_str(self):
+        # df.loc[label] on a string index with a unique label.
+        def test_impl(df):
+            return df.loc['c']
+
+        sdc_func = sdc.jit(test_impl)
+        # All labels are plain ASCII and distinct. The fourth label used to be a
+        # Cyrillic look-alike of 'c', which is indistinguishable on screen.
+        idx = ['a', 'b', 'c', 'd', 'e']
+        df = pd.DataFrame({"A": ['3.2', '4.4', '7.0', '3.3', '1.0'],
+                           "B": ['3', '4', '1', '0', '222'],
+                           "C": ['3.1', '8.4', '7.1', '3.2', '1']}, index=idx)
+        pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
+
+    @unittest.skip("SDC Dataframe.loc[] always return Dataframe")
+    def test_df_loc_no_idx(self):
+        # df.loc[label] on a DataFrame created without an explicit index.
+        def test_impl(df):
+            return df.loc[2]
+
+        columns = {"A": [3.2, 4.4, 7.0, 3.3, 1.0],
+                   "B": [3, 4, 1, 0, 222],
+                   "C": [3.1, 8.4, 7.1, 3.2, 1]}
+        df = pd.DataFrame(columns)
+        jitted = sdc.jit(test_impl)
+
+        actual = jitted(df)
+        expected = test_impl(df)
+        pd.testing.assert_frame_equal(actual, expected)
+
     def test_df_head(self):
         def get_func(n):
             def impl(a):