Df reset index (#656)

Rubtsowa · web-flow · commit ea2cbe4f2d1d · 2020-04-09T10:28:03.000+03:00
diff --git a/examples/dataframe/dataframe_reset_index_drop_False.py b/examples/dataframe/dataframe_reset_index_drop_False.py
@@ -0,0 +1,47 @@
+# *****************************************************************************
+# Copyright (c) 2020, Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     Redistributions of source code must retain the above copyright notice,
+#     this list of conditions and the following disclaimer.
+#
+#     Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions and the following disclaimer in the documentation
+#     and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# result DataFrame
+#    index   A
+# 0      5  14
+# 1      2   4
+# 2    -11   5
+# 3      0   4
+# 4     13   1
+# 5      9  55
+
+import pandas as pd
+from numba import njit
+
+
+@njit
+def dataframe_reset_index():
+    df = pd.DataFrame({"A": [14, 4, 5, 4, 1, 55]}, index=[5, 2, -11, 0, 13, 9])
+    # drop=False: the old index is kept as the first column, named "index"
+    return df.reset_index(drop=False)
+
+
+print(dataframe_reset_index())
diff --git a/examples/dataframe/dataframe_reset_index_drop_True.py b/examples/dataframe/dataframe_reset_index_drop_True.py
@@ -0,0 +1,47 @@
+# *****************************************************************************
+# Copyright (c) 2020, Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     Redistributions of source code must retain the above copyright notice,
+#     this list of conditions and the following disclaimer.
+#
+#     Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions and the following disclaimer in the documentation
+#     and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# result DataFrame
+#     A
+# 0  14
+# 1   4
+# 2   5
+# 3   4
+# 4   1
+# 5  55
+
+import pandas as pd
+from numba import njit
+
+
+@njit
+def dataframe_reset_index():
+    df = pd.DataFrame({"A": [14, 4, 5, 4, 1, 55]}, index=[5, 2, -11, 0, 13, 9])
+    # drop=True: the old index is discarded; only column "A" remains
+    return df.reset_index(drop=True)
+
+
+print(dataframe_reset_index())
diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -2344,3 +2344,169 @@ def _df_set_column_unicode_key_impl(self, key, value):
 
     ty_checker = TypeChecker('Method _set_column().')
     ty_checker.raise_exc(key, 'str', 'key')
+
+
+def sdc_pandas_dataframe_reset_index_codegen(drop, all_params, columns):
+    """
+    Generate source text of DataFrame.reset_index() for a literal `drop` (read via drop.literal_value).
+
+    Returns the tuple (func_text, global_vars); func_text defines _df_reset_index_impl and is meant for exec().
+    Column names are embedded with repr(), so names holding quotes or backslashes stay valid source.
+
+    Example of generated implementation:
+        def _df_reset_index_impl(self, level=None, drop=False, inplace=False, col_level=0, col_fill=""):
+          old_index = self.index
+          result_0 = get_dataframe_data(self, 0)
+          result_1 = get_dataframe_data(self, 1)
+          return pandas.DataFrame({'index': old_index, 'A': result_0, 'B': result_1})
+    """
+    df = all_params[0]
+    func_lines = [f'def _df_reset_index_impl({", ".join(all_params)}):']
+    result_name = []
+    if not drop.literal_value:
+        # keep the old index as the first column of the result, under the name "index"
+        func_lines.append(f'  old_index = {df}.index')
+        result_name.append(('old_index', 'index'))
+    for i, c in enumerate(columns):
+        func_lines.append(f'  result_{i} = get_dataframe_data({df}, {i})')
+        result_name.append((f'result_{i}', c))
+    data = ', '.join(f'{str(column_name)!r}: {column}' for column, column_name in result_name)
+    func_lines.append(f'  return pandas.DataFrame({{{data}}})')
+    func_text = '\n'.join(func_lines)
+
+    global_vars = {'pandas': pandas,
+                   'numpy': numpy,
+                   'get_dataframe_data': get_dataframe_data}
+
+    return func_text, global_vars
+
+
+def sdc_pandas_dataframe_reset_index_impl(self, drop=False):
+    """Exec the source generated for a literal `drop` and return the resulting _df_reset_index_impl."""
+    signature_params = ['self', 'level=None', 'drop=False', 'inplace=False', 'col_level=0', 'col_fill=""']
+    source, namespace = sdc_pandas_dataframe_reset_index_codegen(drop, signature_params, self.columns)
+
+    compiled = {}
+    exec(source, namespace, compiled)
+
+    return compiled['_df_reset_index_impl']
+
+
+def sdc_pandas_dataframe_reset_index_default_codegen(drop, all_params, columns):
+    """
+    Generate source text of DataFrame.reset_index() when `drop` is a plain Python bool.
+
+    Returns the tuple (func_text, global_vars); func_text defines _df_reset_index_impl and is meant for exec().
+    Column names are embedded with repr(), so names holding quotes or backslashes stay valid source.
+
+    Example of generated implementation:
+        def _df_reset_index_impl(self, level=None, drop=False, inplace=False, col_level=0, col_fill=""):
+          old_index = self.index
+          result_0 = get_dataframe_data(self, 0)
+          result_1 = get_dataframe_data(self, 1)
+          return pandas.DataFrame({'index': old_index, 'A': result_0, 'B': result_1})
+    """
+    df = all_params[0]
+    func_lines = [f'def _df_reset_index_impl({", ".join(all_params)}):']
+    result_name = []
+    if not drop:
+        func_lines.append(f'  old_index = {df}.index')
+        result_name.append(('old_index', 'index'))
+    for i, c in enumerate(columns):
+        func_lines.append(f'  result_{i} = get_dataframe_data({df}, {i})')
+        result_name.append((f'result_{i}', c))
+    data = ', '.join(f'{str(column_name)!r}: {column}' for column, column_name in result_name)
+    func_lines.append(f'  return pandas.DataFrame({{{data}}})')
+    func_text = '\n'.join(func_lines)
+
+    global_vars = {'pandas': pandas,
+                   'numpy': numpy,
+                   'get_dataframe_data': get_dataframe_data}
+
+    return func_text, global_vars
+
+
+def sdc_pandas_dataframe_reset_index_impl_default(self, drop=False):
+    """Exec the source generated for a plain bool `drop` and return the resulting _df_reset_index_impl."""
+    signature_params = ['self', 'level=None', 'drop=False', 'inplace=False', 'col_level=0', 'col_fill=""']
+    source, namespace = sdc_pandas_dataframe_reset_index_default_codegen(drop, signature_params, self.columns)
+
+    compiled = {}
+    exec(source, namespace, compiled)
+
+    return compiled['_df_reset_index_impl']
+
+
+@sdc_overload_method(DataFrameType, 'reset_index')
+def sdc_pandas_dataframe_reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill=''):
+    """
+    Intel Scalable Dataframe Compiler User Guide
+    ********************************************
+    Pandas API: pandas.DataFrame.reset_index
+
+    Limitations
+    -----------
+    - Reset the index of the DataFrame, and use the default one instead.
+    - Parameters level, inplace, col_level, col_fill are unsupported.
+    - Parameter drop can only be a literal value or the default value.
+
+    Examples
+    --------
+    .. literalinclude:: ../../../examples/dataframe/dataframe_reset_index_drop_False.py
+        :language: python
+        :lines: 36-
+        :caption: Reset the index of the DataFrame, and use the default one instead.
+                  The old index becomes the first column.
+        :name: ex_dataframe_reset_index
+
+    .. command-output:: python ./dataframe/dataframe_reset_index_drop_False.py
+        :cwd: ../../../examples
+
+    .. literalinclude:: ../../../examples/dataframe/dataframe_reset_index_drop_True.py
+        :language: python
+        :lines: 36-
+        :caption: Reset the index of the DataFrame, and use the default one instead.
+        :name: ex_dataframe_reset_index_drop_true
+
+    .. command-output:: python ./dataframe/dataframe_reset_index_drop_True.py
+        :cwd: ../../../examples
+
+    Intel Scalable Dataframe Compiler Developer Guide
+    *************************************************
+    Pandas DataFrame method :meth:`pandas.DataFrame.reset_index` implementation.
+
+    .. only:: developer
+
+        Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_df_reset_index*
+    """
+
+    func_name = 'reset_index'
+
+    ty_checker = TypeChecker('Method {}().'.format(func_name))
+    ty_checker.check(self, DataFrameType)
+
+    if not (level is None or isinstance(level, types.Omitted)):
+        raise TypingError('{} Unsupported parameter level. Given: {}'.format(func_name, level))
+
+    if not (isinstance(drop, (types.Omitted, types.Boolean)) or drop is False):
+        ty_checker.raise_exc(drop, 'bool', 'drop')
+    # an omitted drop is replaced by its default, the plain Python value False
+    if isinstance(drop, types.Omitted):
+        drop = False
+
+    if not (inplace is False or isinstance(inplace, types.Omitted)):
+        raise TypingError('{} Unsupported parameter inplace. Given: {}'.format(func_name, inplace))
+
+    if not (col_level == 0 or isinstance(col_level, types.Omitted)):
+        raise TypingError('{} Unsupported parameter col_level. Given: {}'.format(func_name, col_level))
+
+    if not (col_fill == '' or isinstance(col_fill, types.Omitted)):
+        raise TypingError('{} Unsupported parameter col_fill. Given: {}'.format(func_name, col_fill))
+    # a plain Python bool selects the default codegen; any other non-literal drop is rejected
+    if not isinstance(drop, types.Literal):
+        if isinstance(drop, bool):
+            return sdc_pandas_dataframe_reset_index_impl_default(self, drop=drop)
+        else:
+            raise SDCLimitation('{} only work with Boolean literals drop.'.format(func_name))
+
+    return sdc_pandas_dataframe_reset_index_impl(self, drop=drop)
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
@@ -887,49 +887,6 @@ def lower_fillna_dummy(context, builder, sig, args):
     return out_obj._getvalue()
 
 
-@overload_method(DataFrameType, 'reset_index')
-def reset_index_overload(df, level=None, drop=False, inplace=False,
-                         col_level=0, col_fill=''):
-
-    # TODO: avoid dummy and generate func here when inlining is possible
-    # TODO: inplace of df with parent (reflection)
-    def _impl(df, level=None, drop=False, inplace=False,
-              col_level=0, col_fill=''):
-        return sdc.hiframes.pd_dataframe_ext.reset_index_dummy(df, inplace)
-
-    return _impl
-
-
-def reset_index_dummy(df, n):
-    return df
-
-
-@infer_global(reset_index_dummy)
-class ResetIndexDummyTyper(AbstractTemplate):
-    def generic(self, args, kws):
-        df, inplace = args
-        # inplace value
-        if isinstance(inplace, sdc.utilities.utils.BooleanLiteral):
-            inplace = inplace.literal_value
-        else:
-            # XXX inplace type is just bool when value not passed. Therefore,
-            # we assume the default False value.
-            # TODO: more robust fix or just check
-            inplace = False
-
-        if not inplace:
-            out_df = DataFrameType(df.data, None, df.columns)
-            return signature(out_df, *args)
-        return signature(types.none, *args)
-
-
-@lower_builtin(reset_index_dummy, types.VarArg(types.Any))
-def lower_reset_index_dummy(context, builder, sig, args):
-    out_obj = cgutils.create_struct_proxy(
-        sig.return_type)(context, builder)
-    return out_obj._getvalue()
-
-
 @overload_method(DataFrameType, 'dropna')
 def dropna_overload(df, axis=0, how='any', thresh=None, subset=None,
                     inplace=False):
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
@@ -32,6 +32,7 @@
 import string
 import unittest
 from itertools import permutations, product
+from numba import types
 from numba.config import IS_32BITS
 from numba.special import literal_unroll
 from numba.errors import TypingError
@@ -1277,13 +1278,58 @@ def test_impl(A):
         hpat_func = self.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(df), test_impl(df2))
 
-    @skip_numba_jit
-    def test_df_reset_index1(self):
+    def test_df_reset_index_drop(self):
+        def test_impl(df, drop):
+            return df.reset_index(drop=drop)
+
+        df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': np.arange(4.0)})
+        hpat_func = self.jit(test_impl)
+        # drop is a runtime argument (not a literal) here, so each call is expected to raise
+        for drop in [True, False]:
+            with self.subTest(drop=drop):
+                with self.assertRaises(Exception) as raises:
+                    hpat_func(df, drop)
+                msg = 'only work with Boolean literals drop'
+                self.assertIn(msg.format(types.bool_), str(raises.exception))
+
+    def test_df_reset_index_drop_false_index_int(self):
+        def test_impl(df):
+            return df.reset_index(drop=False)
+
+        df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0],
+                           'B': np.arange(4.0)}, index=[5, 8, 4, 6])
+        hpat_func = self.jit(test_impl)
+        # the jitted result must match pandas for a literal drop=False on an integer index
+        pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
+
+    def test_df_reset_index_drop_true_index_int(self):
         def test_impl(df):
             return df.reset_index(drop=True)
 
-        df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0]})
+        df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0],
+                           'B': np.arange(4.0)}, index=[5, 8, 4, 6])
         hpat_func = self.jit(test_impl)
+
+        pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
+
+    def test_df_reset_index_drop_default_index_int(self):
+        def test_impl(df):
+            return df.reset_index()
+
+        df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0],
+                           'B': np.arange(4.0)}, index=[5, 8, 4, 6])
+        hpat_func = self.jit(test_impl)
+        # drop is omitted here, so the default value (False) applies; the result must match pandas
+        pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
+
+    @skip_numba_jit
+    def test_df_reset_index_empty_df(self):
+        def test_impl(df):
+            return df.reset_index()
+
+        df = pd.DataFrame({})
+        hpat_func = self.jit(test_impl)
+
         pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
 
     @skip_numba_jit
diff --git a/sdc/tests/tests_perf/test_perf_df.py b/sdc/tests/tests_perf/test_perf_df.py
@@ -78,6 +78,7 @@ def _test_case(self, pyfunc, name, total_data_length, input_data=None, data_num=
     TC(name='min', size=[10 ** 7], check_skipna=True),
     TC(name='pct_change', size=[10 ** 7]),
     TC(name='prod', size=[10 ** 7], check_skipna=True),
+    TC(name='reset_index', size=[10 ** 7], params='drop=False'),
     TC(name='std', size=[10 ** 7], check_skipna=True),
     TC(name='sum', size=[10 ** 7], check_skipna=True),
     TC(name='var', size=[10 ** 7], check_skipna=True),