Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit ea2cbe4

Browse files
authored
Df reset index (#656)
1 parent 5d201ab commit ea2cbe4

File tree

6 files changed

+310
-46
lines changed

6 files changed

+310
-46
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
# result DataFrame
28+
# index A
29+
# 0 5 14
30+
# 1 2 4
31+
# 2 -11 5
32+
# 3 0 4
33+
# 4 13 1
34+
# 5 9 55
35+
36+
import pandas as pd
37+
from numba import njit
38+
39+
40+
@njit
def dataframe_reset_index():
    # Frame with a non-default integer index so the effect of the reset is visible
    frame = pd.DataFrame({"A": [14, 4, 5, 4, 1, 55]}, index=[5, 2, -11, 0, 13, 9])
    # drop=False keeps the old index values as a leading "index" column
    result = frame.reset_index(drop=False)

    return result


print(dataframe_reset_index())
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
# result DataFrame
28+
# A
29+
# 0 14
30+
# 1 4
31+
# 2 5
32+
# 3 4
33+
# 4 1
34+
# 5 55
35+
36+
import pandas as pd
37+
from numba import njit
38+
39+
40+
@njit
def dataframe_reset_index():
    # Frame with a non-default integer index so the effect of the reset is visible
    frame = pd.DataFrame({"A": [14, 4, 5, 4, 1, 55]}, index=[5, 2, -11, 0, 13, 9])
    # drop=True discards the old index values instead of turning them into a column
    result = frame.reset_index(drop=True)

    return result


print(dataframe_reset_index())

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2344,3 +2344,169 @@ def _df_set_column_unicode_key_impl(self, key, value):
23442344

23452345
ty_checker = TypeChecker('Method _set_column().')
23462346
ty_checker.raise_exc(key, 'str', 'key')
2347+
2348+
2349+
def _reset_index_codegen(keep_index, all_params, columns):
    """
    Build the source text and globals of a ``_df_reset_index_impl`` function.

    Shared by both public codegen entry points, which differ only in how the
    ``drop`` argument is interpreted.

    Parameters
    ----------
    keep_index: bool
        True when the old index must become the first column of the result.
    all_params: list of str
        Parameter declarations of the generated function; the first entry is
        used as the name of the DataFrame argument.
    columns: sequence of str
        Column names of the DataFrame, in positional order.

    Returns
    -------
    (str, dict)
        Source text of the generated function and the globals it needs.

    Raises
    ------
    ValueError
        If the old index has to be inserted but both "index" and "level_0"
        are already used as column names.
    """
    df = all_params[0]
    all_params_str = ', '.join(all_params)
    func_lines = [f'def _df_reset_index_impl({all_params_str}):']

    # (variable name in the generated code, column name in the result) pairs
    result_name = []
    if keep_index:
        # pandas names the inserted column "level_0" when "index" is taken.
        # Reusing "index" would emit a duplicate dict key and lose a column.
        index_column = 'level_0' if 'index' in columns else 'index'
        if index_column in columns:
            raise ValueError(f'cannot insert {index_column}, already exists')
        func_lines.append(f' old_index = {df}.index')
        result_name.append(('old_index', index_column))

    for i, c in enumerate(columns):
        func_lines.append(f' result_{i} = get_dataframe_data({df}, {i})')
        result_name.append((f'result_{i}', c))

    # !r quotes and escapes each name, so the generated source stays valid
    # for column names that contain quotes or backslashes
    data = ', '.join(f'{column_name!r}: {column}' for column, column_name in result_name)
    func_lines.append(f' return pandas.DataFrame({{{data}}})')
    func_text = '\n'.join(func_lines)

    global_vars = {'pandas': pandas,
                   'numpy': numpy,
                   'get_dataframe_data': get_dataframe_data}

    return func_text, global_vars


def _compile_reset_index_impl(codegen, df_type, drop):
    """
    Run ``codegen`` for ``df_type`` and return the generated implementation.

    ``codegen`` is one of the public ``*_codegen`` functions below; ``df_type``
    only needs a ``columns`` attribute with the column names.
    """
    all_params = ['self', 'level=None', 'drop=False', 'inplace=False', 'col_level=0', 'col_fill=""']

    func_text, global_vars = codegen(drop, all_params, df_type.columns)
    loc_vars = {}
    # The text is generated above from escaped column names only
    exec(func_text, global_vars, loc_vars)

    return loc_vars['_df_reset_index_impl']


def sdc_pandas_dataframe_reset_index_codegen(drop, all_params, columns):
    """
    Generate reset_index() for a ``drop`` given as a literal type.

    Example of generated implementation:
        def _df_reset_index_impl(self, level=None, drop=False, inplace=False, col_level=0, col_fill=""):
            old_index = self.index
            result_0 = get_dataframe_data(self, 0)
            result_1 = get_dataframe_data(self, 1)
            result_2 = get_dataframe_data(self, 2)
            return pandas.DataFrame({'index': old_index, 'A': result_0, 'B': result_1, 'C': result_2})
    """
    # Read the value from the literal type; fall back to the object itself so
    # that a plain bool (e.g. the default drop=False of the caller) also works
    drop_value = getattr(drop, 'literal_value', drop)

    return _reset_index_codegen(not drop_value, all_params, columns)


def sdc_pandas_dataframe_reset_index_impl(self, drop=False):
    """Return the reset_index() implementation for ``self`` and a literal ``drop``."""
    return _compile_reset_index_impl(sdc_pandas_dataframe_reset_index_codegen, self, drop)


def sdc_pandas_dataframe_reset_index_default_codegen(drop, all_params, columns):
    """
    Generate reset_index() for a ``drop`` given as a plain Python bool.

    Example of generated implementation:
        def _df_reset_index_impl(self, level=None, drop=False, inplace=False, col_level=0, col_fill=""):
            old_index = self.index
            result_0 = get_dataframe_data(self, 0)
            result_1 = get_dataframe_data(self, 1)
            return pandas.DataFrame({'index': old_index, 'A': result_0, 'B': result_1})
    """
    return _reset_index_codegen(not drop, all_params, columns)


def sdc_pandas_dataframe_reset_index_impl_default(self, drop=False):
    """Return the reset_index() implementation for ``self`` and a plain bool ``drop``."""
    return _compile_reset_index_impl(sdc_pandas_dataframe_reset_index_default_codegen, self, drop)
2438+
2439+
2440+
@sdc_overload_method(DataFrameType, 'reset_index')
def sdc_pandas_dataframe_reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill=''):
    """
    Intel Scalable Dataframe Compiler User Guide
    ********************************************
    Pandas API: pandas.DataFrame.reset_index

    Limitations
    -----------
    - Reset the index of the DataFrame, and use the default one instead.
    - Parameters level, inplace, col_level, col_fill unsupported.
    - Parameter drop can be only literal value or default value.

    Examples
    --------
    .. literalinclude:: ../../../examples/dataframe/dataframe_reset_index_drop_False.py
        :language: python
        :lines: 36-
        :caption: Reset the index of the DataFrame, and use the default one instead.
                  The old index becomes the first column.
        :name: ex_dataframe_reset_index

    .. command-output:: python ./dataframe/dataframe_reset_index_drop_False.py
        :cwd: ../../../examples

    .. literalinclude:: ../../../examples/dataframe/dataframe_reset_index_drop_True.py
        :language: python
        :lines: 36-
        :caption: Reset the index of the DataFrame, and use the default one instead.
        :name: ex_dataframe_reset_index_drop_True

    .. command-output:: python ./dataframe/dataframe_reset_index_drop_True.py
        :cwd: ../../../examples

    Intel Scalable Dataframe Compiler Developer Guide
    *************************************************
    Pandas DataFrame method :meth:`pandas.DataFrame.reset_index` implementation.

    .. only:: developer

        Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_df_reset_index*
    """

    func_name = 'reset_index'

    ty_checker = TypeChecker('Method {}().'.format(func_name))
    ty_checker.check(self, DataFrameType)

    # Only the default value is accepted for level
    if not (level is None or isinstance(level, types.Omitted)):
        raise TypingError('{} Unsupported parameter level. Given: {}'.format(func_name, level))

    # drop may be omitted, a boolean type (literal or not) or the plain default False
    if not (isinstance(drop, (types.Omitted, types.Boolean)) or drop is False):
        ty_checker.raise_exc(drop, 'bool', 'drop')

    # An omitted drop is treated as the default False from here on
    if isinstance(drop, types.Omitted):
        drop = False

    # Only the default values are accepted for inplace, col_level and col_fill
    if not (inplace is False or isinstance(inplace, types.Omitted)):
        raise TypingError('{} Unsupported parameter inplace. Given: {}'.format(func_name, inplace))

    if not (col_level == 0 or isinstance(col_level, types.Omitted)):
        raise TypingError('{} Unsupported parameter col_level. Given: {}'.format(func_name, col_level))

    if not (col_fill == '' or isinstance(col_fill, types.Omitted)):
        raise TypingError('{} Unsupported parameter col_fill. Given: {}'.format(func_name, col_fill))

    # Literal drop: its value is read from the type when generating the code
    if isinstance(drop, types.Literal):
        return sdc_pandas_dataframe_reset_index_impl(self, drop=drop)

    # Plain Python bool: drop was omitted or is the default value
    if isinstance(drop, bool):
        return sdc_pandas_dataframe_reset_index_impl_default(self, drop=drop)

    # A non-literal boolean has no value known at compile time
    raise SDCLimitation('{} only work with Boolean literals drop.'.format(func_name))

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -887,49 +887,6 @@ def lower_fillna_dummy(context, builder, sig, args):
887887
return out_obj._getvalue()
888888

889889

890-
@overload_method(DataFrameType, 'reset_index')
891-
def reset_index_overload(df, level=None, drop=False, inplace=False,
892-
col_level=0, col_fill=''):
893-
894-
# TODO: avoid dummy and generate func here when inlining is possible
895-
# TODO: inplace of df with parent (reflection)
896-
def _impl(df, level=None, drop=False, inplace=False,
897-
col_level=0, col_fill=''):
898-
return sdc.hiframes.pd_dataframe_ext.reset_index_dummy(df, inplace)
899-
900-
return _impl
901-
902-
903-
def reset_index_dummy(df, n):
904-
return df
905-
906-
907-
@infer_global(reset_index_dummy)
908-
class ResetIndexDummyTyper(AbstractTemplate):
909-
def generic(self, args, kws):
910-
df, inplace = args
911-
# inplace value
912-
if isinstance(inplace, sdc.utilities.utils.BooleanLiteral):
913-
inplace = inplace.literal_value
914-
else:
915-
# XXX inplace type is just bool when value not passed. Therefore,
916-
# we assume the default False value.
917-
# TODO: more robust fix or just check
918-
inplace = False
919-
920-
if not inplace:
921-
out_df = DataFrameType(df.data, None, df.columns)
922-
return signature(out_df, *args)
923-
return signature(types.none, *args)
924-
925-
926-
@lower_builtin(reset_index_dummy, types.VarArg(types.Any))
927-
def lower_reset_index_dummy(context, builder, sig, args):
928-
out_obj = cgutils.create_struct_proxy(
929-
sig.return_type)(context, builder)
930-
return out_obj._getvalue()
931-
932-
933890
@overload_method(DataFrameType, 'dropna')
934891
def dropna_overload(df, axis=0, how='any', thresh=None, subset=None,
935892
inplace=False):

sdc/tests/test_dataframe.py

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import string
3333
import unittest
3434
from itertools import permutations, product
35+
from numba import types
3536
from numba.config import IS_32BITS
3637
from numba.special import literal_unroll
3738
from numba.errors import TypingError
@@ -1277,13 +1278,58 @@ def test_impl(A):
12771278
hpat_func = self.jit(test_impl)
12781279
pd.testing.assert_frame_equal(hpat_func(df), test_impl(df2))
12791280

1280-
@skip_numba_jit
1281-
def test_df_reset_index1(self):
1281+
def test_df_reset_index_drop(self):
    # drop passed as a runtime argument is not a literal inside the jitted
    # function, so compilation is expected to fail with the limitation message
    def test_impl(df, drop):
        return df.reset_index(drop=drop)

    df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': np.arange(4.0)})
    hpat_func = self.jit(test_impl)
    # The message has no placeholders, so it is compared as is
    msg = 'only work with Boolean literals drop'

    for drop in [True, False]:
        with self.subTest(drop=drop):
            with self.assertRaises(Exception) as raises:
                hpat_func(df, drop)
            self.assertIn(msg, str(raises.exception))
1294+
1295+
def test_df_reset_index_drop_false_index_int(self):
    # reset_index(drop=False) on a frame with an integer index:
    # the jitted result must equal the pandas result
    def pyfunc(frame):
        return frame.reset_index(drop=False)

    data = {'A': [1.0, 2.0, np.nan, 1.0], 'B': np.arange(4.0)}
    frame = pd.DataFrame(data, index=[5, 8, 4, 6])
    jitted = self.jit(pyfunc)

    actual = jitted(frame)
    expected = pyfunc(frame)
    pd.testing.assert_frame_equal(actual, expected)
1304+
1305+
def test_df_reset_index_drop_true_index_int(self):
    # reset_index(drop=True) on a frame with an integer index:
    # the jitted result must equal the pandas result
    def pyfunc(frame):
        return frame.reset_index(drop=True)

    data = {'A': [1.0, 2.0, np.nan, 1.0], 'B': np.arange(4.0)}
    frame = pd.DataFrame(data, index=[5, 8, 4, 6])
    jitted = self.jit(pyfunc)

    actual = jitted(frame)
    expected = pyfunc(frame)
    pd.testing.assert_frame_equal(actual, expected)
1314+
1315+
def test_df_reset_index_drop_default_index_int(self):
    # reset_index() with every argument left at its default, on a frame with
    # an integer index: the jitted result must equal the pandas result
    def pyfunc(frame):
        return frame.reset_index()

    data = {'A': [1.0, 2.0, np.nan, 1.0], 'B': np.arange(4.0)}
    frame = pd.DataFrame(data, index=[5, 8, 4, 6])
    jitted = self.jit(pyfunc)

    actual = jitted(frame)
    expected = pyfunc(frame)
    pd.testing.assert_frame_equal(actual, expected)
1324+
1325+
@skip_numba_jit
def test_df_reset_index_empty_df(self):
    # reset_index() on a frame without columns; marked with skip_numba_jit
    def pyfunc(frame):
        return frame.reset_index()

    frame = pd.DataFrame({})
    jitted = self.jit(pyfunc)

    actual = jitted(frame)
    expected = pyfunc(frame)
    pd.testing.assert_frame_equal(actual, expected)
12881334

12891335
@skip_numba_jit

sdc/tests/tests_perf/test_perf_df.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def _test_case(self, pyfunc, name, total_data_length, input_data=None, data_num=
7878
TC(name='min', size=[10 ** 7], check_skipna=True),
7979
TC(name='pct_change', size=[10 ** 7]),
8080
TC(name='prod', size=[10 ** 7], check_skipna=True),
81+
TC(name='reset_index', size=[10 ** 7], params='drop=False'),
8182
TC(name='std', size=[10 ** 7], check_skipna=True),
8283
TC(name='sum', size=[10 ** 7], check_skipna=True),
8384
TC(name='var', size=[10 ** 7], check_skipna=True),

0 commit comments

Comments
 (0)