Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 669443c

Browse files
authored
Df.loc impl (#788)
I'm ok with this for now. We'll return to it later
1 parent 002da4b commit 669443c

File tree

5 files changed

+267
-1
lines changed

5 files changed

+267
-1
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
28+
"""
29+
Expected result:
30+
A B C
31+
2 3.0 6 2
32+
"""
33+
34+
import pandas as pd
35+
from numba import njit
36+
37+
38+
@njit
def dataframe_loc():
    # Build a three-column frame with the default integer index, then
    # select the row whose label is 2 via the ``loc`` accessor.
    df = pd.DataFrame({
        'A': [1.0, 2.0, 3.0, 1.0],
        'B': [4, 5, 6, 7],
        'C': [4, 5, 2, 1],
    })

    return df.loc[2]


print(dataframe_loc())

sdc/datatypes/common_functions.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -596,8 +596,60 @@ def _sdc_take(data, indexes):
596596

597597
@sdc_overload(_sdc_take, jit_options={'parallel': True})
598598
def _sdc_take_overload(data, indexes):
599+
if isinstance(indexes.dtype, types.ListType) and isinstance(data, (types.Array, types.List)):
600+
arr_dtype = data.dtype
601+
602+
def _sdc_take_list_impl(data, indexes):
    """
    Gather ``data[k]`` for every position ``k`` stored in ``indexes``.

    ``indexes`` is a list of lists of positions. The result is one flat
    array holding the gathered values of ``indexes[0]``, then those of
    ``indexes[1]``, and so on.
    """
    n_chunks = len(indexes)

    # offsets[i] is the position in the result where the values of
    # indexes[i] start; offsets[n_chunks] is the total result size.
    # Computing the offsets once up front replaces re-summing the
    # lengths of indexes[0:i] (a list slice copy) on every iteration
    # of the parallel loop.
    offsets = numpy.zeros(n_chunks + 1, dtype=numpy.int64)
    for i in range(n_chunks):
        offsets[i + 1] = offsets[i] + len(indexes[i])

    res_arr = numpy.empty(offsets[n_chunks], dtype=arr_dtype)
    for i in numba.prange(n_chunks):
        # Each iteration writes only to its own disjoint slice of res_arr.
        current_pos = offsets[i]
        for j in range(len(indexes[i])):
            res_arr[current_pos] = data[indexes[i][j]]
            current_pos += 1
    return res_arr
616+
617+
return _sdc_take_list_impl
618+
619+
elif isinstance(indexes.dtype, types.ListType) and data == string_array_type:
620+
def _sdc_take_list_str_impl(data, indexes):
    """
    Gather strings ``data[k]`` for every position ``k`` stored in ``indexes``.

    ``indexes`` is a list of lists of positions. The result is one flat
    string array holding the gathered values of ``indexes[0]``, then those
    of ``indexes[1]``, and so on. Elements that are NA in ``data`` are
    marked NA in the result.
    """
    n_chunks = len(indexes)

    # offsets[i] is the position in the result where the values of
    # indexes[i] start; offsets[n_chunks] is the total result size.
    # Computing the offsets once up front replaces re-summing the
    # lengths of indexes[0:i] (a list slice copy) on every iteration
    # of both parallel loops below.
    offsets = numpy.zeros(n_chunks + 1, dtype=numpy.int64)
    for i in range(n_chunks):
        offsets[i + 1] = offsets[i] + len(indexes[i])
    res_size = offsets[n_chunks]

    # First pass: total byte size needed for the result and the NA mask.
    nan_mask = numpy.zeros(res_size, dtype=numpy.bool_)
    num_total_bytes = 0
    for i in numba.prange(n_chunks):
        current_pos = offsets[i]
        for j in range(len(indexes[i])):
            num_total_bytes += get_utf8_size(data[indexes[i][j]])
            if isna(data, indexes[i][j]):
                nan_mask[current_pos] = True
            current_pos += 1

    # Second pass: copy the strings and re-apply the NA flags.
    res_arr = pre_alloc_string_array(res_size, num_total_bytes)
    for i in numba.prange(n_chunks):
        current_pos = offsets[i]
        for j in range(len(indexes[i])):
            res_arr[current_pos] = data[indexes[i][j]]
            if nan_mask[current_pos]:
                str_arr_set_na(res_arr, current_pos)
            current_pos += 1

    return res_arr
649+
650+
return _sdc_take_list_str_impl
599651

600-
if isinstance(data, types.Array):
652+
elif isinstance(data, types.Array):
601653
arr_dtype = data.dtype
602654

603655
def _sdc_take_array_impl(data, indexes):

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
from sdc.functions.numpy_like import getitem_by_mask
6565
from sdc.datatypes.common_functions import _sdc_take, sdc_reindex_series
6666
from sdc.utilities.prange_utils import parallel_chunks
67+
from sdc.functions.numpy_like import find_idx
6768

6869

6970
@sdc_overload_attribute(DataFrameType, 'index')
@@ -1876,6 +1877,58 @@ def _df_getitem_unicode_idx_impl(self, idx):
18761877
ty_checker.raise_exc(idx, expected_types, 'idx')
18771878

18781879

1880+
def df_getitem_single_label_loc_codegen(self, idx):
    """
    Generate the source of ``df.loc[label]`` for a single scalar label.

    Returns a ``(func_text, global_vars)`` pair: the text of a function
    named ``_df_getitem_single_label_loc_impl`` and the globals it needs.

    Example of generated implementation:
        def _df_getitem_single_label_loc_impl(self, idx):
            idx_list = find_idx(self._dataframe._index, idx)
            new_index = _sdc_take(self._dataframe._index, idx_list)
            if len(new_index) < 1:
                raise KeyError('Index is not in the DataFrame')
            data_0 = _sdc_take(self._dataframe._data[0], idx_list)
            res_data_0 = pandas.Series(data_0)
            data_1 = _sdc_take(self._dataframe._data[1], idx_list)
            res_data_1 = pandas.Series(data_1)
            return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=new_index)
    """
    if isinstance(self.index, types.NoneType):
        # No explicit index: the label is used directly as the position.
        # NOTE(review): this branch performs no bounds check on idx, so an
        # out-of-range label is not turned into a KeyError here - confirm
        # whether _sdc_take guards against that.
        lookup_lines = ['    idx_list = numpy.array([idx])',
                        '    new_index = numpy.array([idx])']
    else:
        lookup_lines = ['    idx_list = find_idx(self._dataframe._index, idx)',
                        '    new_index = _sdc_take(self._dataframe._index, idx_list)']

    func_lines = ['def _df_getitem_single_label_loc_impl(self, idx):']
    func_lines += lookup_lines

    # find_idx returns one list per parallel chunk, so len(idx_list) is the
    # number of chunks, not the number of matches. The flattened new_index
    # has exactly one element per matching row, so test that instead.
    func_lines += ['    if len(new_index) < 1:',
                   "        raise KeyError('Index is not in the DataFrame')"]

    results = []
    for i, col in enumerate(self.columns):
        taken = f'data_{i}'
        series = f'res_data_{i}'
        func_lines += [f'    {taken} = _sdc_take(self._dataframe._data[{i}], idx_list)',
                       f'    {series} = pandas.Series({taken})']
        results.append((col, series))

    df_items = ', '.join(f'"{col}": {series}' for col, series in results)
    func_lines.append(f'    return pandas.DataFrame({{{df_items}}}, index=new_index)')

    func_text = '\n'.join(func_lines)
    global_vars = {'pandas': pandas, 'numpy': numpy,
                   'numba': numba,
                   '_sdc_take': _sdc_take,
                   'find_idx': find_idx,
                   'KeyError': KeyError}

    return func_text, global_vars
1930+
1931+
18791932
def df_getitem_int_iloc_codegen(self, idx):
18801933
"""
18811934
Example of generated implementation:
@@ -2010,6 +2063,9 @@ def _df_getitem_list_bool_iloc_impl(self, idx):
20102063
return func_text, global_vars
20112064

20122065

2066+
# Implementation generator for ``df.loc[<single label>]``, built from the
# codegen function and the name of the function that codegen emits.
gen_df_getitem_loc_single_label_impl = gen_impl_generator(
    df_getitem_single_label_loc_codegen,
    '_df_getitem_single_label_loc_impl',
)
2068+
20132069
gen_df_getitem_iloc_int_impl = gen_impl_generator(
20142070
df_getitem_int_iloc_codegen, '_df_getitem_int_iloc_impl')
20152071

@@ -2030,6 +2086,13 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx):
20302086

20312087
accessor = self.accessor.literal_value
20322088

2089+
if accessor == 'loc':
2090+
if isinstance(idx, (types.Integer, types.UnicodeType, types.StringLiteral)):
2091+
return gen_df_getitem_loc_single_label_impl(self.dataframe, idx)
2092+
2093+
ty_checker = TypeChecker('Attribute loc().')
2094+
ty_checker.raise_exc(idx, 'int or str', 'idx')
2095+
20332096
if accessor == 'iat':
20342097
if isinstance(idx, types.Tuple) and isinstance(idx[1], types.Literal):
20352098
col = idx[1].literal_value
@@ -2181,6 +2244,57 @@ def sdc_pandas_dataframe_iat_impl(self):
21812244
return sdc_pandas_dataframe_iat_impl
21822245

21832246

2247+
@sdc_overload_attribute(DataFrameType, 'loc')
def sdc_pandas_dataframe_loc(self):
    """
    Intel Scalable Dataframe Compiler User Guide
    ********************************************

    Pandas API: pandas.DataFrame.loc

    Limitations
    -----------
    - Loc always returns Dataframe.
    - Parameter ``idx`` is supported only to be a single value, e.g. :obj:`df.loc['A']`.

    Examples
    --------
    .. literalinclude:: ../../../examples/dataframe/dataframe_loc.py
       :language: python
       :lines: 36-
       :caption: Access a group of rows and columns by label(s) or a boolean array.
       :name: ex_dataframe_loc

    .. command-output:: python ./dataframe/dataframe_loc.py
       :cwd: ../../../examples

    .. seealso::
        :ref:`DataFrame.at <pandas.DataFrame.at>`
            Access a single value for a row/column label pair.
        :ref:`DataFrame.iloc <pandas.DataFrame.iloc>`
            Access group of rows and columns by integer position(s).
        :ref:`DataFrame.xs <pandas.DataFrame.xs>`
            Returns a cross-section (row(s) or column(s)) from the Series/DataFrame.
        :ref:`Series.loc <pandas.Series.loc>`
            Access group of values using labels.

    Intel Scalable Dataframe Compiler Developer Guide
    *************************************************
    Pandas DataFrame method :meth:`pandas.DataFrame.loc` implementation.

    .. only:: developer
        Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_df_loc*
    """

    # Reject anything that is not a DataFrame at typing time.
    TypeChecker('Attribute loc().').check(self, DataFrameType)

    def _df_loc_accessor_impl(self):
        # Wrap the frame in an accessor object tagged 'loc'.
        return sdc.datatypes.hpat_pandas_dataframe_getitem_types.dataframe_getitem_accessor_init(self, 'loc')

    return _df_loc_accessor_impl
2296+
2297+
21842298
@sdc_overload_method(DataFrameType, 'pct_change')
21852299
def pct_change_overload(df, periods=1, fill_method='pad', limit=None, freq=None):
21862300
"""

sdc/functions/numpy_like.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from numba import types, jit, prange, numpy_support, literally
4242
from numba.errors import TypingError
4343
from numba.targets.arraymath import get_isnan
44+
from numba.typed import List
4445

4546
import sdc
4647
from sdc.utilities.sdc_typing_utils import TypeChecker
@@ -710,6 +711,26 @@ def dropna_impl(arr, idx, name):
710711
return dropna_impl
711712

712713

714+
def find_idx(arr, idx):
    """Stub to be overloaded; the pure-Python body does nothing and returns None."""
716+
717+
718+
@sdc_overload(find_idx)
def find_idx_overload(arr, idx):
    """
    Overload of ``find_idx``: collect the positions where ``arr`` equals ``idx``.

    The returned implementation splits ``range(len(arr))`` with
    ``parallel_chunks`` and returns a list with one typed list of int64
    per chunk, each holding the positions ``j`` of that chunk for which
    ``arr[j] == idx``.
    """
    def find_idx_impl(arr, idx):
        chunks = parallel_chunks(len(arr))
        n_chunks = len(chunks)
        # One private output list per chunk, so each parallel iteration
        # appends only to its own list.
        matches = [List.empty_list(types.int64) for k in range(n_chunks)]
        for chunk_no in prange(n_chunks):
            bounds = chunks[chunk_no]
            for pos in range(bounds.start, bounds.stop):
                if arr[pos] == idx:
                    matches[chunk_no].append(pos)

        return matches

    return find_idx_impl
732+
733+
713734
def nanmean(a):
714735
pass
715736

sdc/tests/test_dataframe.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,6 +1197,40 @@ def test_impl(df):
11971197
msg = 'Index is out of bounds for axis'
11981198
self.assertIn(msg, str(raises.exception))
11991199

1200+
def test_df_loc(self):
    # Label 4 occurs twice in the index, so pandas also returns a DataFrame
    # and the two results can be compared frame-to-frame.
    def test_impl(df):
        return df.loc[4]

    columns = {"A": [3.2, 4.4, 7.0, 3.3, 1.0],
               "B": [3, 4, 1, 0, 222],
               "C": [3.1, 8.4, 7.1, 3.2, 1]}
    df = pd.DataFrame(columns, index=[3, 4, 1, 4, 0])
    sdc_func = sdc.jit(test_impl)

    actual = sdc_func(df)
    expected = test_impl(df)
    pd.testing.assert_frame_equal(actual, expected)
1210+
1211+
@unittest.skip("SDC Dataframe.loc[] always return Dataframe")
def test_df_loc_str(self):
    # Selects a unique string label from a frame of string columns.
    def test_impl(df):
        return df.loc['c']

    sdc_func = sdc.jit(test_impl)
    # All labels are distinct ASCII letters; the fourth label used to be a
    # Cyrillic look-alike of 'c', which is easy to misread as a duplicate.
    idx = ['a', 'b', 'c', 'd', 'e']
    df = pd.DataFrame({"A": ['3.2', '4.4', '7.0', '3.3', '1.0'],
                       "B": ['3', '4', '1', '0', '222'],
                       "C": ['3.1', '8.4', '7.1', '3.2', '1']}, index=idx)
    pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))
1222+
1223+
@unittest.skip("SDC Dataframe.loc[] always return Dataframe")
def test_df_loc_no_idx(self):
    # Selects label 2 from a frame built without an explicit index.
    def test_impl(df):
        return df.loc[2]

    columns = {"A": [3.2, 4.4, 7.0, 3.3, 1.0],
               "B": [3, 4, 1, 0, 222],
               "C": [3.1, 8.4, 7.1, 3.2, 1]}
    df = pd.DataFrame(columns)
    sdc_func = sdc.jit(test_impl)

    actual = sdc_func(df)
    expected = test_impl(df)
    pd.testing.assert_frame_equal(actual, expected)
1233+
12001234
def test_df_head(self):
12011235
def get_func(n):
12021236
def impl(a):

0 commit comments

Comments
 (0)