Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 6b53d4c

Browse files
authored
Series.dropna scalable draft (#604)
Series.dropna parallel
1 parent 4efe6b0 commit 6b53d4c

File tree

2 files changed

+60
-7
lines changed

2 files changed

+60
-7
lines changed

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4933,14 +4933,22 @@ def hpat_pandas_series_dropna(self, axis=0, inplace=False):
49334933
if not (inplace is False or isinstance(inplace, types.Omitted)):
49344934
ty_checker.raise_exc(inplace, 'bool', 'inplace')
49354935

4936-
def hpat_pandas_series_dropna_impl(self, axis=0, inplace=False):
4937-
# generate Series index if needed by using SeriesType.index (i.e. not self._index)
4938-
na_data_arr = sdc.hiframes.api.get_nan_mask(self._data)
4939-
data = self._data[~na_data_arr]
4940-
index = self.index[~na_data_arr]
4941-
return pandas.Series(data, index, self._name)
4936+
if isinstance(self.data.dtype, types.Number) and isinstance(self.index, (types.Number, types.NoneType)):
4937+
def hpat_pandas_series_dropna_impl(self, axis=0, inplace=False):
4938+
index = self.index
4939+
return numpy_like.dropna(self._data, index, self._name)
4940+
4941+
return hpat_pandas_series_dropna_impl
4942+
4943+
else:
4944+
def hpat_pandas_series_dropna_str_impl(self, axis=0, inplace=False):
4945+
# generate Series index if needed by using SeriesType.index (i.e. not self._index)
4946+
na_data_arr = sdc.hiframes.api.get_nan_mask(self._data)
4947+
data = self._data[~na_data_arr]
4948+
index = self.index[~na_data_arr]
4949+
return pandas.Series(data, index, self._name)
49424950

4943-
return hpat_pandas_series_dropna_impl
4951+
return hpat_pandas_series_dropna_str_impl
49444952

49454953

49464954
@sdc_overload_method(SeriesType, 'fillna')

sdc/functions/numpy_like.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
import numba
3535
import numpy
36+
import pandas
3637
import numpy as np
3738

3839
from numba import types, jit, prange, numpy_support, literally
@@ -43,6 +44,7 @@
4344
from sdc.utilities.sdc_typing_utils import TypeChecker
4445
from sdc.str_arr_ext import (StringArrayType, pre_alloc_string_array, get_utf8_size, str_arr_is_na)
4546
from sdc.utilities.utils import sdc_overload, sdc_register_jitable
47+
from sdc.utilities.prange_utils import parallel_chunks
4648

4749

4850
def astype(self, dtype):
@@ -475,6 +477,49 @@ def nanprod_impl(a):
475477
return nanprod_impl
476478

477479

480+
def dropna(arr, idx, name):
    """Placeholder with no Python-level behavior; calling it directly returns ``None``.

    The working implementation is attached to this function by
    ``dropna_overload``, which is registered via ``@sdc_overload(dropna)``.
    """
482+
483+
484+
@sdc_overload(dropna)
def dropna_overload(arr, idx, name):
    """Overload of ``dropna``: drop NaN entries from ``arr`` and the matching ``idx`` entries.

    Parameters
    ----------
    arr
        Data array type; its ``dtype`` selects the NaN predicate via
        ``get_isnan`` and is used as the dtype of the result data.
    idx
        Index array type; its ``dtype`` is used as the dtype of the result
        index, and it is indexed positionally alongside ``arr``.
    name
        Passed through unchanged as the third argument of ``pandas.Series``.

    Returns
    -------
    function
        ``dropna_impl``, which returns
        ``pandas.Series(result_data, result_index, name)`` holding only the
        positions for which ``isnan(arr[j])`` is false.
    """
    dtype = arr.dtype
    dtype_idx = idx.dtype
    isnan = get_isnan(dtype)

    def dropna_impl(arr, idx, name):
        chunks = parallel_chunks(len(arr))
        n_chunks = len(chunks)
        arr_len = numpy.empty(n_chunks, dtype=numpy.int64)
        length = 0

        # Pass 1: count the non-NaN elements of every chunk. ``length``
        # accumulates the total so the outputs can be allocated once.
        for i in prange(n_chunks):
            chunk = chunks[i]
            res = 0
            for j in range(chunk.start, chunk.stop):
                if not isnan(arr[j]):
                    res += 1
            length += res
            arr_len[i] = res

        # Exclusive prefix sum of the per-chunk counts, computed once here
        # instead of re-summing ``arr_len[0:i]`` inside every iteration of
        # the parallel loop below (which was quadratic in the chunk count).
        offsets = numpy.empty(n_chunks, dtype=numpy.int64)
        running = 0
        for i in range(n_chunks):
            offsets[i] = running
            running += arr_len[i]

        result_data = numpy.empty(shape=length, dtype=dtype)
        result_index = numpy.empty(shape=length, dtype=dtype_idx)

        # Pass 2: every chunk writes its kept elements into its own slice of
        # the outputs, starting at ``offsets[i]``; slices do not overlap
        # because each one is exactly ``arr_len[i]`` elements long.
        # NOTE(review): output order follows the order of ``chunks`` --
        # presumably ascending and contiguous; confirm in parallel_chunks.
        for i in prange(n_chunks):
            chunk = chunks[i]
            current_pos = offsets[i]

            for j in range(chunk.start, chunk.stop):
                if not isnan(arr[j]):
                    result_data[current_pos] = arr[j]
                    result_index[current_pos] = idx[j]
                    current_pos += 1

        return pandas.Series(result_data, result_index, name)

    return dropna_impl
521+
522+
478523
def nanmean(a):
    """Placeholder with no Python-level behavior; calling it directly returns ``None``.

    NOTE(review): presumably the real implementation is attached by an
    overload registered elsewhere in this module -- confirm.
    """
480525

0 commit comments

Comments
 (0)