Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit e061965

Browse files
authored
Categorical (#819)
1 parent 21529a7 commit e061965

33 files changed

+1642
-227
lines changed

sdc/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
import sdc.datatypes.hpat_pandas_series_rolling_functions
4545
import sdc.datatypes.hpat_pandas_stringmethods_functions
4646
import sdc.datatypes.hpat_pandas_groupby_functions
47+
import sdc.datatypes.categorical.init
48+
import sdc.datatypes.series.init
4749

4850
import sdc.extensions.indexes.range_index_ext
4951

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
from numba.extending import box, unbox, NativeValue
28+
from numba.core import boxing
29+
from numba.core.imputils import lower_constant
30+
from numba.np import arrayobj
31+
from numba import types
32+
33+
from . import pandas_support
34+
from .types import (
35+
CategoricalDtypeType,
36+
Categorical,
37+
)
38+
39+
40+
@box(CategoricalDtypeType)
41+
def box_CategoricalDtype(typ, val, c):
42+
pd_dtype = pandas_support.as_dtype(typ)
43+
return c.pyapi.unserialize(c.pyapi.serialize_object(pd_dtype))
44+
45+
46+
@unbox(CategoricalDtypeType)
47+
def unbox_CategoricalDtype(typ, val, c):
48+
return NativeValue(c.context.get_dummy_value())
49+
50+
51+
@box(Categorical)
52+
def box_Categorical(typ, val, c):
53+
pandas_module_name = c.context.insert_const_string(c.builder.module, "pandas")
54+
pandas_module = c.pyapi.import_module_noblock(pandas_module_name)
55+
56+
constructor = c.pyapi.object_getattr_string(pandas_module, "Categorical")
57+
58+
empty_list = c.pyapi.list_new(c.context.get_constant(types.intp, 0))
59+
args = c.pyapi.tuple_pack([empty_list])
60+
categorical = c.pyapi.call(constructor, args)
61+
62+
dtype = box_CategoricalDtype(typ.pd_dtype, val, c)
63+
c.pyapi.object_setattr_string(categorical, "_dtype", dtype)
64+
65+
codes = boxing.box_array(typ.codes, val, c)
66+
c.pyapi.object_setattr_string(categorical, "_codes", codes)
67+
68+
c.pyapi.decref(codes)
69+
c.pyapi.decref(dtype)
70+
c.pyapi.decref(args)
71+
c.pyapi.decref(empty_list)
72+
c.pyapi.decref(constructor)
73+
c.pyapi.decref(pandas_module)
74+
return categorical
75+
76+
77+
@unbox(Categorical)
78+
def unbox_Categorical(typ, val, c):
79+
codes = c.pyapi.object_getattr_string(val, "codes")
80+
native_value = boxing.unbox_array(typ.codes, codes, c)
81+
c.pyapi.decref(codes)
82+
return native_value
83+
84+
85+
@lower_constant(Categorical)
86+
def constant_Categorical(context, builder, ty, pyval):
87+
"""
88+
Create a constant Categorical.
89+
"""
90+
return arrayobj.constant_array(context, builder, ty.codes, pyval.codes)
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
from sdc.utilities.utils import sdc_overload_attribute
28+
29+
from .types import CategoricalDtypeType
30+
31+
32+
@sdc_overload_attribute(CategoricalDtypeType, 'ordered')
33+
def pd_CategoricalDtype_categories_overload(self):
34+
ordered = self.ordered
35+
36+
def impl(self):
37+
return ordered
38+
return impl

sdc/datatypes/categorical/init.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
"""
28+
Init Numba extension for Pandas Categorical.
29+
"""
30+
31+
from . import types
32+
from . import typeof
33+
from . import models
34+
from . import boxing
35+
from . import pdimpl
36+
from . import rewrites
37+
from . import functions
38+
39+
import numba
40+
41+
42+
# register new types in numba.types for using in objmode
43+
setattr(numba.types, "CategoricalDtype", types.CategoricalDtypeType)
44+
setattr(numba.types, "Categorical", types.Categorical)
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
from numba.extending import models
28+
from numba.extending import register_model
29+
30+
from .types import (
31+
CategoricalDtypeType,
32+
Categorical,
33+
)
34+
35+
36+
register_model(CategoricalDtypeType)(models.OpaqueModel)
37+
register_model(Categorical)(models.ArrayModel)
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
import pandas as pd
28+
29+
from numba import types
30+
31+
from .types import CategoricalDtypeType
32+
33+
34+
def from_dtype(pdtype):
35+
"""
36+
Return a Numba Type instance corresponding to the given Pandas *dtype*.
37+
NotImplementedError is raised if unsupported Pandas dtypes.
38+
"""
39+
# TODO: use issubclass
40+
if isinstance(pdtype, pd.CategoricalDtype):
41+
if pdtype.categories is None:
42+
categories = None
43+
else:
44+
categories = list(pdtype.categories)
45+
return CategoricalDtypeType(categories=categories,
46+
ordered=pdtype.ordered)
47+
48+
raise NotImplementedError("%r cannot be represented as a Numba type"
49+
% (pdtype,))
50+
51+
52+
def as_dtype(nbtype):
53+
"""
54+
Return a Pandas *dtype* instance corresponding to the given Numba type.
55+
NotImplementedError is raised if no correspondence is known.
56+
"""
57+
nbtype = types.unliteral(nbtype)
58+
if isinstance(nbtype, CategoricalDtypeType):
59+
return pd.CategoricalDtype(categories=nbtype.categories,
60+
ordered=nbtype.ordered)
61+
62+
raise NotImplementedError("%r cannot be represented as a Pandas dtype"
63+
% (nbtype,))

0 commit comments

Comments
 (0)