Skip to content

Commit c25ab5b

Browse files
implement fully checked kh_pymap interface
1 parent: a7994c8 · commit: c25ab5b

File tree

6 files changed

+138
-71
lines changed

6 files changed

+138
-71
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 12 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,6 @@ Template for each `dtype` helper function for hashtable
44
WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
55
"""
66
from cpython.unicode cimport PyUnicode_AsUTF8
7-
from cpython.exc cimport PyErr_Occurred, PyErr_Fetch
8-
from cpython.ref cimport Py_XDECREF
97

108
{{py:
119

@@ -75,7 +73,7 @@ cimported_types = ['complex64',
7573
'int16',
7674
'int32',
7775
'int64',
78-
'pymap',
76+
'pymap_checked',
7977
'str',
8078
'strbox',
8179
'uint8',
@@ -1311,32 +1309,16 @@ cdef class StringHashTable(HashTable):
13111309
return labels
13121310

13131311

1314-
cdef raise_if_errors():
1315-
cdef:
1316-
object exc
1317-
PyObject *type
1318-
PyObject *value
1319-
PyObject *traceback
1320-
1321-
PyErr_Fetch(&type, &value, &traceback)
1322-
if value != NULL:
1323-
exc = <object>value
1324-
Py_XDECREF(value)
1325-
Py_XDECREF(type)
1326-
Py_XDECREF(traceback)
1327-
raise exc
1328-
1329-
13301312
cdef class PyObjectHashTable(HashTable):
13311313

13321314
def __init__(self, int64_t size_hint=1):
1333-
self.table = kh_init_pymap()
1315+
self.table = kh_init_pymap_checked()
13341316
size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
1335-
kh_resize_pymap(self.table, size_hint)
1317+
kh_resize_pymap_checked(self.table, size_hint)
13361318

13371319
def __dealloc__(self):
13381320
if self.table is not NULL:
1339-
kh_destroy_pymap(self.table)
1321+
kh_destroy_pymap_checked(self.table)
13401322
self.table = NULL
13411323

13421324
def __len__(self) -> int:
@@ -1347,7 +1329,7 @@ cdef class PyObjectHashTable(HashTable):
13471329
khiter_t k
13481330
hash(key)
13491331

1350-
k = kh_get_pymap(self.table, <PyObject*>key)
1332+
k = kh_get_pymap_checked(self.table, <PyObject*>key)
13511333
return k != self.table.n_buckets
13521334

13531335
def sizeof(self, deep: bool = False) -> int:
@@ -1374,8 +1356,7 @@ cdef class PyObjectHashTable(HashTable):
13741356
cdef:
13751357
khiter_t k
13761358

1377-
k = kh_get_pymap(self.table, <PyObject*>val)
1378-
raise_if_errors()
1359+
k = kh_get_pymap_checked(self.table, <PyObject*>val)
13791360
if k != self.table.n_buckets:
13801361
return self.table.vals[k]
13811362
else:
@@ -1389,9 +1370,8 @@ cdef class PyObjectHashTable(HashTable):
13891370

13901371
hash(key)
13911372

1392-
k = kh_put_pymap(self.table, <PyObject*>key, &ret)
1393-
raise_if_errors()
1394-
if kh_exist_pymap(self.table, k):
1373+
k = kh_put_pymap_checked(self.table, <PyObject*>key, &ret)
1374+
if kh_exist_pymap_checked(self.table, k):
13951375
self.table.vals[k] = val
13961376
else:
13971377
raise KeyError(key)
@@ -1408,8 +1388,7 @@ cdef class PyObjectHashTable(HashTable):
14081388
val = values[i]
14091389
hash(val)
14101390

1411-
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
1412-
raise_if_errors()
1391+
k = kh_put_pymap_checked(self.table, <PyObject*>val, &ret)
14131392
self.table.vals[k] = i
14141393

14151394
def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
@@ -1426,7 +1405,7 @@ cdef class PyObjectHashTable(HashTable):
14261405
val = values[i]
14271406
hash(val)
14281407

1429-
k = kh_get_pymap(self.table, <PyObject*>val)
1408+
k = kh_get_pymap_checked(self.table, <PyObject*>val)
14301409
if k != self.table.n_buckets:
14311410
locs[i] = self.table.vals[k]
14321411
else:
@@ -1504,10 +1483,10 @@ cdef class PyObjectHashTable(HashTable):
15041483
labels[i] = na_sentinel
15051484
continue
15061485

1507-
k = kh_get_pymap(self.table, <PyObject*>val)
1486+
k = kh_get_pymap_checked(self.table, <PyObject*>val)
15081487
if k == self.table.n_buckets:
15091488
# k hasn't been seen yet
1510-
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
1489+
k = kh_put_pymap_checked(self.table, <PyObject*>val, &ret)
15111490
uniques.append(val)
15121491
if return_inverse:
15131492
self.table.vals[k] = count

pandas/_libs/hashtable_func_helper.pxi.in

Lines changed: 36 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -6,26 +6,24 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
66

77
{{py:
88

9-
# name, dtype, ttype, c_type, to_c_type
10-
dtypes = [('Complex128', 'complex128', 'complex128',
11-
'khcomplex128_t', 'to_khcomplex128_t'),
12-
('Complex64', 'complex64', 'complex64',
13-
'khcomplex64_t', 'to_khcomplex64_t'),
14-
('Float64', 'float64', 'float64', 'float64_t', ''),
15-
('Float32', 'float32', 'float32', 'float32_t', ''),
16-
('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
17-
('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
18-
('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
19-
('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
20-
('Object', 'object', 'pymap', 'object', '<PyObject*>'),
21-
('Int64', 'int64', 'int64', 'int64_t', ''),
22-
('Int32', 'int32', 'int32', 'int32_t', ''),
23-
('Int16', 'int16', 'int16', 'int16_t', ''),
24-
('Int8', 'int8', 'int8', 'int8_t', '')]
9+
# name, dtype, ttype, tfunc_type, c_type, to_c_type
10+
dtypes = [('Complex128', 'complex128', 'complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
11+
('Complex64', 'complex64', 'complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
12+
('Float64', 'float64', 'float64', 'float64', 'float64_t', ''),
13+
('Float32', 'float32', 'float32', 'float32', 'float32_t', ''),
14+
('UInt64', 'uint64', 'uint64', 'uint64', 'uint64_t', ''),
15+
('UInt32', 'uint32', 'uint32', 'uint32', 'uint32_t', ''),
16+
('UInt16', 'uint16', 'uint16', 'uint16', 'uint16_t', ''),
17+
('UInt8', 'uint8', 'uint8', 'uint8', 'uint8_t', ''),
18+
('Object', 'object', 'pymap', 'pymap_checked', 'object', '<PyObject*>'),
19+
('Int64', 'int64', 'int64', 'int64', 'int64_t', ''),
20+
('Int32', 'int32', 'int32', 'int32', 'int32_t', ''),
21+
('Int16', 'int16', 'int16', 'int16', 'int16_t', ''),
22+
('Int8', 'int8', 'int8', 'int8', 'int8_t', '')]
2523

2624
}}
2725

28-
{{for name, dtype, ttype, c_type, to_c_type in dtypes}}
26+
{{for name, dtype, ttype, tfunc_type, c_type, to_c_type in dtypes}}
2927

3028

3129
@cython.wraparound(False)
@@ -55,26 +53,26 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
5553
# result_keys remembers the original order of keys
5654

5755
result_keys = {{name}}Vector()
58-
table = kh_init_{{ttype}}()
56+
table = kh_init_{{tfunc_type}}()
5957

6058
{{if dtype == 'object'}}
6159
if uses_mask:
6260
raise NotImplementedError("uses_mask not implemented with object dtype")
6361

64-
kh_resize_{{ttype}}(table, n // 10)
62+
kh_resize_{{tfunc_type}}(table, n // 10)
6563

6664
for i in range(n):
6765
val = values[i]
6866
if not dropna or not checknull(val):
69-
k = kh_get_{{ttype}}(table, {{to_c_type}}val)
67+
k = kh_get_{{tfunc_type}}(table, {{to_c_type}}val)
7068
if k != table.n_buckets:
7169
table.vals[k] += 1
7270
else:
73-
k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret)
71+
k = kh_put_{{tfunc_type}}(table, {{to_c_type}}val, &ret)
7472
table.vals[k] = 1
7573
result_keys.append(val)
7674
{{else}}
77-
kh_resize_{{ttype}}(table, n)
75+
kh_resize_{{tfunc_type}}(table, n)
7876

7977
for i in range(n):
8078
val = {{to_c_type}}(values[i])
@@ -90,11 +88,11 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
9088
if uses_mask and isna_entry:
9189
na_counter += 1
9290
else:
93-
k = kh_get_{{ttype}}(table, val)
91+
k = kh_get_{{tfunc_type}}(table, val)
9492
if k != table.n_buckets:
9593
table.vals[k] += 1
9694
else:
97-
k = kh_put_{{ttype}}(table, val, &ret)
95+
k = kh_put_{{tfunc_type}}(table, val, &ret)
9896
table.vals[k] = 1
9997
result_keys.append(val)
10098
{{endif}}
@@ -107,17 +105,17 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
107105

108106
for i in range(table.size):
109107
{{if dtype == 'object'}}
110-
k = kh_get_{{ttype}}(table, result_keys.data[i])
108+
k = kh_get_{{tfunc_type}}(table, result_keys.data[i])
111109
{{else}}
112-
k = kh_get_{{ttype}}(table, result_keys.data.data[i])
110+
k = kh_get_{{tfunc_type}}(table, result_keys.data.data[i])
113111
{{endif}}
114112
result_counts[i] = table.vals[k]
115113

116114
if na_counter > 0:
117115
result_counts[table.size] = na_counter
118116
result_keys.append(val)
119117

120-
kh_destroy_{{ttype}}(table)
118+
kh_destroy_{{tfunc_type}}(table)
121119

122120
return result_keys.to_array(), result_counts.base, na_counter
123121

@@ -138,12 +136,12 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
138136
{{endif}}
139137
Py_ssize_t i, n = len(values), first_na = -1
140138
khiter_t k
141-
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
139+
kh_{{ttype}}_t *table = kh_init_{{tfunc_type}}()
142140
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
143141
bint seen_na = False, uses_mask = mask is not None
144142
bint seen_multiple_na = False
145143

146-
kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
144+
kh_resize_{{tfunc_type}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))
147145

148146
if keep not in ('last', 'first', False):
149147
raise ValueError('keep must be either "first", "last" or False')
@@ -168,7 +166,7 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
168166
seen_na = True
169167
else:
170168
value = {{to_c_type}}(values[i])
171-
kh_put_{{ttype}}(table, value, &ret)
169+
kh_put_{{tfunc_type}}(table, value, &ret)
172170
out[i] = ret == 0
173171
{{endfor}}
174172

@@ -193,16 +191,16 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons
193191

194192
else:
195193
value = {{to_c_type}}(values[i])
196-
k = kh_get_{{ttype}}(table, value)
194+
k = kh_get_{{tfunc_type}}(table, value)
197195
if k != table.n_buckets:
198196
out[table.vals[k]] = 1
199197
out[i] = 1
200198
else:
201-
k = kh_put_{{ttype}}(table, value, &ret)
199+
k = kh_put_{{tfunc_type}}(table, value, &ret)
202200
table.vals[k] = i
203201
out[i] = 0
204202

205-
kh_destroy_{{ttype}}(table)
203+
kh_destroy_{{tfunc_type}}(table)
206204
return out
207205

208206

@@ -243,11 +241,11 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
243241
{{c_type}} val
244242
{{endif}}
245243

246-
kh_{{ttype}}_t *table = kh_init_{{ttype}}()
244+
kh_{{ttype}}_t *table = kh_init_{{tfunc_type}}()
247245

248246
# construct the table
249247
n = len(values)
250-
kh_resize_{{ttype}}(table, n)
248+
kh_resize_{{tfunc_type}}(table, n)
251249

252250
{{if dtype == 'object'}}
253251
if True:
@@ -256,7 +254,7 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
256254
{{endif}}
257255
for i in range(n):
258256
val = {{to_c_type}}(values[i])
259-
kh_put_{{ttype}}(table, val, &ret)
257+
kh_put_{{tfunc_type}}(table, val, &ret)
260258

261259
# test membership
262260
n = len(arr)
@@ -269,10 +267,10 @@ cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
269267
{{endif}}
270268
for i in range(n):
271269
val = {{to_c_type}}(arr[i])
272-
k = kh_get_{{ttype}}(table, val)
270+
k = kh_get_{{tfunc_type}}(table, val)
273271
result[i] = (k != table.n_buckets)
274272

275-
kh_destroy_{{ttype}}(table)
273+
kh_destroy_{{tfunc_type}}(table)
276274
return result.view(np.bool_)
277275

278276
# ----------------------------------------------------------------------

pandas/_libs/include/pandas/vendored/klib/khash_python.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -218,6 +218,9 @@ static inline int pyobject_cmp(PyObject *a, PyObject *b) {
218218

219219
int result = PyObject_RichCompareBool(a, b, Py_EQ);
220220
if (result < 0) {
221+
if (PyErr_Occurred() != NULL) {
222+
return 0;
223+
}
221224
return 0;
222225
}
223226
return result;
@@ -320,6 +323,9 @@ static inline khuint32_t kh_python_hash_func(PyObject *key) {
320323
}
321324

322325
if (hash == -1) {
326+
if (PyErr_Occurred() != NULL) {
327+
return 0;
328+
}
323329
return 0;
324330
}
325331
#if SIZEOF_PY_HASH_T == 4

pandas/_libs/khash.pxd

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -125,5 +125,13 @@ cdef extern from "pandas/vendored/klib/khash_python.h":
125125

126126
khuint_t kh_needed_n_buckets(khuint_t element_n) nogil
127127

128+
cdef kh_pymap_t* kh_init_pymap_checked()
129+
cdef void kh_destroy_pymap_checked(kh_pymap_t*)
130+
cdef void kh_clear_pymap_checked(kh_pymap_t*)
131+
cdef khuint_t kh_get_pymap_checked(kh_pymap_t*, PyObject*)
132+
cdef void kh_resize_pymap_checked(kh_pymap_t*, khuint_t)
133+
cdef khuint_t kh_put_pymap_checked(kh_pymap_t*, PyObject*, int*)
134+
cdef void kh_del_pymap_checked(kh_pymap_t*, khuint_t)
135+
cdef bint kh_exist_pymap_checked(kh_pymap_t*, khiter_t)
128136

129137
include "khash_for_primitive_helper.pxi"

0 commit comments

Comments
 (0)