Skip to content

Commit be21b2e

Browse files
committed
perf: verify for float numbers during tokenization
1 parent 10102e6 commit be21b2e

File tree

4 files changed

+93
-95
lines changed

4 files changed

+93
-95
lines changed

pandas/_libs/include/pandas/parser/pd_parser.h

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ typedef struct {
3737
int (*parser_trim_buffers)(parser_t *);
3838
int (*tokenize_all_rows)(parser_t *, const char *);
3939
int (*tokenize_nrows)(parser_t *, size_t, const char *);
40-
int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
41-
uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
40+
int64_t (*str_to_int64)(const char *, char, int64_t, int64_t, int *, char);
41+
uint64_t (*str_to_uint64)(uint_state *, const char *, char, int64_t, uint64_t,
4242
int *, char);
4343
double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
4444
double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
@@ -87,12 +87,14 @@ static PandasParser_CAPI *PandasParserAPI = NULL;
8787
PandasParserAPI->tokenize_all_rows((self), (encoding_errors))
8888
#define tokenize_nrows(self, nrows, encoding_errors) \
8989
PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors))
90-
#define str_to_int64(p_item, int_min, int_max, error, t_sep) \
91-
PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error), \
92-
(t_sep))
93-
#define str_to_uint64(state, p_item, int_max, uint_max, error, t_sep) \
94-
PandasParserAPI->str_to_uint64((state), (p_item), (int_max), (uint_max), \
95-
(error), (t_sep))
90+
#define str_to_int64(p_item, decimal_separator, int_min, int_max, error, \
91+
t_sep) \
92+
PandasParserAPI->str_to_int64((p_item), (decimal_separator), (int_min), \
93+
(int_max), (error), (t_sep))
94+
#define str_to_uint64(state, p_item, decimal_separator, int_max, uint_max, \
95+
error, t_sep) \
96+
PandasParserAPI->str_to_uint64((state), (p_item), (decimal_separator), \
97+
(int_max), (uint_max), (error), (t_sep))
9698
#define xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) \
9799
PandasParserAPI->xstrtod((p), (q), (decimal), (sci), (tsep), \
98100
(skip_trailing), (error), (maybe_int))

pandas/_libs/include/pandas/parser/tokenizer.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ See LICENSE for the license
1717
#define ERROR_NO_DIGITS 1
1818
#define ERROR_OVERFLOW 2
1919
#define ERROR_INVALID_CHARS 3
20+
#define ERROR_IS_FLOAT 4
2021

2122
#include <stdint.h>
2223

@@ -208,10 +209,11 @@ void uint_state_init(uint_state *self);
208209

209210
int uint64_conflict(uint_state *self);
210211

211-
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
212+
uint64_t str_to_uint64(uint_state *state, const char *p_item,
213+
char decimal_separator, int64_t int_max,
212214
uint64_t uint_max, int *error, char tsep);
213-
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
214-
int *error, char tsep);
215+
int64_t str_to_int64(const char *p_item, char decimal_separator,
216+
int64_t int_min, int64_t int_max, int *error, char tsep);
215217
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
216218
int skip_trailing, int *error, int *maybe_int);
217219
double precise_xstrtod(const char *p, char **q, char decimal, char sci,

pandas/_libs/parsers.pyx

Lines changed: 32 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ cdef extern from "pandas/parser/tokenizer.h":
149149
SKIP_LINE
150150
FINISHED
151151

152-
enum: ERROR_OVERFLOW
152+
enum: ERROR_OVERFLOW, ERROR_IS_FLOAT
153153

154154
ctypedef enum BadLineHandleMethod:
155155
ERROR,
@@ -281,10 +281,11 @@ cdef extern from "pandas/parser/pd_parser.h":
281281
int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
282282
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
283283

284-
int64_t str_to_int64(char *p_item, int64_t int_min,
284+
int64_t str_to_int64(char *p_item, char decimal_separator, int64_t int_min,
285285
int64_t int_max, int *error, char tsep) nogil
286-
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
287-
uint64_t uint_max, int *error, char tsep) nogil
286+
uint64_t str_to_uint64(uint_state *state, char *p_item, char decimal_separator,
287+
int64_t int_max, uint64_t uint_max,
288+
int *error, char tsep) nogil
288289

289290
double xstrtod(const char *p, char **q, char decimal,
290291
char sci, char tsep, int skip_trailing,
@@ -1070,21 +1071,28 @@ cdef class TextReader:
10701071
else:
10711072
col_res = None
10721073
for dt in self.dtype_cast_order:
1073-
if (dt.kind in "iu" and
1074-
self._column_has_float(i, start, end, na_filter, na_hashset)):
1075-
continue
1076-
10771074
try:
10781075
col_res, na_count = self._convert_with_dtype(
10791076
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
1080-
except ValueError:
1081-
# This error is raised from trying to convert to uint64,
1082-
# and we discover that we cannot convert to any numerical
1083-
# dtype successfully. As a result, we leave the data
1084-
# column AS IS with object dtype.
1085-
col_res, na_count = self._convert_with_dtype(
1086-
np.dtype("object"), i, start, end, 0,
1087-
0, na_hashset, na_fset)
1077+
except ValueError as e:
1078+
if str(e) == "Number is float":
1079+
try:
1080+
col_res, na_count = self._convert_with_dtype(
1081+
np.dtype("float64"), i, start, end, 0,
1082+
0, na_hashset, na_fset)
1083+
except ValueError:
1084+
col_res, na_count = self._convert_with_dtype(
1085+
np.dtype("object"), i, start, end, 0,
1086+
0, na_hashset, na_fset)
1087+
1088+
else:
1089+
# This error is raised from trying to convert to uint64,
1090+
# and we discover that we cannot convert to any numerical
1091+
# dtype successfully. As a result, we leave the data
1092+
# column AS IS with object dtype.
1093+
col_res, na_count = self._convert_with_dtype(
1094+
np.dtype("object"), i, start, end, 0,
1095+
0, na_hashset, na_fset)
10881096
except OverflowError:
10891097
try:
10901098
col_res, na_count = _try_pylong(self.parser, i, start,
@@ -1351,59 +1359,6 @@ cdef class TextReader:
13511359
else:
13521360
return None
13531361

1354-
cdef bint _column_has_float(self, Py_ssize_t col,
1355-
int64_t start, int64_t end,
1356-
bint na_filter, kh_str_starts_t *na_hashset):
1357-
"""Check if the column contains any float number."""
1358-
cdef:
1359-
Py_ssize_t i, j, lines = end - start
1360-
coliter_t it
1361-
const char *word = NULL
1362-
const char *ignored_chars = " +-"
1363-
const char *digits = "0123456789"
1364-
const char *float_indicating_chars = "eE"
1365-
char null_byte = 0
1366-
1367-
coliter_setup(&it, self.parser, col, start)
1368-
1369-
for i in range(lines):
1370-
COLITER_NEXT(it, word)
1371-
1372-
if na_filter and kh_get_str_starts_item(na_hashset, word):
1373-
continue
1374-
1375-
found_first_digit = False
1376-
j = 0
1377-
while word[j] != null_byte:
1378-
if word[j] == self.parser.decimal:
1379-
return True
1380-
elif not found_first_digit and word[j] in ignored_chars:
1381-
# no-op
1382-
pass
1383-
elif not found_first_digit and word[j] not in digits:
1384-
# word isn't numeric
1385-
return False
1386-
elif not found_first_digit and word[j] in digits:
1387-
found_first_digit = True
1388-
elif word[j] in float_indicating_chars:
1389-
# preceding chars indicates numeric and
1390-
# current char indicates float
1391-
return True
1392-
elif word[j] not in digits:
1393-
# previous characters indicates numeric
1394-
# current character shows otherwise
1395-
return False
1396-
elif word[j] in digits:
1397-
# no-op
1398-
pass
1399-
else:
1400-
raise AssertionError(
1401-
f"Unhandled case {word[j]=} {found_first_digit=}"
1402-
)
1403-
j += 1
1404-
1405-
return False
1406-
14071362
# Factor out code common to TextReader.__dealloc__ and TextReader.close
14081363
# It cannot be a class method, since calling self.close() in __dealloc__
14091364
# which causes a class attribute lookup and violates best practices
@@ -1822,6 +1777,8 @@ cdef _try_uint64(parser_t *parser, int64_t col,
18221777
if error == ERROR_OVERFLOW:
18231778
# Can't get the word variable
18241779
raise OverflowError("Overflow")
1780+
elif error == ERROR_IS_FLOAT:
1781+
raise ValueError("Number is float")
18251782
return None
18261783

18271784
if uint64_conflict(&state):
@@ -1855,14 +1812,14 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
18551812
data[i] = 0
18561813
continue
18571814

1858-
data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
1815+
data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX,
18591816
&error, parser.thousands)
18601817
if error != 0:
18611818
return error
18621819
else:
18631820
for i in range(lines):
18641821
COLITER_NEXT(it, word)
1865-
data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
1822+
data[i] = str_to_uint64(state, word, parser.decimal, INT64_MAX, UINT64_MAX,
18661823
&error, parser.thousands)
18671824
if error != 0:
18681825
return error
@@ -1892,6 +1849,8 @@ cdef _try_int64(parser_t *parser, int64_t col,
18921849
if error == ERROR_OVERFLOW:
18931850
# Can't get the word variable
18941851
raise OverflowError("Overflow")
1852+
elif error == ERROR_IS_FLOAT:
1853+
raise ValueError("Number is float")
18951854
return None, None
18961855

18971856
return result, na_count
@@ -1920,14 +1879,14 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
19201879
data[i] = NA
19211880
continue
19221881

1923-
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
1882+
data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX,
19241883
&error, parser.thousands)
19251884
if error != 0:
19261885
return error
19271886
else:
19281887
for i in range(lines):
19291888
COLITER_NEXT(it, word)
1930-
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
1889+
data[i] = str_to_int64(word, parser.decimal, INT64_MIN, INT64_MAX,
19311890
&error, parser.thousands)
19321891
if error != 0:
19331892
return error

pandas/_libs/src/parser/tokenizer.c

Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1834,8 +1834,8 @@ int uint64_conflict(uint_state *self) {
18341834
return self->seen_uint && (self->seen_sint || self->seen_null);
18351835
}
18361836

1837-
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
1838-
int *error, char tsep) {
1837+
int64_t str_to_int64(const char *p_item, char decimal_separator,
1838+
int64_t int_min, int64_t int_max, int *error, char tsep) {
18391839
const char *p = p_item;
18401840
// Skip leading spaces.
18411841
while (isspace_ascii(*p)) {
@@ -1879,7 +1879,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18791879
d = *++p;
18801880
} else {
18811881
*error = ERROR_OVERFLOW;
1882-
return 0;
1882+
break;
18831883
}
18841884
}
18851885
} else {
@@ -1890,7 +1890,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18901890
d = *++p;
18911891
} else {
18921892
*error = ERROR_OVERFLOW;
1893-
return 0;
1893+
break;
18941894
}
18951895
}
18961896
}
@@ -1917,7 +1917,7 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
19171917

19181918
} else {
19191919
*error = ERROR_OVERFLOW;
1920-
return 0;
1920+
break;
19211921
}
19221922
}
19231923
} else {
@@ -1929,28 +1929,46 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
19291929

19301930
} else {
19311931
*error = ERROR_OVERFLOW;
1932-
return 0;
1932+
break;
19331933
}
19341934
}
19351935
}
19361936
}
19371937

1938+
if (*error == ERROR_OVERFLOW) {
1939+
// advance digits
1940+
while (*p != '\0' && isdigit_ascii(*p)) {
1941+
p++;
1942+
}
1943+
1944+
// check whether the value is a float (decimal separator or exponent marker follows)
1945+
if (*p == decimal_separator || *p == 'e' || *p == 'E') {
1946+
*error = ERROR_IS_FLOAT;
1947+
}
1948+
return 0;
1949+
}
1950+
19381951
// Skip trailing spaces.
19391952
while (isspace_ascii(*p)) {
19401953
++p;
19411954
}
19421955

19431956
// Did we use up all the characters?
19441957
if (*p) {
1945-
*error = ERROR_INVALID_CHARS;
1958+
if (*p == decimal_separator || *p == 'e' || *p == 'E') {
1959+
*error = ERROR_IS_FLOAT;
1960+
} else {
1961+
*error = ERROR_INVALID_CHARS;
1962+
}
19461963
return 0;
19471964
}
19481965

19491966
*error = 0;
19501967
return number;
19511968
}
19521969

1953-
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
1970+
uint64_t str_to_uint64(uint_state *state, const char *p_item,
1971+
char decimal_separator, int64_t int_max,
19541972
uint64_t uint_max, int *error, char tsep) {
19551973
const char *p = p_item;
19561974
// Skip leading spaces.
@@ -1997,7 +2015,7 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
19972015

19982016
} else {
19992017
*error = ERROR_OVERFLOW;
2000-
return 0;
2018+
break;
20012019
}
20022020
}
20032021
} else {
@@ -2009,19 +2027,36 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
20092027

20102028
} else {
20112029
*error = ERROR_OVERFLOW;
2012-
return 0;
2030+
break;
20132031
}
20142032
}
20152033
}
20162034

2035+
if (*error == ERROR_OVERFLOW) {
2036+
// advance digits
2037+
while (*p != '\0' && isdigit_ascii(*p)) {
2038+
p++;
2039+
}
2040+
2041+
// check whether the value is a float (decimal separator or exponent marker follows)
2042+
if (*p == decimal_separator || *p == 'e' || *p == 'E') {
2043+
*error = ERROR_IS_FLOAT;
2044+
}
2045+
return 0;
2046+
}
2047+
20172048
// Skip trailing spaces.
20182049
while (isspace_ascii(*p)) {
20192050
++p;
20202051
}
20212052

20222053
// Did we use up all the characters?
20232054
if (*p) {
2024-
*error = ERROR_INVALID_CHARS;
2055+
if (*p == decimal_separator || *p == 'e' || *p == 'E') {
2056+
*error = ERROR_IS_FLOAT;
2057+
} else {
2058+
*error = ERROR_INVALID_CHARS;
2059+
}
20252060
return 0;
20262061
}
20272062

0 commit comments

Comments
 (0)