Skip to content

Commit 56a329a

Browse files
committed
Merge remote-tracking branch 'upstream/main' into aijams-take-function-invalid-dtype
2 parents 8bacb94 + e95948f commit 56a329a

File tree

30 files changed

+338
-240
lines changed

30 files changed

+338
-240
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,7 @@ Other API changes
655655
an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining
656656
the dtype of the resulting Index (:issue:`60797`)
657657
- :class:`IncompatibleFrequency` now subclasses ``TypeError`` instead of ``ValueError``. As a result, joins with mismatched frequencies now cast to object like other non-comparable joins, and arithmetic with indexes with mismatched frequencies align (:issue:`55782`)
658+
- :class:`Series` "flex" methods like :meth:`Series.add` no longer allow passing a :class:`DataFrame` for ``other``; use the DataFrame reversed method instead (:issue:`46179`)
658659
- :meth:`CategoricalIndex.append` no longer attempts to cast different-dtype indexes to the caller's dtype (:issue:`41626`)
659660
- :meth:`ExtensionDtype.construct_array_type` is now a regular method instead of a ``classmethod`` (:issue:`58663`)
660661
- Comparison operations between :class:`Index` and :class:`Series` now consistently return :class:`Series` regardless of which object is on the left or right (:issue:`36759`)
@@ -874,6 +875,7 @@ Other Removals
874875
- Removed the ``method`` keyword in ``ExtensionArray.fillna``, implement ``ExtensionArray._pad_or_backfill`` instead (:issue:`53621`)
875876
- Removed the attribute ``dtypes`` from :class:`.DataFrameGroupBy` (:issue:`51997`)
876877
- Enforced deprecation of ``argmin``, ``argmax``, ``idxmin``, and ``idxmax`` returning a result when ``skipna=False`` and an NA value is encountered or all values are NA values; these operations will now raise in such cases (:issue:`33941`, :issue:`51276`)
878+
- Enforced deprecation of storage option "pyarrow_numpy" for :class:`StringDtype` (:issue:`60152`)
877879
- Removed specifying ``include_groups=True`` in :class:`.DataFrameGroupBy.apply` and :class:`.Resampler.apply` (:issue:`7155`)
878880

879881
.. ---------------------------------------------------------------------------

pandas/_libs/include/pandas/parser/pd_parser.h

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,8 @@ typedef struct {
3737
int (*parser_trim_buffers)(parser_t *);
3838
int (*tokenize_all_rows)(parser_t *, const char *);
3939
int (*tokenize_nrows)(parser_t *, size_t, const char *);
40-
int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
41-
uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
42-
int *, char);
40+
int64_t (*str_to_int64)(const char *, int *, char);
41+
uint64_t (*str_to_uint64)(uint_state *, const char *, int *, char);
4342
double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
4443
double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
4544
int *);
@@ -87,12 +86,10 @@ static PandasParser_CAPI *PandasParserAPI = NULL;
8786
PandasParserAPI->tokenize_all_rows((self), (encoding_errors))
8887
#define tokenize_nrows(self, nrows, encoding_errors) \
8988
PandasParserAPI->tokenize_nrows((self), (nrows), (encoding_errors))
90-
#define str_to_int64(p_item, int_min, int_max, error, t_sep) \
91-
PandasParserAPI->str_to_int64((p_item), (int_min), (int_max), (error), \
92-
(t_sep))
93-
#define str_to_uint64(state, p_item, int_max, uint_max, error, t_sep) \
94-
PandasParserAPI->str_to_uint64((state), (p_item), (int_max), (uint_max), \
95-
(error), (t_sep))
89+
#define str_to_int64(p_item, error, t_sep) \
90+
PandasParserAPI->str_to_int64((p_item), (error), (t_sep))
91+
#define str_to_uint64(state, p_item, error, t_sep) \
92+
PandasParserAPI->str_to_uint64((state), (p_item), (error), (t_sep))
9693
#define xstrtod(p, q, decimal, sci, tsep, skip_trailing, error, maybe_int) \
9794
PandasParserAPI->xstrtod((p), (q), (decimal), (sci), (tsep), \
9895
(skip_trailing), (error), (maybe_int))

pandas/_libs/include/pandas/parser/tokenizer.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,10 +208,9 @@ void uint_state_init(uint_state *self);
208208

209209
int uint64_conflict(uint_state *self);
210210

211-
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
212-
uint64_t uint_max, int *error, char tsep);
213-
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
214-
int *error, char tsep);
211+
uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error,
212+
char tsep);
213+
int64_t str_to_int64(const char *p_item, int *error, char tsep);
215214
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
216215
int skip_trailing, int *error, int *maybe_int);
217216
double precise_xstrtod(const char *p, char **q, char decimal, char sci,

pandas/_libs/parsers.pyx

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,6 @@ from numpy cimport (
6363
cnp.import_array()
6464

6565
from pandas._libs cimport util
66-
from pandas._libs.util cimport (
67-
INT64_MAX,
68-
INT64_MIN,
69-
UINT64_MAX,
70-
)
7166

7267
from pandas._libs import lib
7368

@@ -281,10 +276,8 @@ cdef extern from "pandas/parser/pd_parser.h":
281276
int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
282277
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
283278

284-
int64_t str_to_int64(char *p_item, int64_t int_min,
285-
int64_t int_max, int *error, char tsep) nogil
286-
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
287-
uint64_t uint_max, int *error, char tsep) nogil
279+
int64_t str_to_int64(char *p_item, int *error, char tsep) nogil
280+
uint64_t str_to_uint64(uint_state *state, char *p_item, int *error, char tsep) nogil
288281

289282
double xstrtod(const char *p, char **q, char decimal,
290283
char sci, char tsep, int skip_trailing,
@@ -1855,15 +1848,13 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
18551848
data[i] = 0
18561849
continue
18571850

1858-
data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
1859-
&error, parser.thousands)
1851+
data[i] = str_to_uint64(state, word, &error, parser.thousands)
18601852
if error != 0:
18611853
return error
18621854
else:
18631855
for i in range(lines):
18641856
COLITER_NEXT(it, word)
1865-
data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
1866-
&error, parser.thousands)
1857+
data[i] = str_to_uint64(state, word, &error, parser.thousands)
18671858
if error != 0:
18681859
return error
18691860

@@ -1920,15 +1911,13 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,
19201911
data[i] = NA
19211912
continue
19221913

1923-
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
1924-
&error, parser.thousands)
1914+
data[i] = str_to_int64(word, &error, parser.thousands)
19251915
if error != 0:
19261916
return error
19271917
else:
19281918
for i in range(lines):
19291919
COLITER_NEXT(it, word)
1930-
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
1931-
&error, parser.thousands)
1920+
data[i] = str_to_int64(word, &error, parser.thousands)
19321921
if error != 0:
19331922
return error
19341923

pandas/_libs/src/parser/tokenizer.c

Lines changed: 84 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,15 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
2323
#include <float.h>
2424
#include <math.h>
2525
#include <stdbool.h>
26+
#include <stdlib.h>
2627

2728
#include "pandas/portable.h"
2829
#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
2930

31+
// Arrow's Decimal256 type allows up to 76 decimal digits.
32+
// We rounded up to the next power of 2.
33+
#define PROCESSED_WORD_CAPACITY 128
34+
3035
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
3136
int64_t start) {
3237
// column i, starting at 0
@@ -1834,114 +1839,86 @@ int uint64_conflict(uint_state *self) {
18341839
return self->seen_uint && (self->seen_sint || self->seen_null);
18351840
}
18361841

1837-
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
1838-
int *error, char tsep) {
1842+
/* Copy a string without `char_to_remove` into `output`.
1843+
*/
1844+
static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
1845+
const char *str, size_t str_len,
1846+
char char_to_remove) {
1847+
const char *left = str;
1848+
const char *end_ptr = str + str_len;
1849+
size_t bytes_written = 0;
1850+
1851+
while (left < end_ptr) {
1852+
const size_t remaining_bytes_to_read = end_ptr - left;
1853+
const char *right = memchr(left, char_to_remove, remaining_bytes_to_read);
1854+
1855+
if (!right) {
1856+
// If it doesn't find the char to remove, just copy until EOS.
1857+
right = end_ptr;
1858+
}
1859+
1860+
const size_t chunk_size = right - left;
1861+
1862+
if (chunk_size + bytes_written >= PROCESSED_WORD_CAPACITY) {
1863+
return -1;
1864+
}
1865+
memcpy(&output[bytes_written], left, chunk_size);
1866+
bytes_written += chunk_size;
1867+
1868+
left = right + 1;
1869+
}
1870+
1871+
output[bytes_written] = '\0';
1872+
return 0;
1873+
}
1874+
1875+
int64_t str_to_int64(const char *p_item, int *error, char tsep) {
18391876
const char *p = p_item;
18401877
// Skip leading spaces.
18411878
while (isspace_ascii(*p)) {
18421879
++p;
18431880
}
18441881

18451882
// Handle sign.
1846-
const bool isneg = *p == '-' ? true : false;
1883+
const bool has_sign = *p == '-' || *p == '+';
18471884
// Handle sign.
1848-
if (isneg || (*p == '+')) {
1849-
p++;
1850-
}
1885+
const char *digit_start = has_sign ? p + 1 : p;
18511886

18521887
// Check that there is a first digit.
1853-
if (!isdigit_ascii(*p)) {
1888+
if (!isdigit_ascii(*digit_start)) {
18541889
// Error...
18551890
*error = ERROR_NO_DIGITS;
18561891
return 0;
18571892
}
18581893

1859-
int64_t number = 0;
1860-
if (isneg) {
1861-
// If number is greater than pre_min, at least one more digit
1862-
// can be processed without overflowing.
1863-
int dig_pre_min = -(int_min % 10);
1864-
int64_t pre_min = int_min / 10;
1865-
1866-
// Process the digits.
1867-
char d = *p;
1868-
if (tsep != '\0') {
1869-
while (1) {
1870-
if (d == tsep) {
1871-
d = *++p;
1872-
continue;
1873-
} else if (!isdigit_ascii(d)) {
1874-
break;
1875-
}
1876-
if ((number > pre_min) ||
1877-
((number == pre_min) && (d - '0' <= dig_pre_min))) {
1878-
number = number * 10 - (d - '0');
1879-
d = *++p;
1880-
} else {
1881-
*error = ERROR_OVERFLOW;
1882-
return 0;
1883-
}
1884-
}
1885-
} else {
1886-
while (isdigit_ascii(d)) {
1887-
if ((number > pre_min) ||
1888-
((number == pre_min) && (d - '0' <= dig_pre_min))) {
1889-
number = number * 10 - (d - '0');
1890-
d = *++p;
1891-
} else {
1892-
*error = ERROR_OVERFLOW;
1893-
return 0;
1894-
}
1895-
}
1894+
char buffer[PROCESSED_WORD_CAPACITY];
1895+
const size_t str_len = strlen(p);
1896+
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
1897+
const int status = copy_string_without_char(buffer, p, str_len, tsep);
1898+
if (status != 0) {
1899+
// Word is too big, probably will cause an overflow
1900+
*error = ERROR_OVERFLOW;
1901+
return 0;
18961902
}
1897-
} else {
1898-
// If number is less than pre_max, at least one more digit
1899-
// can be processed without overflowing.
1900-
int64_t pre_max = int_max / 10;
1901-
int dig_pre_max = int_max % 10;
1902-
1903-
// Process the digits.
1904-
char d = *p;
1905-
if (tsep != '\0') {
1906-
while (1) {
1907-
if (d == tsep) {
1908-
d = *++p;
1909-
continue;
1910-
} else if (!isdigit_ascii(d)) {
1911-
break;
1912-
}
1913-
if ((number < pre_max) ||
1914-
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1915-
number = number * 10 + (d - '0');
1916-
d = *++p;
1903+
p = buffer;
1904+
}
19171905

1918-
} else {
1919-
*error = ERROR_OVERFLOW;
1920-
return 0;
1921-
}
1922-
}
1923-
} else {
1924-
while (isdigit_ascii(d)) {
1925-
if ((number < pre_max) ||
1926-
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1927-
number = number * 10 + (d - '0');
1928-
d = *++p;
1906+
char *endptr;
1907+
int64_t number = strtoll(p, &endptr, 10);
19291908

1930-
} else {
1931-
*error = ERROR_OVERFLOW;
1932-
return 0;
1933-
}
1934-
}
1935-
}
1909+
if (errno == ERANGE) {
1910+
*error = ERROR_OVERFLOW;
1911+
errno = 0;
1912+
return 0;
19361913
}
19371914

19381915
// Skip trailing spaces.
1939-
while (isspace_ascii(*p)) {
1940-
++p;
1916+
while (isspace_ascii(*endptr)) {
1917+
++endptr;
19411918
}
19421919

19431920
// Did we use up all the characters?
1944-
if (*p) {
1921+
if (*endptr) {
19451922
*error = ERROR_INVALID_CHARS;
19461923
return 0;
19471924
}
@@ -1950,8 +1927,8 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
19501927
return number;
19511928
}
19521929

1953-
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
1954-
uint64_t uint_max, int *error, char tsep) {
1930+
uint64_t str_to_uint64(uint_state *state, const char *p_item, int *error,
1931+
char tsep) {
19551932
const char *p = p_item;
19561933
// Skip leading spaces.
19571934
while (isspace_ascii(*p)) {
@@ -1974,58 +1951,39 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
19741951
return 0;
19751952
}
19761953

1977-
// If number is less than pre_max, at least one more digit
1978-
// can be processed without overflowing.
1979-
//
1980-
// Process the digits.
1981-
uint64_t number = 0;
1982-
const uint64_t pre_max = uint_max / 10;
1983-
const uint64_t dig_pre_max = uint_max % 10;
1984-
char d = *p;
1985-
if (tsep != '\0') {
1986-
while (1) {
1987-
if (d == tsep) {
1988-
d = *++p;
1989-
continue;
1990-
} else if (!isdigit_ascii(d)) {
1991-
break;
1992-
}
1993-
if ((number < pre_max) ||
1994-
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
1995-
number = number * 10 + (d - '0');
1996-
d = *++p;
1997-
1998-
} else {
1999-
*error = ERROR_OVERFLOW;
2000-
return 0;
2001-
}
1954+
char buffer[PROCESSED_WORD_CAPACITY];
1955+
const size_t str_len = strlen(p);
1956+
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
1957+
const int status = copy_string_without_char(buffer, p, str_len, tsep);
1958+
if (status != 0) {
1959+
// Word is too big, probably will cause an overflow
1960+
*error = ERROR_OVERFLOW;
1961+
return 0;
20021962
}
2003-
} else {
2004-
while (isdigit_ascii(d)) {
2005-
if ((number < pre_max) ||
2006-
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
2007-
number = number * 10 + (d - '0');
2008-
d = *++p;
1963+
p = buffer;
1964+
}
20091965

2010-
} else {
2011-
*error = ERROR_OVERFLOW;
2012-
return 0;
2013-
}
2014-
}
1966+
char *endptr;
1967+
uint64_t number = strtoull(p, &endptr, 10);
1968+
1969+
if (errno == ERANGE) {
1970+
*error = ERROR_OVERFLOW;
1971+
errno = 0;
1972+
return 0;
20151973
}
20161974

20171975
// Skip trailing spaces.
2018-
while (isspace_ascii(*p)) {
2019-
++p;
1976+
while (isspace_ascii(*endptr)) {
1977+
++endptr;
20201978
}
20211979

20221980
// Did we use up all the characters?
2023-
if (*p) {
1981+
if (*endptr) {
20241982
*error = ERROR_INVALID_CHARS;
20251983
return 0;
20261984
}
20271985

2028-
if (number > (uint64_t)int_max) {
1986+
if (number > (uint64_t)INT64_MAX) {
20291987
state->seen_uint = 1;
20301988
}
20311989

0 commit comments

Comments
 (0)