Skip to content

Commit 3a9b8a3

Browse files
committed
Merge branch 'main' into api-filter-select
2 parents 7a69821 + 533821c commit 3a9b8a3

File tree

18 files changed

+254
-182
lines changed

18 files changed

+254
-182
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -717,6 +717,7 @@ Other Deprecations
717717
- Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.unstack` and :meth:`DataFrame.unstack` (:issue:`12189`, :issue:`53868`)
718718
- Deprecated :meth:`Series.filter` and :meth:`DataFrame.filter`, renaming these to ``select`` (:issue:`26642`)
719719
- Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.shift` and :meth:`DataFrame.shift` (:issue:`53802`)
720+
- Deprecated the option ``future.no_silent_downcasting``, as it is no longer used. In a future version, accessing this option will raise (:issue:`59502`)
720721
- Deprecated slicing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` using a ``datetime.date`` object, explicitly cast to :class:`Timestamp` instead (:issue:`35830`)
721722

722723
.. ---------------------------------------------------------------------------
@@ -1013,12 +1014,13 @@ Strings
10131014
^^^^^^^
10141015
- Bug in :meth:`Series.str.zfill` raising ``AttributeError`` for :class:`ArrowDtype` (:issue:`61485`)
10151016
- Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
1017+
- Bug in multiplication with a :class:`StringDtype` incorrectly allowing multiplying by bools; explicitly cast to integers instead (:issue:`62595`)
10161018

10171019
Interval
10181020
^^^^^^^^
10191021
- :meth:`Index.is_monotonic_decreasing`, :meth:`Index.is_monotonic_increasing`, and :meth:`Index.is_unique` could incorrectly be ``False`` for an ``Index`` created from a slice of another ``Index``. (:issue:`57911`)
1022+
- Bug in the :class:`Index`, :class:`Series`, and :class:`DataFrame` constructors where a sequence of :class:`Interval` subclass objects was cast to :class:`Interval` (:issue:`46945`)
10201023
- Bug in :func:`interval_range` where start and end numeric types were always cast to 64 bit (:issue:`57268`)
1021-
-
10221024

10231025
Indexing
10241026
^^^^^^^^

pandas/_libs/lib.pyx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2255,7 +2255,8 @@ cpdef bint is_interval_array(ndarray values):
22552255
for i in range(n):
22562256
val = values[i]
22572257

2258-
if isinstance(val, Interval):
2258+
if type(val) is Interval:
2259+
# GH#46945 catch Interval exactly, excluding subclasses
22592260
if closed is None:
22602261
closed = val.closed
22612262
numeric = (

pandas/_libs/src/parser/tokenizer.c

Lines changed: 80 additions & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,15 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
2323
#include <float.h>
2424
#include <math.h>
2525
#include <stdbool.h>
26+
#include <stdlib.h>
2627

2728
#include "pandas/portable.h"
2829
#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
2930

31+
// Arrow256 allows up to 76 decimal digits.
32+
// We rounded up to the next power of 2.
33+
#define PROCESSED_WORD_CAPACITY 128
34+
3035
void coliter_setup(coliter_t *self, parser_t *parser, int64_t i,
3136
int64_t start) {
3237
// column i, starting at 0
@@ -1834,6 +1839,39 @@ int uint64_conflict(uint_state *self) {
18341839
return self->seen_uint && (self->seen_sint || self->seen_null);
18351840
}
18361841

1842+
/* Copy `str` into `output`, dropping every occurrence of `char_to_remove`.
 *
 * Parameters:
 *   output          destination buffer of PROCESSED_WORD_CAPACITY bytes; on
 *                   success it holds the NUL-terminated filtered string.
 *   str             source bytes; only the first `str_len` bytes are read.
 *   str_len         number of bytes to read from `str`.
 *   char_to_remove  byte that is omitted from the copy.
 *
 * Returns 0 on success, or -1 when the filtered string plus its NUL
 * terminator does not fit in PROCESSED_WORD_CAPACITY bytes. On failure
 * `output` may be partially written and is not NUL-terminated.
 */
static int copy_string_without_char(char output[PROCESSED_WORD_CAPACITY],
                                    const char *str, size_t str_len,
                                    char char_to_remove) {
  const char *left = str;
  const char *end_ptr = str + str_len;
  size_t bytes_written = 0;

  while (left < end_ptr) {
    const size_t remaining_bytes_to_read = (size_t)(end_ptr - left);
    const char *right = memchr(left, char_to_remove, remaining_bytes_to_read);

    if (!right) {
      // No further occurrence: the final chunk runs to the end of the input.
      right = end_ptr;
    }

    const size_t chunk_size = (size_t)(right - left);

    // bytes_written is always < PROCESSED_WORD_CAPACITY here, so the
    // subtraction cannot wrap; `>=` keeps one byte free for the terminator.
    if (chunk_size >= PROCESSED_WORD_CAPACITY - bytes_written) {
      return -1;
    }
    memcpy(&output[bytes_written], left, chunk_size);
    bytes_written += chunk_size;

    if (right == end_ptr) {
      // Stop here rather than computing `end_ptr + 1`, which would be out of
      // bounds when `str` is exactly `str_len` bytes long.
      break;
    }

    // Resume just past the removed character.
    left = right + 1;
  }

  output[bytes_written] = '\0';
  return 0;
}
1874+
18371875
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18381876
int *error, char tsep) {
18391877
const char *p = p_item;
@@ -1843,105 +1881,45 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18431881
}
18441882

18451883
// Handle sign.
1846-
const bool isneg = *p == '-' ? true : false;
1884+
const bool has_sign = *p == '-' || *p == '+';
18471885
// Handle sign.
1848-
if (isneg || (*p == '+')) {
1849-
p++;
1850-
}
1886+
const char *digit_start = has_sign ? p + 1 : p;
18511887

18521888
// Check that there is a first digit.
1853-
if (!isdigit_ascii(*p)) {
1889+
if (!isdigit_ascii(*digit_start)) {
18541890
// Error...
18551891
*error = ERROR_NO_DIGITS;
18561892
return 0;
18571893
}
18581894

1859-
int64_t number = 0;
1860-
if (isneg) {
1861-
// If number is greater than pre_min, at least one more digit
1862-
// can be processed without overflowing.
1863-
int dig_pre_min = -(int_min % 10);
1864-
int64_t pre_min = int_min / 10;
1865-
1866-
// Process the digits.
1867-
char d = *p;
1868-
if (tsep != '\0') {
1869-
while (1) {
1870-
if (d == tsep) {
1871-
d = *++p;
1872-
continue;
1873-
} else if (!isdigit_ascii(d)) {
1874-
break;
1875-
}
1876-
if ((number > pre_min) ||
1877-
((number == pre_min) && (d - '0' <= dig_pre_min))) {
1878-
number = number * 10 - (d - '0');
1879-
d = *++p;
1880-
} else {
1881-
*error = ERROR_OVERFLOW;
1882-
return 0;
1883-
}
1884-
}
1885-
} else {
1886-
while (isdigit_ascii(d)) {
1887-
if ((number > pre_min) ||
1888-
((number == pre_min) && (d - '0' <= dig_pre_min))) {
1889-
number = number * 10 - (d - '0');
1890-
d = *++p;
1891-
} else {
1892-
*error = ERROR_OVERFLOW;
1893-
return 0;
1894-
}
1895-
}
1895+
char buffer[PROCESSED_WORD_CAPACITY];
1896+
const size_t str_len = strlen(p);
1897+
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
1898+
const int status = copy_string_without_char(buffer, p, str_len, tsep);
1899+
if (status != 0) {
1900+
// Word is too big, probably will cause an overflow
1901+
*error = ERROR_OVERFLOW;
1902+
return 0;
18961903
}
1897-
} else {
1898-
// If number is less than pre_max, at least one more digit
1899-
// can be processed without overflowing.
1900-
int64_t pre_max = int_max / 10;
1901-
int dig_pre_max = int_max % 10;
1902-
1903-
// Process the digits.
1904-
char d = *p;
1905-
if (tsep != '\0') {
1906-
while (1) {
1907-
if (d == tsep) {
1908-
d = *++p;
1909-
continue;
1910-
} else if (!isdigit_ascii(d)) {
1911-
break;
1912-
}
1913-
if ((number < pre_max) ||
1914-
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1915-
number = number * 10 + (d - '0');
1916-
d = *++p;
1904+
p = buffer;
1905+
}
19171906

1918-
} else {
1919-
*error = ERROR_OVERFLOW;
1920-
return 0;
1921-
}
1922-
}
1923-
} else {
1924-
while (isdigit_ascii(d)) {
1925-
if ((number < pre_max) ||
1926-
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1927-
number = number * 10 + (d - '0');
1928-
d = *++p;
1907+
char *endptr;
1908+
int64_t number = strtoll(p, &endptr, 10);
19291909

1930-
} else {
1931-
*error = ERROR_OVERFLOW;
1932-
return 0;
1933-
}
1934-
}
1935-
}
1910+
if (errno == ERANGE || number > int_max || number < int_min) {
1911+
*error = ERROR_OVERFLOW;
1912+
errno = 0;
1913+
return 0;
19361914
}
19371915

19381916
// Skip trailing spaces.
1939-
while (isspace_ascii(*p)) {
1940-
++p;
1917+
while (isspace_ascii(*endptr)) {
1918+
++endptr;
19411919
}
19421920

19431921
// Did we use up all the characters?
1944-
if (*p) {
1922+
if (*endptr) {
19451923
*error = ERROR_INVALID_CHARS;
19461924
return 0;
19471925
}
@@ -1974,53 +1952,34 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
19741952
return 0;
19751953
}
19761954

1977-
// If number is less than pre_max, at least one more digit
1978-
// can be processed without overflowing.
1979-
//
1980-
// Process the digits.
1981-
uint64_t number = 0;
1982-
const uint64_t pre_max = uint_max / 10;
1983-
const uint64_t dig_pre_max = uint_max % 10;
1984-
char d = *p;
1985-
if (tsep != '\0') {
1986-
while (1) {
1987-
if (d == tsep) {
1988-
d = *++p;
1989-
continue;
1990-
} else if (!isdigit_ascii(d)) {
1991-
break;
1992-
}
1993-
if ((number < pre_max) ||
1994-
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
1995-
number = number * 10 + (d - '0');
1996-
d = *++p;
1997-
1998-
} else {
1999-
*error = ERROR_OVERFLOW;
2000-
return 0;
2001-
}
1955+
char buffer[PROCESSED_WORD_CAPACITY];
1956+
const size_t str_len = strlen(p);
1957+
if (tsep != '\0' && memchr(p, tsep, str_len) != NULL) {
1958+
const int status = copy_string_without_char(buffer, p, str_len, tsep);
1959+
if (status != 0) {
1960+
// Word is too big, probably will cause an overflow
1961+
*error = ERROR_OVERFLOW;
1962+
return 0;
20021963
}
2003-
} else {
2004-
while (isdigit_ascii(d)) {
2005-
if ((number < pre_max) ||
2006-
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
2007-
number = number * 10 + (d - '0');
2008-
d = *++p;
1964+
p = buffer;
1965+
}
20091966

2010-
} else {
2011-
*error = ERROR_OVERFLOW;
2012-
return 0;
2013-
}
2014-
}
1967+
char *endptr;
1968+
uint64_t number = strtoull(p, &endptr, 10);
1969+
1970+
if (errno == ERANGE || number > uint_max) {
1971+
*error = ERROR_OVERFLOW;
1972+
errno = 0;
1973+
return 0;
20151974
}
20161975

20171976
// Skip trailing spaces.
2018-
while (isspace_ascii(*p)) {
2019-
++p;
1977+
while (isspace_ascii(*endptr)) {
1978+
++endptr;
20201979
}
20211980

20221981
// Did we use up all the characters?
2023-
if (*p) {
1982+
if (*endptr) {
20241983
*error = ERROR_INVALID_CHARS;
20251984
return 0;
20261985
}

pandas/conftest.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -689,7 +689,9 @@ def _create_mi_with_dt64tz_level():
689689
"categorical": CategoricalIndex(list("abcd") * 2),
690690
"interval": IntervalIndex.from_breaks(np.linspace(0, 100, num=11)),
691691
"empty": Index([]),
692-
"tuples": MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])),
692+
"tuples": MultiIndex.from_tuples(
693+
zip(["foo", "bar", "baz"], [1, 2, 3], strict=True)
694+
),
693695
"mi-with-dt64tz-level": _create_mi_with_dt64tz_level(),
694696
"multi": _create_multiindex(),
695697
"repeats": Index([0, 0, 1, 1, 2, 2]),
@@ -1874,7 +1876,9 @@ def any_numeric_dtype(request):
18741876
("period", [Period(2013), pd.NaT, Period(2018)]),
18751877
("interval", [Interval(0, 1), np.nan, Interval(0, 2)]),
18761878
]
1877-
ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id
1879+
ids = [
1880+
pair[0] for pair in _any_skipna_inferred_dtype
1881+
] # use inferred type as fixture-id
18781882

18791883

18801884
@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids)

pandas/core/arrays/string_.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,6 +1113,16 @@ def _cmp_method(self, other, op):
11131113
other = np.asarray(other)
11141114
other = other[valid]
11151115

1116+
other_dtype = getattr(other, "dtype", None)
1117+
if op.__name__.strip("_") in ["mul", "rmul"] and (
1118+
lib.is_bool(other) or lib.is_np_dtype(other_dtype, "b")
1119+
):
1120+
# GH#62595
1121+
raise TypeError(
1122+
"Cannot multiply StringArray by bools. "
1123+
"Explicitly cast to integers instead."
1124+
)
1125+
11161126
if op.__name__ in ops.ARITHMETIC_BINOPS:
11171127
result = np.empty_like(self._ndarray, dtype="object")
11181128
result[mask] = self.dtype.na_value

pandas/core/config_init.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
is_text,
2929
)
3030

31+
from pandas.errors import Pandas4Warning
32+
3133
# compute
3234

3335
use_bottleneck_doc = """
@@ -899,10 +901,10 @@ def register_converter_cb(key: str) -> None:
899901
cf.register_option(
900902
"no_silent_downcasting",
901903
False,
902-
"Whether to opt-in to the future behavior which will *not* silently "
903-
"downcast results from Series and DataFrame `where`, `mask`, and `clip` "
904-
"methods. "
905-
"Silent downcasting will be removed in pandas 3.0 "
906-
"(at which point this option will be deprecated).",
904+
"This option is deprecated and will be removed in a future version. "
905+
"It has no effect.",
907906
validator=is_one_of_factory([True, False]),
908907
)
908+
909+
# GH#59502
910+
cf.deprecate_option("future.no_silent_downcasting", Pandas4Warning)

pandas/io/formats/info.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1096,7 +1096,7 @@ def headers(self) -> Sequence[str]:
10961096

10971097
def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]:
10981098
"""Iterator with string representation of body data without counts."""
1099-
yield from self._gen_dtypes()
1099+
yield from ([dtype] for dtype in self._gen_dtypes())
11001100

11011101
def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]:
11021102
"""Iterator with string representation of body data with counts."""

pandas/tests/arithmetic/test_string.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,3 +112,25 @@ def test_pyarrow_numpy_string_invalid():
112112

113113
with pytest.raises(TypeError, match="Invalid comparison"):
114114
ser > ser4
115+
116+
117+
def test_mul_bool_invalid(any_string_dtype):
    """Check that multiplying a string Series by bools raises ``TypeError``.

    Covers scalar and NumPy boolean-array operands on both sides of ``*``.
    The object-dtype case is skipped; the expected message depends on the
    string storage backend.
    """
    # GH#62595
    dtype = any_string_dtype
    ser = Series(["a", "b", "c"], dtype=dtype)

    if dtype == object:
        pytest.skip("This is not expected to raise")
    elif dtype.storage == "python":
        msg = "Cannot multiply StringArray by bools. Explicitly cast to integers"
    else:
        msg = "Can only string multiply by an integer"

    # Scalar bool on the left and on the right.
    with pytest.raises(TypeError, match=msg):
        False * ser
    with pytest.raises(TypeError, match=msg):
        ser * True
    # Boolean ndarray on the right and on the left.
    with pytest.raises(TypeError, match=msg):
        ser * np.array([True, False, True], dtype=bool)
    with pytest.raises(TypeError, match=msg):
        np.array([True, False, True], dtype=bool) * ser

0 commit comments

Comments
 (0)