Skip to content

Commit d5d4db4

Browse files
Merge remote-tracking branch 'upstream/main' into cow-ea-readonly
2 parents a4accf8 + 08d21d7 commit d5d4db4

File tree

7 files changed

+79
-20
lines changed

7 files changed

+79
-20
lines changed

doc/source/whatsnew/v2.3.3.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,23 @@ become the default string dtype in pandas 3.0. See
2222

2323
Improvements
2424
^^^^^^^^^^^^
25+
- Update :meth:`DataFrame.select_dtypes` to keep selecting ``str`` columns when
26+
specifying ``include=["object"]`` for backwards compatibility. In a future
27+
release, this will be deprecated and code for pandas 3+ should be updated to
28+
do ``include=["str"]`` (:issue:`61916`)
2529
- Accessing the underlying NumPy array of a DataFrame or Series will return a read-only
2630
array if the array shares data with the original DataFrame or Series (:ref:`copy_on_write_read_only_na`).
2731
This logic is expanded to accessing the underlying pandas ExtensionArray
2832
through ``.array`` (or ``.values`` depending on the dtype) as well (:issue:`61925`).
2933

30-
3134
.. _whatsnew_233.string_fixes.bugs:
3235

3336
Bug fixes
3437
^^^^^^^^^
3538
- Fix bug in :meth:`Series.str.replace` where using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
3639
- Fix regression in :meth:`~Series.str.contains`, :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch`
3740
with a compiled regex and custom flags (:issue:`62240`)
41+
- Fix :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`)
3842

3943
.. ---------------------------------------------------------------------------
4044
.. _whatsnew_233.contributors:

pandas/core/arrays/_arrow_string_mixins.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -326,8 +326,12 @@ def _str_fullmatch(
326326
flags: int = 0,
327327
na: Scalar | lib.NoDefault = lib.no_default,
328328
):
329-
if not pat.endswith("$") or pat.endswith("\\$"):
330-
pat = f"{pat}$"
329+
if (not pat.endswith("$") or pat.endswith("\\$")) and not pat.startswith("^"):
330+
pat = f"^({pat})$"
331+
elif not pat.endswith("$") or pat.endswith("\\$"):
332+
pat = f"^({pat[1:]})$"
333+
elif not pat.startswith("^"):
334+
pat = f"^({pat[0:-1]})$"
331335
return self._str_match(pat, case, flags, na)
332336

333337
def _str_find(self, sub: str, start: int = 0, end: int | None = None):

pandas/core/dtypes/cast.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,9 @@ def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
865865
np.dtype("<U").type, # type: ignore[arg-type]
866866
}
867867
if non_string_dtypes != dtype_set:
868-
raise TypeError("string dtypes are not allowed, use 'object' instead")
868+
raise TypeError(
869+
"numpy string dtypes are not allowed, use 'str' or 'object' instead"
870+
)
869871

870872

871873
def coerce_indexer_dtype(indexer, categories) -> np.ndarray:

pandas/core/frame.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5150,10 +5150,14 @@ def check_int_infer_dtype(dtypes):
51505150
def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
51515151
# GH 46870: BooleanDtype._is_numeric == True but should be excluded
51525152
dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
5153-
return issubclass(dtype.type, tuple(dtypes_set)) or (
5154-
np.number in dtypes_set
5155-
and getattr(dtype, "_is_numeric", False)
5156-
and not is_bool_dtype(dtype)
5153+
return (
5154+
issubclass(dtype.type, tuple(dtypes_set))
5155+
or (
5156+
np.number in dtypes_set
5157+
and getattr(dtype, "_is_numeric", False)
5158+
and not is_bool_dtype(dtype)
5159+
)
5160+
or (dtype.type is str and np.object_ in dtypes_set)
51575161
)
51585162

51595163
def predicate(arr: ArrayLike) -> bool:

pandas/tests/extension/test_arrow.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1870,23 +1870,28 @@ def test_str_match(pat, case, na, exp):
18701870

18711871
@pytest.mark.parametrize(
18721872
"pat, case, na, exp",
1873+
# Note: keep cases in sync with
1874+
# pandas/tests/strings/test_find_replace.py::test_str_fullmatch_extra_cases
18731875
[
1874-
["abc", False, None, [True, True, False, None]],
1876+
["abc", False, None, [True, False, False, None]],
18751877
["Abc", True, None, [False, False, False, None]],
18761878
["bc", True, None, [False, False, False, None]],
1877-
["ab", False, None, [True, True, False, None]],
1878-
["a[a-z]{2}", False, None, [True, True, False, None]],
1879+
["ab", False, None, [False, False, False, None]],
1880+
["a[a-z]{2}", False, None, [True, False, False, None]],
18791881
["A[a-z]{1}", True, None, [False, False, False, None]],
18801882
# GH Issue: #56652
18811883
["abc$", False, None, [True, False, False, None]],
18821884
["abc\\$", False, None, [False, True, False, None]],
18831885
["Abc$", True, None, [False, False, False, None]],
18841886
["Abc\\$", True, None, [False, False, False, None]],
1887+
# https://github.com/pandas-dev/pandas/issues/61072
1888+
["(abc)|(abx)", True, None, [True, False, False, None]],
1889+
["((abc)|(abx))", True, None, [True, False, False, None]],
18851890
],
18861891
)
18871892
def test_str_fullmatch(pat, case, na, exp):
18881893
ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string()))
1889-
result = ser.str.match(pat, case=case, na=na)
1894+
result = ser.str.fullmatch(pat, case=case, na=na)
18901895
expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
18911896
tm.assert_series_equal(result, expected)
18921897

pandas/tests/frame/methods/test_select_dtypes.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,10 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string):
102102
ri = df.select_dtypes(include=[str])
103103
tm.assert_frame_equal(ri, ei)
104104

105+
ri = df.select_dtypes(include=["object"])
106+
ei = df[["a"]]
107+
tm.assert_frame_equal(ri, ei)
108+
105109
def test_select_dtypes_exclude_using_list_like(self):
106110
df = DataFrame(
107111
{
@@ -309,17 +313,15 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_strin
309313
df["g"] = df.f.diff()
310314
assert not hasattr(np, "u8")
311315
r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"])
312-
if using_infer_string:
313-
e = df[["b"]]
314-
else:
315-
e = df[["a", "b"]]
316+
# if using_infer_string:
317+
# TODO warn
318+
e = df[["a", "b"]]
316319
tm.assert_frame_equal(r, e)
317320

318321
r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
319-
if using_infer_string:
320-
e = df[["b", "g"]]
321-
else:
322-
e = df[["a", "b", "g"]]
322+
# if using_infer_string:
323+
# TODO warn
324+
e = df[["a", "b", "g"]]
323325
tm.assert_frame_equal(r, e)
324326

325327
def test_select_dtypes_empty(self):

pandas/tests/strings/test_find_replace.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,6 +1075,44 @@ def test_fullmatch_compiled_regex(any_string_dtype):
10751075
values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE)
10761076

10771077

1078+
@pytest.mark.parametrize(
    "pat, case, na, exp",
    # Note: keep cases in sync with
    # pandas/tests/extension/test_arrow.py::test_str_fullmatch
    [
        ["abc", False, None, [True, False, False, None]],
        ["Abc", True, None, [False, False, False, None]],
        ["bc", True, None, [False, False, False, None]],
        ["ab", False, None, [False, False, False, None]],
        ["a[a-z]{2}", False, None, [True, False, False, None]],
        ["A[a-z]{1}", True, None, [False, False, False, None]],
        # GH Issue: #56652
        ["abc$", False, None, [True, False, False, None]],
        ["abc\\$", False, None, [False, True, False, None]],
        ["Abc$", True, None, [False, False, False, None]],
        ["Abc\\$", True, None, [False, False, False, None]],
        # https://github.com/pandas-dev/pandas/issues/61072
        ["(abc)|(abx)", True, None, [True, False, False, None]],
        ["((abc)|(abx))", True, None, [True, False, False, None]],
    ],
)
def test_str_fullmatch_extra_cases(any_string_dtype, pat, case, na, exp):
    """
    Check ``Series.str.fullmatch`` against the parametrized expectations.

    The input is always ``["abc", "abc$", "$abc", None]`` built with
    ``any_string_dtype``; ``exp`` lists the expected result per element.
    The expected result dtype is ``bool`` for the ``"str"`` dtype, and
    otherwise ``"object"`` or ``"boolean"`` depending on
    ``is_object_or_nan_string_dtype``.
    """
    ser = Series(["abc", "abc$", "$abc", None], dtype=any_string_dtype)
    result = ser.str.fullmatch(pat, case=case, na=na)

    if any_string_dtype == "str":
        # NaN propagates as False
        # Build a new list instead of assigning to ``exp[-1]``: ``exp`` comes
        # straight from the parametrize table above, so mutating it in place
        # could leak into any other invocation that receives the same object.
        exp = [*exp[:-1], False]
        expected_dtype = bool
    else:
        expected_dtype = (
            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
        )
    expected = Series(exp, dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
1114+
1115+
10781116
# --------------------------------------------------------------------------------------
10791117
# str.findall
10801118
# --------------------------------------------------------------------------------------

0 commit comments

Comments
 (0)