Merge remote-tracking branch 'upstream/main' into cow-ea-readonly

jorisvandenbossche · jorisvandenbossche · commit d5d4db46b630 · 2025-09-21T17:14:00.000+02:00
diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst
@@ -22,19 +22,23 @@ become the default string dtype in pandas 3.0. See
 
 Improvements
 ^^^^^^^^^^^^
+- Update :meth:`DataFrame.select_dtypes` to keep selecting ``str`` columns when
+  specifying ``include=["object"]`` for backwards compatibility. In a future
+  release, this will be deprecated and code for pandas 3+ should be updated to
+  do ``include=["str"]`` (:issue:`61916`)
 - Accessing the underlying NumPy array of a DataFrame or Series will return a read-only
   array if the array shares data with the original DataFrame or Series (:ref:`copy_on_write_read_only_na`).
   This logic is expanded to accessing the underlying pandas ExtensionArray
   through ``.array`` (or ``.values`` depending on the dtype) as well (:issue:`61925`).
 
-
 .. _whatsnew_233.string_fixes.bugs:
 
 Bug fixes
 ^^^^^^^^^
 - Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
 - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch``
   with a compiled regex and custom flags (:issue:`62240`)
+- Fix :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_233.contributors:
diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py
@@ -326,8 +326,12 @@ def _str_fullmatch(
         flags: int = 0,
         na: Scalar | lib.NoDefault = lib.no_default,
     ):
-        if not pat.endswith("$") or pat.endswith("\\$"):
-            pat = f"{pat}$"
+        if (not pat.endswith("$") or pat.endswith("\\$")) and not pat.startswith("^"):
+            pat = f"^({pat})$"
+        elif not pat.endswith("$") or pat.endswith("\\$"):
+            pat = f"^({pat[1:]})$"
+        elif not pat.startswith("^"):
+            pat = f"^({pat[0:-1]})$"
         return self._str_match(pat, case, flags, na)
 
     def _str_find(self, sub: str, start: int = 0, end: int | None = None):
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -865,7 +865,9 @@ def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
         np.dtype("<U").type,  # type: ignore[arg-type]
     }
     if non_string_dtypes != dtype_set:
-        raise TypeError("string dtypes are not allowed, use 'object' instead")
+        raise TypeError(
+            "numpy string dtypes are not allowed, use 'str' or 'object' instead"
+        )
 
 
 def coerce_indexer_dtype(indexer, categories) -> np.ndarray:
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5150,10 +5150,14 @@ def check_int_infer_dtype(dtypes):
         def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
             # GH 46870: BooleanDtype._is_numeric == True but should be excluded
             dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
-            return issubclass(dtype.type, tuple(dtypes_set)) or (
-                np.number in dtypes_set
-                and getattr(dtype, "_is_numeric", False)
-                and not is_bool_dtype(dtype)
+            return (
+                issubclass(dtype.type, tuple(dtypes_set))
+                or (
+                    np.number in dtypes_set
+                    and getattr(dtype, "_is_numeric", False)
+                    and not is_bool_dtype(dtype)
+                )
+                or (dtype.type is str and np.object_ in dtypes_set)
             )
 
         def predicate(arr: ArrayLike) -> bool:
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -1870,23 +1870,28 @@ def test_str_match(pat, case, na, exp):
 
 @pytest.mark.parametrize(
     "pat, case, na, exp",
+    # Note: keep cases in sync with
+    # pandas/tests/strings/test_find_replace.py::test_str_fullmatch_extra_cases
     [
-        ["abc", False, None, [True, True, False, None]],
+        ["abc", False, None, [True, False, False, None]],
         ["Abc", True, None, [False, False, False, None]],
         ["bc", True, None, [False, False, False, None]],
-        ["ab", False, None, [True, True, False, None]],
-        ["a[a-z]{2}", False, None, [True, True, False, None]],
+        ["ab", False, None, [False, False, False, None]],
+        ["a[a-z]{2}", False, None, [True, False, False, None]],
         ["A[a-z]{1}", True, None, [False, False, False, None]],
         # GH Issue: #56652
         ["abc$", False, None, [True, False, False, None]],
         ["abc\\$", False, None, [False, True, False, None]],
         ["Abc$", True, None, [False, False, False, None]],
         ["Abc\\$", True, None, [False, False, False, None]],
+        # https://github.com/pandas-dev/pandas/issues/61072
+        ["(abc)|(abx)", True, None, [True, False, False, None]],
+        ["((abc)|(abx))", True, None, [True, False, False, None]],
     ],
 )
 def test_str_fullmatch(pat, case, na, exp):
     ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string()))
-    result = ser.str.match(pat, case=case, na=na)
+    result = ser.str.fullmatch(pat, case=case, na=na)
     expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
     tm.assert_series_equal(result, expected)
 
diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py
@@ -102,6 +102,10 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string):
             ri = df.select_dtypes(include=[str])
             tm.assert_frame_equal(ri, ei)
 
+        ri = df.select_dtypes(include=["object"])
+        ei = df[["a"]]
+        tm.assert_frame_equal(ri, ei)
+
     def test_select_dtypes_exclude_using_list_like(self):
         df = DataFrame(
             {
@@ -309,17 +313,15 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_strin
         df["g"] = df.f.diff()
         assert not hasattr(np, "u8")
         r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"])
-        if using_infer_string:
-            e = df[["b"]]
-        else:
-            e = df[["a", "b"]]
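+        # TODO: when using_infer_string is set, assert the planned deprecation
+        #     warning for selecting str columns via "O" once it is implemented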
+        e = df[["a", "b"]]
         tm.assert_frame_equal(r, e)
 
         r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"])
-        if using_infer_string:
-            e = df[["b", "g"]]
-        else:
-            e = df[["a", "b", "g"]]
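+        # TODO: when using_infer_string is set, assert the planned deprecation
+        #     warning for selecting str columns via "O" once it is implemented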
+        e = df[["a", "b", "g"]]
         tm.assert_frame_equal(r, e)
 
     def test_select_dtypes_empty(self):
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
@@ -1075,6 +1075,44 @@ def test_fullmatch_compiled_regex(any_string_dtype):
         values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE)
 
 
+@pytest.mark.parametrize(
+    "pat, case, na, exp",
+    # Note: keep cases in sync with
+    # pandas/tests/extension/test_arrow.py::test_str_fullmatch
+    [
+        ["abc", False, None, [True, False, False, None]],
+        ["Abc", True, None, [False, False, False, None]],
+        ["bc", True, None, [False, False, False, None]],
+        ["ab", False, None, [False, False, False, None]],
+        ["a[a-z]{2}", False, None, [True, False, False, None]],
+        ["A[a-z]{1}", True, None, [False, False, False, None]],
+        # GH Issue: #56652
+        ["abc$", False, None, [True, False, False, None]],
+        ["abc\\$", False, None, [False, True, False, None]],
+        ["Abc$", True, None, [False, False, False, None]],
+        ["Abc\\$", True, None, [False, False, False, None]],
+        # https://github.com/pandas-dev/pandas/issues/61072
+        ["(abc)|(abx)", True, None, [True, False, False, None]],
+        ["((abc)|(abx))", True, None, [True, False, False, None]],
+    ],
+)
+def test_str_fullmatch_extra_cases(any_string_dtype, pat, case, na, exp):
+    """Check ``str.fullmatch`` across string dtypes, including grouped patterns."""
+    ser = Series(["abc", "abc$", "$abc", None], dtype=any_string_dtype)
+    result = ser.str.fullmatch(pat, case=case, na=na)
+
+    if any_string_dtype == "str":
+        # NaN propagates as False; copy so the shared parametrize list is not mutated
+        exp = [*exp[:-1], False]
+        expected_dtype = bool
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+    expected = Series(exp, dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
 # --------------------------------------------------------------------------------------
 # str.findall
 # --------------------------------------------------------------------------------------

Original file line number	Diff line number	Diff line change
`@@ -865,7 +865,9 @@ def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:`
`865`	`865`	`np.dtype("<U").type, # type: ignore[arg-type]`
`866`	`866`	`}`
`867`	`867`	`if non_string_dtypes != dtype_set:`
`868`		`- raise TypeError("string dtypes are not allowed, use 'object' instead")`
	`868`	`+ raise TypeError(`
	`869`	`+ "numpy string dtypes are not allowed, use 'str' or 'object' instead"`
	`870`	`+ )`
`869`	`871`
`870`	`872`
`871`	`873`	`def coerce_indexer_dtype(indexer, categories) -> np.ndarray:`