BUG: Categorical(Series[object]) not preserving categories.dtype as object

jbrockmendel · jbrockmendel · commit 583a905222e1 · 2025-10-19T12:22:35.000-07:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -936,6 +936,7 @@ Bug fixes
 
 Categorical
 ^^^^^^^^^^^
+- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the ``categories.dtype`` is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings) (:issue:`61778`)
 - Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
 - Bug in :func:`testing.assert_index_equal` raising ``TypeError`` instead of ``AssertionError`` for incomparable ``CategoricalIndex`` when ``check_categorical=True`` and ``exact=False`` (:issue:`61935`)
 - Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -458,6 +458,10 @@ def __init__(
                 codes = arr.indices.to_numpy()
                 dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
             else:
+                preserve_object = False
+                if isinstance(values, (ABCIndex, ABCSeries)) and values.dtype == object:
+                    # GH#61778
+                    preserve_object = True
                 if not isinstance(values, ABCIndex):
                     # in particular RangeIndex xref test_index_equal_range_categories
                     values = sanitize_array(values, None)
@@ -474,7 +478,14 @@ def __init__(
                             "by passing in a categories argument."
                         ) from err
 
-                # we're inferring from values
+                if preserve_object:
+                    # GH#61778 wrap categories in an Index to prevent dtype
+                    #  inference in the CategoricalDtype constructor
+                    from pandas import Index
+
+                    categories = Index(categories, dtype=object, copy=False)
+
+                # if not preserve_object, we're inferring from values
                 dtype = CategoricalDtype(categories, dtype.ordered)
 
         elif isinstance(values.dtype, CategoricalDtype):
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -809,3 +809,24 @@ def test_range_values_preserves_rangeindex_categories(self, values, categories):
         result = Categorical(values=values, categories=categories).categories
         expected = RangeIndex(range(5))
         tm.assert_index_equal(result, expected, exact=True)
+
+    def test_categorical_preserve_object_dtype_from_pandas(self, using_infer_string):
+        # GH#61778
+        pylist = ["foo", "bar", "baz"]
+        ser = Series(pylist, dtype="object")
+        idx = Index(pylist, dtype="object")
+        arr = np.array(pylist, dtype="object")
+
+        cat_from_ser = Categorical(ser)
+        cat_from_idx = Categorical(idx)
+        cat_from_arr = Categorical(arr)
+        cat_from_list = Categorical(pylist)
+
+        # Series/Index with object dtype: categories keep
+        # object dtype even if all elements are strings
+        assert cat_from_ser.categories.dtype == object
+        assert cat_from_idx.categories.dtype == object
+
+        # Numpy array or list: infer string dtype
+        assert cat_from_arr.categories.dtype == "str"
+        assert cat_from_list.categories.dtype == "str"
diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py
@@ -347,6 +347,10 @@ def test_against_frame_and_seriesgroupby(
             index_frame = expected.index.to_frame(index=False)
             index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
             index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
+            both_dtype = index_frame["both"].dtype
+            index_frame = index_frame.astype(
+                {"gender": both_dtype, "education": both_dtype}
+            )
             del index_frame["both"]
             index_frame2 = index_frame.rename({0: None}, axis=1)
             expected.index = MultiIndex.from_frame(index_frame2)
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -2673,8 +2673,6 @@ def test_pivot_integer_bug(self, any_string_dtype):
 
         result = df.pivot(index=1, columns=0, values=2)
         expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype)
-        if any_string_dtype == "object":
-            expected_columns = expected_columns.astype("str")
         tm.assert_index_equal(result.columns, expected_columns)
 
     def test_pivot_index_none(self):