Skip to content

Commit 583a905

Browse files
committed
BUG: Categorical(Series[object]) not preserving categories.dtype as object
1 parent a329dc3 commit 583a905

File tree

5 files changed

+38
-3
lines changed

5 files changed

+38
-3
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -936,6 +936,7 @@ Bug fixes
936936

937937
Categorical
938938
^^^^^^^^^^^
939+
- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``; now the ``categories.dtype`` is preserved as ``object`` for these cases, while numpy arrays and Python sequences with ``dtype='object'`` continue to infer the most specific dtype (for example, ``str`` if all elements are strings) (:issue:`61778`)
939940
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
940941
- Bug in :func:`testing.assert_index_equal` raising ``TypeError`` instead of ``AssertionError`` for incomparable ``CategoricalIndex`` when ``check_categorical=True`` and ``exact=False`` (:issue:`61935`)
941942
- Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`)

pandas/core/arrays/categorical.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,10 @@ def __init__(
458458
codes = arr.indices.to_numpy()
459459
dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
460460
else:
461+
preserve_object = False
462+
if isinstance(values, (ABCIndex, ABCSeries)) and values.dtype == object:
463+
# GH#61778
464+
preserve_object = True
461465
if not isinstance(values, ABCIndex):
462466
# in particular RangeIndex xref test_index_equal_range_categories
463467
values = sanitize_array(values, None)
@@ -474,7 +478,14 @@ def __init__(
474478
"by passing in a categories argument."
475479
) from err
476480

477-
# we're inferring from values
481+
if preserve_object:
482+
# GH#61778 wrap categories in an Index to prevent dtype
483+
# inference in the CategoricalDtype constructor
484+
from pandas import Index
485+
486+
categories = Index(categories, dtype=object, copy=False)
487+
488+
# if not preserve_object, we're inferring from values
478489
dtype = CategoricalDtype(categories, dtype.ordered)
479490

480491
elif isinstance(values.dtype, CategoricalDtype):

pandas/tests/arrays/categorical/test_constructors.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -809,3 +809,24 @@ def test_range_values_preserves_rangeindex_categories(self, values, categories):
809809
result = Categorical(values=values, categories=categories).categories
810810
expected = RangeIndex(range(5))
811811
tm.assert_index_equal(result, expected, exact=True)
812+
813+
def test_categorical_preserve_object_dtype_from_pandas(self, using_infer_string):
814+
# GH#61778
815+
pylist = ["foo", "bar", "baz"]
816+
ser = Series(pylist, dtype="object")
817+
idx = Index(pylist, dtype="object")
818+
arr = np.array(pylist, dtype="object")
819+
820+
cat_from_ser = Categorical(ser)
821+
cat_from_idx = Categorical(idx)
822+
cat_from_arr = Categorical(arr)
823+
cat_from_list = Categorical(pylist)
824+
825+
# Series/Index with object dtype: the object dtype is
826+
# preserved for the categories (no string inference)
827+
assert cat_from_ser.categories.dtype == object
828+
assert cat_from_idx.categories.dtype == object
829+
830+
# Numpy array or list: infer string dtype
831+
assert cat_from_arr.categories.dtype == "str"
832+
assert cat_from_list.categories.dtype == "str"

pandas/tests/groupby/methods/test_value_counts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,10 @@ def test_against_frame_and_seriesgroupby(
347347
index_frame = expected.index.to_frame(index=False)
348348
index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
349349
index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
350+
both_dtype = index_frame["both"].dtype
351+
index_frame = index_frame.astype(
352+
{"gender": both_dtype, "education": both_dtype}
353+
)
350354
del index_frame["both"]
351355
index_frame2 = index_frame.rename({0: None}, axis=1)
352356
expected.index = MultiIndex.from_frame(index_frame2)

pandas/tests/reshape/test_pivot.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2673,8 +2673,6 @@ def test_pivot_integer_bug(self, any_string_dtype):
26732673

26742674
result = df.pivot(index=1, columns=0, values=2)
26752675
expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype)
2676-
if any_string_dtype == "object":
2677-
expected_columns = expected_columns.astype("str")
26782676
tm.assert_index_equal(result.columns, expected_columns)
26792677

26802678
def test_pivot_index_none(self):

0 commit comments

Comments
 (0)