diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index d37eebef5c0c0..f02745abc52ce 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -637,6 +637,8 @@ with a bool :class:`numpy.ndarray`. DatetimeTZDtype.tz PeriodDtype.freq IntervalDtype.subtype + StringDtype.storage + StringDtype.na_value ********* Utilities diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 802c0d02fc21f..8f404dbf461c8 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -753,7 +753,10 @@ Differences in behavior will be primarily due to the kind of NA value. The four :class:`StringDtype` variants ====================================== -There are four :class:`StringDtype` variants that are available to users. +There are four :class:`StringDtype` variants that are available to users, +controlled by the ``storage`` and ``na_value`` parameters of :class:`StringDtype`. +At runtime, these can be checked via the :attr:`StringDtype.storage` +and :attr:`StringDtype.na_value` attributes. Python storage with ``np.nan`` values ------------------------------------- diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f36b22b10aa34..d9c5b48bff759 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -119,7 +119,8 @@ class StringDtype(StorageExtensionDtype): Attributes ---------- - None + storage + na_value Methods ------- @@ -149,8 +150,24 @@ def name(self) -> str: # type: ignore[override] # follows NumPy semantics, which uses nan. @property def na_value(self) -> libmissing.NAType | float: # type: ignore[override] + """ + The missing value representation for this dtype. + + This value indicates which missing value semantics are used by this dtype. + Returns ``np.nan`` for the default string dtype with NumPy semantics, + and ``pd.NA`` for the opt-in string dtype with pandas NA semantics. + """ return self._na_value + @property + def storage(self) -> str: + """ + The storage backend for this dtype. + + Can be either "pyarrow" or "python". + """ + return self._storage + _metadata = ("storage", "_na_value") # type: ignore[assignment] def __init__( @@ -185,7 +202,7 @@ def __init__( elif na_value is not libmissing.NA: raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") - self.storage = cast(str, storage) + self._storage = cast(str, storage) self._na_value = na_value def __repr__(self) -> str: @@ -211,7 +228,7 @@ def __eq__(self, other: object) -> bool: def __setstate__(self, state: MutableMapping[str, Any]) -> None: # back-compat for pandas < 2.3, where na_value did not yet exist - self.storage = state.pop("storage", "python") + self._storage = state.pop("storage", "python") self._na_value = state.pop("_na_value", libmissing.NA) def __hash__(self) -> int: @@ -306,7 +323,7 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: # if both python and pyarrow storage -> priority to pyarrow storage = "pyarrow" else: - storage = next(iter(storages)) # type: ignore[assignment] + storage = next(iter(storages)) na_value: libmissing.NAType | float if len(na_values) == 2: diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 59f360650ff8c..93915a4d1673e 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -458,8 +458,8 @@ class StorageExtensionDtype(ExtensionDtype): name: str _metadata = ("storage",) - def __init__(self, storage: str | None = None) -> None: - self.storage = storage + def __init__(self, storage: str) -> None: + self._storage = storage def __repr__(self) -> str: return f"{self.name}[{self.storage}]" @@ -480,6 +480,10 @@ def __hash__(self) -> int: def na_value(self) -> libmissing.NAType: return libmissing.NA + @property + def storage(self) -> str: + return self._storage + @set_module("pandas.api.extensions") def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]: