Merge branch 'main' into doc/clarify-parentheses-vs-brackets-62314

AnandMukherjee2004 · web-flow · commit 0a20c3b4ee4e · 2025-09-16T01:44:33.000+05:30
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1083,6 +1083,7 @@ Reshaping
 - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
 - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
+- Bug in :meth:`DataFrame.unstack` raising an error when the index contains ``NaN`` and ``sort=False`` (:issue:`61221`)
 - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`)
 - Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
 - Bug in :meth:`DataFrame.pivot_table` incorrectly ignoring the ``values`` argument when also supplied to the ``index`` or ``columns`` parameters (:issue:`57876`, :issue:`61292`)
diff --git a/environment.yml b/environment.yml
@@ -91,6 +91,9 @@ dependencies:
   - sphinx
   - sphinx-design
   - sphinx-copybutton
+
+  # static typing
+  - scipy-stubs
   - types-python-dateutil
   - types-PyMySQL
   - types-pytz
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -584,19 +584,13 @@ def raise_assert_detail(
 
     if isinstance(left, np.ndarray):
         left = pprint_thing(left)
-    elif isinstance(left, (CategoricalDtype, NumpyEADtype)):
+    elif isinstance(left, (CategoricalDtype, StringDtype, NumpyEADtype)):
         left = repr(left)
-    elif isinstance(left, StringDtype):
-        # TODO(infer_string) this special case could be avoided if we have
-        # a more informative repr https://github.com/pandas-dev/pandas/issues/59342
-        left = f"StringDtype(storage={left.storage}, na_value={left.na_value})"
 
     if isinstance(right, np.ndarray):
         right = pprint_thing(right)
-    elif isinstance(right, (CategoricalDtype, NumpyEADtype)):
+    elif isinstance(right, (CategoricalDtype, StringDtype, NumpyEADtype)):
         right = repr(right)
-    elif isinstance(right, StringDtype):
-        right = f"StringDtype(storage={right.storage}, na_value={right.na_value})"
 
     msg += f"""
 [left]:  {left}
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -14,6 +14,7 @@
     lib,
     missing as libmissing,
 )
+from pandas.util._decorators import set_module
 
 from pandas.core.dtypes.common import is_list_like
 from pandas.core.dtypes.dtypes import register_extension_dtype
@@ -39,6 +40,7 @@
 
 
 @register_extension_dtype
+@set_module("pandas")
 class BooleanDtype(BaseMaskedDtype):
     """
     Extension dtype for boolean data.
diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py
@@ -8,6 +8,8 @@
 
 import numpy as np
 
+from pandas.util._decorators import set_module
+
 from pandas.core.dtypes.base import register_extension_dtype
 from pandas.core.dtypes.common import is_float_dtype
 
@@ -168,13 +170,15 @@ class FloatingArray(NumericArray):
 
 
 @register_extension_dtype
+@set_module("pandas")
 class Float32Dtype(FloatingDtype):
     type = np.float32
     name: ClassVar[str] = "Float32"
     __doc__ = _dtype_docstring.format(dtype="float32")
 
 
 @register_extension_dtype
+@set_module("pandas")
 class Float64Dtype(FloatingDtype):
     type = np.float64
     name: ClassVar[str] = "Float64"
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -8,6 +8,8 @@
 
 import numpy as np
 
+from pandas.util._decorators import set_module
+
 from pandas.core.dtypes.base import register_extension_dtype
 from pandas.core.dtypes.common import is_integer_dtype
 
@@ -218,55 +220,63 @@ class IntegerArray(NumericArray):
 
 
 @register_extension_dtype
+@set_module("pandas")
 class Int8Dtype(IntegerDtype):
     type = np.int8
     name: ClassVar[str] = "Int8"
     __doc__ = _dtype_docstring.format(dtype="int8")
 
 
 @register_extension_dtype
+@set_module("pandas")
 class Int16Dtype(IntegerDtype):
     type = np.int16
     name: ClassVar[str] = "Int16"
     __doc__ = _dtype_docstring.format(dtype="int16")
 
 
 @register_extension_dtype
+@set_module("pandas")
 class Int32Dtype(IntegerDtype):
     type = np.int32
     name: ClassVar[str] = "Int32"
     __doc__ = _dtype_docstring.format(dtype="int32")
 
 
 @register_extension_dtype
+@set_module("pandas")
 class Int64Dtype(IntegerDtype):
     type = np.int64
     name: ClassVar[str] = "Int64"
     __doc__ = _dtype_docstring.format(dtype="int64")
 
 
 @register_extension_dtype
+@set_module("pandas")
 class UInt8Dtype(IntegerDtype):
     type = np.uint8
     name: ClassVar[str] = "UInt8"
     __doc__ = _dtype_docstring.format(dtype="uint8")
 
 
 @register_extension_dtype
+@set_module("pandas")
 class UInt16Dtype(IntegerDtype):
     type = np.uint16
     name: ClassVar[str] = "UInt16"
     __doc__ = _dtype_docstring.format(dtype="uint16")
 
 
 @register_extension_dtype
+@set_module("pandas")
 class UInt32Dtype(IntegerDtype):
     type = np.uint32
     name: ClassVar[str] = "UInt32"
     __doc__ = _dtype_docstring.format(dtype="uint32")
 
 
 @register_extension_dtype
+@set_module("pandas")
 class UInt64Dtype(IntegerDtype):
     type = np.uint64
     name: ClassVar[str] = "UInt64"
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -123,7 +123,7 @@ class PandasExtensionDtype(ExtensionDtype):
     # problem dealing with multiple inheritance from PandasExtensionDtype
     # and ExtensionDtype's @properties in the subclasses below. The kind and
     # type variables in those subclasses are explicitly typed below.
-    subdtype = None
+    subdtype: DtypeObj | None = None
     str: str_type
     num = 100
     shape: tuple[int, ...] = ()
@@ -1604,7 +1604,7 @@ class BaseMaskedDtype(ExtensionDtype):
     Base class for dtypes for BaseMaskedArray subclasses.
     """
 
-    base = None
+    base: DtypeObj | None = None
     type: type
     _internal_fill_value: Scalar
 
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
@@ -45,6 +45,7 @@
 )
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
     from typing import TypeAlias
 
     from pandas import Index
@@ -548,7 +549,7 @@ def _interpolate_scipy_wrapper(
     new_x = np.asarray(new_x)
 
     # ignores some kwargs that could be passed along.
-    alt_methods = {
+    alt_methods: dict[str, Callable[..., np.ndarray]] = {
         "barycentric": interpolate.barycentric_interpolate,
         "krogh": interpolate.krogh_interpolate,
         "from_derivatives": _from_derivatives,
@@ -566,6 +567,7 @@ def _interpolate_scipy_wrapper(
         "cubic",
         "polynomial",
     ]
+    terp: Callable[..., np.ndarray] | None
     if method in interp1d_methods:
         if method == "polynomial":
             kind = order
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -693,6 +693,10 @@ def nanmean(
     >>> nanops.nanmean(s.values)
     np.float64(1.5)
     """
+    if values.dtype == object and len(values) > 1_000 and mask is None:
+        # GH#54754 if we are going to fail, try to fail-fast
+        nanmean(values[:1000], axis=axis, skipna=skipna)
+
     dtype = values.dtype
     values, mask = _get_values(values, skipna, fill_value=0, mask=mask)
     dtype_sum = _get_dtype_max(dtype)
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -128,8 +128,11 @@ def __init__(
 
         self.level = self.index._get_level_number(level)
 
-        # when index includes `nan`, need to lift levels/strides by 1
-        self.lift = 1 if -1 in self.index.codes[self.level] else 0
+        # `nan` values have code `-1`; when sorting, we lift the codes by 1 so
+        # that `nan` is assigned index 0
+        self.has_nan = -1 in self.index.codes[self.level]
+        should_lift = self.has_nan and self.sort
+        self.lift = 1 if should_lift else 0
 
         # Note: the "pop" below alters these in-place.
         self.new_index_levels = list(self.index.levels)
@@ -138,8 +141,16 @@ def __init__(
         self.removed_name = self.new_index_names.pop(self.level)
         self.removed_level = self.new_index_levels.pop(self.level)
         self.removed_level_full = index.levels[self.level]
+        self.unique_nan_index: int = -1
         if not self.sort:
-            unique_codes = unique(self.index.codes[self.level])
+            unique_codes: np.ndarray = unique(self.index.codes[self.level])
+            if self.has_nan:
+                # drop nan codes, because they are not represented in level
+                nan_mask = unique_codes == -1
+
+                unique_codes = unique_codes[~nan_mask]
+                self.unique_nan_index = np.flatnonzero(nan_mask)[0]
+
             self.removed_level = self.removed_level.take(unique_codes)
             self.removed_level_full = self.removed_level_full.take(unique_codes)
 
@@ -210,7 +221,7 @@ def _make_selectors(self) -> None:
         ngroups = len(obs_ids)
 
         comp_index = ensure_platform_int(comp_index)
-        stride = self.index.levshape[self.level] + self.lift
+        stride = self.index.levshape[self.level] + self.has_nan
         self.full_shape = ngroups, stride
 
         selector = self.sorted_labels[-1] + stride * comp_index + self.lift
@@ -362,13 +373,13 @@ def get_new_values(self, values, fill_value=None):
 
     def get_new_columns(self, value_columns: Index | None):
         if value_columns is None:
-            if self.lift == 0:
+            if not self.has_nan:
                 return self.removed_level._rename(name=self.removed_name)
 
             lev = self.removed_level.insert(0, item=self.removed_level._na_value)
             return lev.rename(self.removed_name)
 
-        stride = len(self.removed_level) + self.lift
+        stride = len(self.removed_level) + self.has_nan
         width = len(value_columns)
         propagator = np.repeat(np.arange(width), stride)
 
@@ -401,12 +412,21 @@ def _repeater(self) -> np.ndarray:
         if len(self.removed_level_full) != len(self.removed_level):
             # In this case, we remap the new codes to the original level:
             repeater = self.removed_level_full.get_indexer(self.removed_level)
-            if self.lift:
+            if self.has_nan:
+                # insert nan index at first position
                 repeater = np.insert(repeater, 0, -1)
         else:
             # Otherwise, we just use each level item exactly once:
-            stride = len(self.removed_level) + self.lift
+            stride = len(self.removed_level) + self.has_nan
             repeater = np.arange(stride) - self.lift
+            if self.has_nan and not self.sort:
+                assert self.unique_nan_index > -1, (
+                    "`unique_nan_index` not properly initialized"
+                )
+                # assign -1 at the position where nan occurs among the unique codes.
+                repeater[self.unique_nan_index] = -1
+                # shift later entries down by one, since nan has no entry in the level
+                repeater[self.unique_nan_index + 1 :] -= 1
 
         return repeater
 
diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py
@@ -524,7 +524,10 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
             # TODO: handle record value which are lists, at least error
             #       reasonably
             data = nested_to_record(data, sep=sep, max_level=max_level)
-        return DataFrame(data, index=index)
+        result = DataFrame(data, index=index)
+        if record_prefix is not None:
+            result = result.rename(columns=lambda x: f"{record_prefix}{x}")
+        return result
     elif not isinstance(record_path, list):
         record_path = [record_path]
 
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
@@ -260,7 +260,7 @@ def _get_ind(y: np.ndarray, ind):
 
     @classmethod
     # error: Signature of "_plot" incompatible with supertype "MPLPlot"
-    def _plot(  #  type: ignore[override]
+    def _plot(  # type: ignore[override]
         cls,
         ax: Axes,
         y: np.ndarray,
@@ -277,6 +277,8 @@ def _plot(  #  type: ignore[override]
         y = remove_na_arraylike(y)
         gkde = gaussian_kde(y, bw_method=bw_method, weights=weights)
 
+        # gaussian_kde.evaluate(None) raises TypeError, so pyright requires this check
+        assert ind is not None
         y = gkde.evaluate(ind)
         lines = MPLPlot._plot(ax, ind, y, style=style, **kwds)
         return lines
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
@@ -412,11 +412,23 @@ def test_util_in_top_level(self):
 def test_set_module():
     assert pd.DataFrame.__module__ == "pandas"
     assert pd.CategoricalDtype.__module__ == "pandas"
+    assert pd.DatetimeTZDtype.__module__ == "pandas"
     assert pd.PeriodDtype.__module__ == "pandas"
     assert pd.IntervalDtype.__module__ == "pandas"
     assert pd.SparseDtype.__module__ == "pandas"
     assert pd.ArrowDtype.__module__ == "pandas"
     assert pd.StringDtype.__module__ == "pandas"
+    assert pd.BooleanDtype.__module__ == "pandas"
+    assert pd.Int8Dtype.__module__ == "pandas"
+    assert pd.Int16Dtype.__module__ == "pandas"
+    assert pd.Int32Dtype.__module__ == "pandas"
+    assert pd.Int64Dtype.__module__ == "pandas"
+    assert pd.UInt8Dtype.__module__ == "pandas"
+    assert pd.UInt16Dtype.__module__ == "pandas"
+    assert pd.UInt32Dtype.__module__ == "pandas"
+    assert pd.UInt64Dtype.__module__ == "pandas"
+    assert pd.Float32Dtype.__module__ == "pandas"
+    assert pd.Float64Dtype.__module__ == "pandas"
     assert pd.Index.__module__ == "pandas"
     assert pd.CategoricalIndex.__module__ == "pandas"
     assert pd.DatetimeIndex.__module__ == "pandas"
diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
@@ -1386,6 +1386,43 @@ def test_unstack_sort_false(frame_or_series, dtype):
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "levels2, expected_columns",
+    [
+        (
+            [None, 1, 2, 3],
+            [("value", np.nan), ("value", 1), ("value", 2), ("value", 3)],
+        ),
+        (
+            [1, None, 2, 3],
+            [("value", 1), ("value", np.nan), ("value", 2), ("value", 3)],
+        ),
+        (
+            [1, 2, None, 3],
+            [("value", 1), ("value", 2), ("value", np.nan), ("value", 3)],
+        ),
+        (
+            [1, 2, 3, None],
+            [("value", 1), ("value", 2), ("value", 3), ("value", np.nan)],
+        ),
+    ],
+    ids=["nan=first", "nan=second", "nan=third", "nan=last"],
+)
+def test_unstack_sort_false_nan(levels2, expected_columns):
+    # GH#61221
+    levels1 = ["b", "a"]
+    index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
+    df = DataFrame({"value": [0, 1, 2, 3, 4, 5, 6, 7]}, index=index)
+    result = df.unstack(level="level2", sort=False)
+    expected_data = [[0, 4], [1, 5], [2, 6], [3, 7]]
+    expected = DataFrame(
+        dict(zip(expected_columns, expected_data)),
+        index=Index(["b", "a"], name="level1"),
+        columns=MultiIndex.from_tuples(expected_columns, names=[None, "level2"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 def test_unstack_fill_frame_object():
     # GH12815 Test unstacking with object.
     data = Series(["a", "b", "c", "a"], dtype="object")
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py
diff --git a/requirements-dev.txt b/requirements-dev.txt