Skip to content

Commit f282a3e

Browse files
authored
Merge branch 'main' into enh-excel-header-filter
2 parents 6c9feed + a329dc3 commit f282a3e

File tree

13 files changed

+314
-46
lines changed

13 files changed

+314
-46
lines changed

doc/source/user_guide/10min.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,9 @@ Setting a new column automatically aligns the data by the indexes:
318318

319319
.. ipython:: python
320320
321-
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
321+
s1 = pd.Series(
322+
[1, 2, 3, 4, 5, 6],
323+
index=pd.date_range("20130102", periods=6))
322324
s1
323325
df["F"] = s1
324326

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,7 @@ Numeric
10011001
- Bug in :meth:`Series.dot` returning ``object`` dtype for :class:`ArrowDtype` and nullable-dtype data (:issue:`61375`)
10021002
- Bug in :meth:`Series.std` and :meth:`Series.var` when using complex-valued data (:issue:`61645`)
10031003
- Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
1004+
- Bug in arithmetic operations between objects with numpy-nullable dtype and :class:`ArrowDtype` incorrectly raising (:issue:`58602`)
10041005

10051006
Conversion
10061007
^^^^^^^^^^
@@ -1056,6 +1057,7 @@ MultiIndex
10561057
- :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
10571058
- Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`)
10581059
- Bug in :class:`DataFrame` arithmetic operations with :class:`Series` in case of unaligned MultiIndex (:issue:`61009`)
1060+
- Bug in :meth:`MultiIndex.union` raising when indexes have duplicates with differing names (:issue:`62059`)
10591061
- Bug in :meth:`MultiIndex.from_tuples` causing wrong output with input of type tuples having NaN values (:issue:`60695`, :issue:`60988`)
10601062
- Bug in :meth:`DataFrame.__setitem__` where column alignment logic would reindex the assigned value with an empty index, incorrectly setting all values to ``NaN``. (:issue:`61841`)
10611063
- Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` where reindexing :class:`Index` to a :class:`MultiIndex` would incorrectly set all values to ``NaN``. (:issue:`60923`)

pandas/core/arrays/masked.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@
3737
is_string_dtype,
3838
pandas_dtype,
3939
)
40-
from pandas.core.dtypes.dtypes import BaseMaskedDtype
40+
from pandas.core.dtypes.dtypes import (
41+
ArrowDtype,
42+
BaseMaskedDtype,
43+
)
4144
from pandas.core.dtypes.missing import (
4245
array_equivalent,
4346
is_valid_na_for_dtype,
@@ -767,6 +770,10 @@ def _arith_method(self, other, op):
767770
pd_op = ops.get_array_op(op)
768771
other = ensure_wrapped_if_datetimelike(other)
769772

773+
if isinstance(other, ExtensionArray) and isinstance(other.dtype, ArrowDtype):
774+
# GH#58602
775+
return NotImplemented
776+
770777
if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
771778
# Avoid DeprecationWarning: In future, it will be an error
772779
# for 'np.bool_' scalars to be interpreted as an index
@@ -843,7 +850,11 @@ def _cmp_method(self, other, op) -> BooleanArray:
843850

844851
mask = None
845852

846-
if isinstance(other, BaseMaskedArray):
853+
if isinstance(other, ExtensionArray) and isinstance(other.dtype, ArrowDtype):
854+
# GH#58602
855+
return NotImplemented
856+
857+
elif isinstance(other, BaseMaskedArray):
847858
other, mask = other._data, other._mask
848859

849860
elif is_list_like(other):

pandas/core/arrays/sparse/array.py

Lines changed: 5 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -85,23 +85,17 @@
8585

8686
from pandas.io.formats import printing
8787

88-
# See https://github.com/python/typing/issues/684
8988
if TYPE_CHECKING:
9089
from collections.abc import (
9190
Callable,
9291
Sequence,
9392
)
94-
from enum import Enum
93+
from types import EllipsisType
9594
from typing import (
9695
Protocol,
9796
type_check_only,
9897
)
9998

100-
class ellipsis(Enum):
101-
Ellipsis = "..."
102-
103-
Ellipsis = ellipsis.Ellipsis
104-
10599
from scipy.sparse import (
106100
csc_array,
107101
csc_matrix,
@@ -134,10 +128,6 @@ def tocsc(self, /) -> csc_array | csc_matrix: ...
134128
from pandas import Series
135129

136130

137-
else:
138-
ellipsis = type(Ellipsis)
139-
140-
141131
# ----------------------------------------------------------------------------
142132
# Array
143133

@@ -974,31 +964,22 @@ def __getitem__(self, key: ScalarIndexer) -> Any: ...
974964
@overload
975965
def __getitem__(
976966
self,
977-
key: SequenceIndexer | tuple[int | ellipsis, ...],
967+
key: SequenceIndexer | tuple[int | EllipsisType, ...],
978968
) -> Self: ...
979969

980970
def __getitem__(
981971
self,
982-
key: PositionalIndexer | tuple[int | ellipsis, ...],
972+
key: PositionalIndexer | tuple[int | EllipsisType, ...],
983973
) -> Self | Any:
984974
if isinstance(key, tuple):
985975
key = unpack_tuple_and_ellipses(key)
986-
if key is Ellipsis:
976+
if key is ...:
987977
raise ValueError("Cannot slice with Ellipsis")
988978

989979
if is_integer(key):
990980
return self._get_val_at(key)
991981
elif isinstance(key, tuple):
992-
# error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
993-
# for "ndarray[Any, Any]"; expected type
994-
# "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
995-
# integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
996-
# Union[bool_, integer[Any]]]]], _NestedSequence[Union[
997-
# bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
998-
# dtype[Union[bool_, integer[Any]]]], _NestedSequence[
999-
# _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
1000-
# _NestedSequence[Union[bool, int]]], ...]]"
1001-
data_slice = self.to_dense()[key] # type: ignore[index]
982+
data_slice = self.to_dense()[key]
1002983
elif isinstance(key, slice):
1003984
# Avoid densifying when handling contiguous slices
1004985
if key.step is None or key.step == 1:

pandas/core/dtypes/dtypes.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2308,8 +2308,35 @@ def kind(self) -> str:
23082308

23092309
@cache_readonly
23102310
def itemsize(self) -> int:
2311-
"""Return the number of bytes in this dtype"""
2312-
return self.numpy_dtype.itemsize
2311+
"""
2312+
Return the number of bytes in this dtype.
2313+
2314+
For Arrow-backed dtypes:
2315+
- Returns the fixed-width bit size divided by 8 for standard fixed-width types.
2316+
- For boolean types, returns the NumPy itemsize.
2317+
- Falls back to the NumPy dtype itemsize for variable-width & unsupported types.
2318+
2319+
Examples
2320+
--------
2321+
>>> import pyarrow as pa
2322+
>>> import pandas as pd
2323+
>>> dtype = pd.ArrowDtype(pa.int32())
2324+
>>> dtype.itemsize
2325+
4
2326+
2327+
>>> dtype = pd.ArrowDtype(pa.bool_())
2328+
>>> dtype.itemsize # falls back to numpy dtype
2329+
1
2330+
"""
2331+
if pa.types.is_boolean(self.pyarrow_dtype):
2332+
return self.numpy_dtype.itemsize
2333+
2334+
# Use pyarrow itemsize for fixed-width data types
2335+
# e.g. int32 -> 32 bits // 8 = 4 bytes
2336+
try:
2337+
return self.pyarrow_dtype.bit_width // 8
2338+
except (ValueError, AttributeError, NotImplementedError):
2339+
return self.numpy_dtype.itemsize
23132340

23142341
def construct_array_type(self) -> type_t[ArrowExtensionArray]:
23152342
"""

pandas/core/generic.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6104,10 +6104,15 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
61046104
"""
61056105
Propagate metadata from other to self.
61066106
6107+
This is the default implementation. Subclasses may override this method to
6108+
implement their own metadata handling.
6109+
61076110
Parameters
61086111
----------
61096112
other : the object from which to get the attributes that we are going
6110-
to propagate
6113+
to propagate. If ``other`` has an ``input_objs`` attribute, then
6114+
this attribute must contain an iterable of objects, each with an
6115+
``attrs`` attribute.
61116116
method : str, optional
61126117
A passed method name providing context on where ``__finalize__``
61136118
was called.
@@ -6116,6 +6121,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
61166121
61176122
The value passed as `method` are not currently considered
61186123
stable across pandas releases.
6124+
6125+
Notes
6126+
-----
6127+
In case ``other`` has an ``input_objs`` attribute, this method only
6128+
propagates its metadata if each object in ``input_objs`` has the exact
6129+
same metadata as the others.
61196130
"""
61206131
if isinstance(other, NDFrame):
61216132
if other.attrs:

pandas/core/indexes/multi.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3946,7 +3946,9 @@ def _union(self, other, sort) -> MultiIndex:
39463946
if other.has_duplicates:
39473947
# This is only necessary if other has dupes,
39483948
# otherwise difference is faster
3949-
result = super()._union(other, sort)
3949+
result = super(MultiIndex, self.rename(result_names))._union(
3950+
other.rename(result_names), sort
3951+
)
39503952

39513953
if isinstance(result, MultiIndex):
39523954
return result

pandas/core/nanops.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,7 +1366,7 @@ def nankurt(
13661366
# With a few modifications, like using the maximum value instead of the averages
13671367
# and some adaptations because they use the average and we use the sum for `m2`.
13681368
# We need to estimate an upper bound to the error to consider the data constant.
1369-
# Lets call:
1369+
# Let's call:
13701370
# x: true value in data
13711371
# y: floating point representation
13721372
# e: relative approximation error
@@ -1377,7 +1377,7 @@ def nankurt(
13771377
# (|x - y|/|x|)² <= e²
13781378
# Σ (|x - y|/|x|)² <= ne²
13791379
#
1380-
# Lets say that the fperr upper bound for m2 is constrained by the summation.
1380+
# Let's say that the fperr upper bound for m2 is constrained by the summation.
13811381
# |m2 - y|/|m2| <= ne²
13821382
# |m2 - y| <= n|m2|e²
13831383
#

pandas/core/reshape/merge.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1129,16 +1129,15 @@ def _reindex_and_concat(
11291129
return result
11301130

11311131
def get_result(self) -> DataFrame:
1132+
"""
1133+
Execute the merge.
1134+
"""
11321135
if self.indicator:
11331136
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
11341137

11351138
join_index, left_indexer, right_indexer = self._get_join_info()
11361139

11371140
result = self._reindex_and_concat(join_index, left_indexer, right_indexer)
1138-
result = result.__finalize__(
1139-
types.SimpleNamespace(input_objs=[self.left, self.right]),
1140-
method=self._merge_type,
1141-
)
11421141

11431142
if self.indicator:
11441143
result = self._indicator_post_merge(result)
@@ -1167,6 +1166,13 @@ def _indicator_name(self) -> str | None:
11671166
def _indicator_pre_merge(
11681167
self, left: DataFrame, right: DataFrame
11691168
) -> tuple[DataFrame, DataFrame]:
1169+
"""
1170+
Add one indicator column to each of the left and right inputs.
1171+
1172+
These columns are used to produce another column in the output of the
1173+
merge, indicating for each row of the output whether it was produced
1174+
using the left, right or both inputs.
1175+
"""
11701176
columns = left.columns.union(right.columns)
11711177

11721178
for i in ["_left_indicator", "_right_indicator"]:
@@ -1193,6 +1199,12 @@ def _indicator_pre_merge(
11931199

11941200
@final
11951201
def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
1202+
"""
1203+
Add an indicator column to the merge result.
1204+
1205+
This column indicates for each row of the output whether it was produced using
1206+
the left, right or both inputs.
1207+
"""
11961208
result["_left_indicator"] = result["_left_indicator"].fillna(0)
11971209
result["_right_indicator"] = result["_right_indicator"].fillna(0)
11981210

pandas/tests/extension/test_arrow.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3702,6 +3702,94 @@ def test_pow_with_all_na_float():
37023702
tm.assert_series_equal(result, expected)
37033703

37043704

3705+
def test_mul_numpy_nullable_with_pyarrow_float():
3706+
# GH#58602
3707+
left = pd.Series(range(5), dtype="Float64")
3708+
right = pd.Series(range(5), dtype="float64[pyarrow]")
3709+
3710+
expected = pd.Series([0, 1, 4, 9, 16], dtype="float64[pyarrow]")
3711+
3712+
result = left * right
3713+
tm.assert_series_equal(result, expected)
3714+
3715+
result2 = right * left
3716+
tm.assert_series_equal(result2, expected)
3717+
3718+
# while we're here, let's check __eq__
3719+
result3 = left == right
3720+
expected3 = pd.Series([True] * 5, dtype="bool[pyarrow]")
3721+
tm.assert_series_equal(result3, expected3)
3722+
3723+
result4 = right == left
3724+
tm.assert_series_equal(result4, expected3)
3725+
3726+
3727+
@pytest.mark.parametrize(
3728+
"type_name, expected_size",
3729+
[
3730+
# Integer types
3731+
("int8", 1),
3732+
("int16", 2),
3733+
("int32", 4),
3734+
("int64", 8),
3735+
("uint8", 1),
3736+
("uint16", 2),
3737+
("uint32", 4),
3738+
("uint64", 8),
3739+
# Floating point types
3740+
("float16", 2),
3741+
("float32", 4),
3742+
("float64", 8),
3743+
# Boolean
3744+
("bool_", 1),
3745+
# Date and timestamp types
3746+
("date32", 4),
3747+
("date64", 8),
3748+
("timestamp", 8),
3749+
# Time types
3750+
("time32", 4),
3751+
("time64", 8),
3752+
# Decimal types
3753+
("decimal128", 16),
3754+
("decimal256", 32),
3755+
],
3756+
)
3757+
def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size):
3758+
# GH 57948
3759+
3760+
parametric_type_map = {
3761+
"timestamp": pa.timestamp("ns"),
3762+
"time32": pa.time32("s"),
3763+
"time64": pa.time64("ns"),
3764+
"decimal128": pa.decimal128(38, 10),
3765+
"decimal256": pa.decimal256(76, 10),
3766+
}
3767+
3768+
if type_name in parametric_type_map:
3769+
arrow_type = parametric_type_map.get(type_name)
3770+
else:
3771+
arrow_type = getattr(pa, type_name)()
3772+
dtype = ArrowDtype(arrow_type)
3773+
3774+
if type_name == "bool_":
3775+
expected_size = dtype.numpy_dtype.itemsize
3776+
3777+
assert dtype.itemsize == expected_size, (
3778+
f"{type_name} expected {expected_size}, got {dtype.itemsize} "
3779+
f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})"
3780+
)
3781+
3782+
3783+
@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"])
3784+
def test_arrow_dtype_itemsize_variable_width(type_name):
3785+
# GH 57948
3786+
3787+
arrow_type = getattr(pa, type_name)()
3788+
dtype = ArrowDtype(arrow_type)
3789+
3790+
assert dtype.itemsize == dtype.numpy_dtype.itemsize
3791+
3792+
37053793
def test_cast_pontwise_result_decimal_nan():
37063794
# GH#62522 we don't want to get back null[pyarrow] here
37073795
ser = pd.Series([], dtype="float64[pyarrow]")

0 commit comments

Comments
 (0)