Skip to content

Commit f282a3e

Browse files
authored
Merge branch 'main' into enh-excel-header-filter
2 parents 6c9feed + a329dc3 commit f282a3e

File tree

13 files changed

+314
-46
lines changed

13 files changed

+314
-46
lines changed

doc/source/user_guide/10min.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,9 @@ Setting a new column automatically aligns the data by the indexes:
318318

319319
.. ipython:: python
320320
321-
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
321+
s1 = pd.Series(
322+
[1, 2, 3, 4, 5, 6],
323+
index=pd.date_range("20130102", periods=6))
322324
s1
323325
df["F"] = s1
324326

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,7 @@ Numeric
10011001
- Bug in :meth:`Series.dot` returning ``object`` dtype for :class:`ArrowDtype` and nullable-dtype data (:issue:`61375`)
10021002
- Bug in :meth:`Series.std` and :meth:`Series.var` when using complex-valued data (:issue:`61645`)
10031003
- Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
1004+
- Bug in arithmetic operations between objects with numpy-nullable dtype and :class:`ArrowDtype` incorrectly raising (:issue:`58602`)
10041005

10051006
Conversion
10061007
^^^^^^^^^^
@@ -1056,6 +1057,7 @@ MultiIndex
10561057
- :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
10571058
- Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`)
10581059
- Bug in :class:`DataFrame` arithmetic operations with :class:`Series` in case of unaligned MultiIndex (:issue:`61009`)
1060+
- Bug in :meth:`MultiIndex.union` raising when indexes have duplicates with differing names (:issue:`62059`)
10591061
- Bug in :meth:`MultiIndex.from_tuples` causing wrong output with input of type tuples having NaN values (:issue:`60695`, :issue:`60988`)
10601062
- Bug in :meth:`DataFrame.__setitem__` where column alignment logic would reindex the assigned value with an empty index, incorrectly setting all values to ``NaN``. (:issue:`61841`)
10611063
- Bug in :meth:`DataFrame.reindex` and :meth:`Series.reindex` where reindexing :class:`Index` to a :class:`MultiIndex` would incorrectly set all values to ``NaN``. (:issue:`60923`)

pandas/core/arrays/masked.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@
3737
is_string_dtype,
3838
pandas_dtype,
3939
)
40-
from pandas.core.dtypes.dtypes import BaseMaskedDtype
40+
from pandas.core.dtypes.dtypes import (
41+
ArrowDtype,
42+
BaseMaskedDtype,
43+
)
4144
from pandas.core.dtypes.missing import (
4245
array_equivalent,
4346
is_valid_na_for_dtype,
@@ -767,6 +770,10 @@ def _arith_method(self, other, op):
767770
pd_op = ops.get_array_op(op)
768771
other = ensure_wrapped_if_datetimelike(other)
769772

773+
if isinstance(other, ExtensionArray) and isinstance(other.dtype, ArrowDtype):
774+
# GH#58602
775+
return NotImplemented
776+
770777
if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
771778
# Avoid DeprecationWarning: In future, it will be an error
772779
# for 'np.bool_' scalars to be interpreted as an index
@@ -843,7 +850,11 @@ def _cmp_method(self, other, op) -> BooleanArray:
843850

844851
mask = None
845852

846-
if isinstance(other, BaseMaskedArray):
853+
if isinstance(other, ExtensionArray) and isinstance(other.dtype, ArrowDtype):
854+
# GH#58602
855+
return NotImplemented
856+
857+
elif isinstance(other, BaseMaskedArray):
847858
other, mask = other._data, other._mask
848859

849860
elif is_list_like(other):

pandas/core/arrays/sparse/array.py

Lines changed: 5 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -85,23 +85,17 @@
8585

8686
from pandas.io.formats import printing
8787

88-
# See https://github.com/python/typing/issues/684
8988
if TYPE_CHECKING:
9089
from collections.abc import (
9190
Callable,
9291
Sequence,
9392
)
94-
from enum import Enum
93+
from types import EllipsisType
9594
from typing import (
9695
Protocol,
9796
type_check_only,
9897
)
9998

100-
class ellipsis(Enum):
101-
Ellipsis = "..."
102-
103-
Ellipsis = ellipsis.Ellipsis
104-
10599
from scipy.sparse import (
106100
csc_array,
107101
csc_matrix,
@@ -134,10 +128,6 @@ def tocsc(self, /) -> csc_array | csc_matrix: ...
134128
from pandas import Series
135129

136130

137-
else:
138-
ellipsis = type(Ellipsis)
139-
140-
141131
# ----------------------------------------------------------------------------
142132
# Array
143133

@@ -974,31 +964,22 @@ def __getitem__(self, key: ScalarIndexer) -> Any: ...
974964
@overload
975965
def __getitem__(
976966
self,
977-
key: SequenceIndexer | tuple[int | ellipsis, ...],
967+
key: SequenceIndexer | tuple[int | EllipsisType, ...],
978968
) -> Self: ...
979969

980970
def __getitem__(
981971
self,
982-
key: PositionalIndexer | tuple[int | ellipsis, ...],
972+
key: PositionalIndexer | tuple[int | EllipsisType, ...],
983973
) -> Self | Any:
984974
if isinstance(key, tuple):
985975
key = unpack_tuple_and_ellipses(key)
986-
if key is Ellipsis:
976+
if key is ...:
987977
raise ValueError("Cannot slice with Ellipsis")
988978

989979
if is_integer(key):
990980
return self._get_val_at(key)
991981
elif isinstance(key, tuple):
992-
# error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
993-
# for "ndarray[Any, Any]"; expected type
994-
# "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
995-
# integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
996-
# Union[bool_, integer[Any]]]]], _NestedSequence[Union[
997-
# bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
998-
# dtype[Union[bool_, integer[Any]]]], _NestedSequence[
999-
# _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
1000-
# _NestedSequence[Union[bool, int]]], ...]]"
1001-
data_slice = self.to_dense()[key] # type: ignore[index]
982+
data_slice = self.to_dense()[key]
1002983
elif isinstance(key, slice):
1003984
# Avoid densifying when handling contiguous slices
1004985
if key.step is None or key.step == 1:

pandas/core/dtypes/dtypes.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2308,8 +2308,35 @@ def kind(self) -> str:
23082308

23092309
@cache_readonly
23102310
def itemsize(self) -> int:
2311-
"""Return the number of bytes in this dtype"""
2312-
return self.numpy_dtype.itemsize
2311+
"""
2312+
Return the number of bytes in this dtype.
2313+
2314+
For Arrow-backed dtypes:
2315+
- Returns the fixed-width bit size divided by 8 for standard fixed-width types.
2316+
- For boolean types, returns the NumPy itemsize.
2317+
- Falls back to the NumPy dtype itemsize for variable-width & unsupported types.
2318+
2319+
Examples
2320+
--------
2321+
>>> import pyarrow as pa
2322+
>>> import pandas as pd
2323+
>>> dtype = pd.ArrowDtype(pa.int32())
2324+
>>> dtype.itemsize
2325+
4
2326+
2327+
>>> dtype = pd.ArrowDtype(pa.bool_())
2328+
>>> dtype.itemsize # falls back to numpy dtype
2329+
1
2330+
"""
2331+
if pa.types.is_boolean(self.pyarrow_dtype):
2332+
return self.numpy_dtype.itemsize
2333+
2334+
# Use pyarrow itemsize for fixed-width data types
2335+
# e.g. int32 -> 32 bits // 8 = 4 bytes
2336+
try:
2337+
return self.pyarrow_dtype.bit_width // 8
2338+
except (ValueError, AttributeError, NotImplementedError):
2339+
return self.numpy_dtype.itemsize
23132340

23142341
def construct_array_type(self) -> type_t[ArrowExtensionArray]:
23152342
"""

pandas/core/generic.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6104,10 +6104,15 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
61046104
"""
61056105
Propagate metadata from other to self.
61066106
6107+
This is the default implementation. Subclasses may override this method to
6108+
implement their own metadata handling.
6109+
61076110
Parameters
61086111
----------
61096112
other : the object from which to get the attributes that we are going
6110-
to propagate
6113+
to propagate. If ``other`` has an ``input_objs`` attribute, then
6114+
this attribute must contain an iterable of objects, each with an
6115+
``attrs`` attribute.
61116116
method : str, optional
61126117
A passed method name providing context on where ``__finalize__``
61136118
was called.
@@ -6116,6 +6121,12 @@ def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
61166121
61176122
The value passed as `method` are not currently considered
61186123
stable across pandas releases.
6124+
6125+
Notes
6126+
-----
6127+
In case ``other`` has an ``input_objs`` attribute, this method only
6128+
propagates its metadata if each object in ``input_objs`` has the exact
6129+
same metadata as the others.
61196130
"""
61206131
if isinstance(other, NDFrame):
61216132
if other.attrs:

pandas/core/indexes/multi.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3946,7 +3946,9 @@ def _union(self, other, sort) -> MultiIndex:
39463946
if other.has_duplicates:
39473947
# This is only necessary if other has dupes,
39483948
# otherwise difference is faster
3949-
result = super()._union(other, sort)
3949+
result = super(MultiIndex, self.rename(result_names))._union(
3950+
other.rename(result_names), sort
3951+
)
39503952

39513953
if isinstance(result, MultiIndex):
39523954
return result

pandas/core/nanops.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,7 +1366,7 @@ def nankurt(
13661366
# With a few modifications, like using the maximum value instead of the averages
13671367
# and some adaptations because they use the average and we use the sum for `m2`.
13681368
# We need to estimate an upper bound to the error to consider the data constant.
1369-
# Lets call:
1369+
# Let's call:
13701370
# x: true value in data
13711371
# y: floating point representation
13721372
# e: relative approximation error
@@ -1377,7 +1377,7 @@ def nankurt(
13771377
# (|x - y|/|x|)² <= e²
13781378
# Σ (|x - y|/|x|)² <= ne²
13791379
#
1380-
# Lets say that the fperr upper bound for m2 is constrained by the summation.
1380+
# Let's say that the fperr upper bound for m2 is constrained by the summation.
13811381
# |m2 - y|/|m2| <= ne²
13821382
# |m2 - y| <= n|m2|e²
13831383
#

pandas/core/reshape/merge.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1129,16 +1129,15 @@ def _reindex_and_concat(
11291129
return result
11301130

11311131
def get_result(self) -> DataFrame:
1132+
"""
1133+
Execute the merge.
1134+
"""
11321135
if self.indicator:
11331136
self.left, self.right = self._indicator_pre_merge(self.left, self.right)
11341137

11351138
join_index, left_indexer, right_indexer = self._get_join_info()
11361139

11371140
result = self._reindex_and_concat(join_index, left_indexer, right_indexer)
1138-
result = result.__finalize__(
1139-
types.SimpleNamespace(input_objs=[self.left, self.right]),
1140-
method=self._merge_type,
1141-
)
11421141

11431142
if self.indicator:
11441143
result = self._indicator_post_merge(result)
@@ -1167,6 +1166,13 @@ def _indicator_name(self) -> str | None:
11671166
def _indicator_pre_merge(
11681167
self, left: DataFrame, right: DataFrame
11691168
) -> tuple[DataFrame, DataFrame]:
1169+
"""
1170+
Add one indicator column to each of the left and right inputs.
1171+
1172+
These columns are used to produce another column in the output of the
1173+
merge, indicating for each row of the output whether it was produced
1174+
using the left, right or both inputs.
1175+
"""
11701176
columns = left.columns.union(right.columns)
11711177

11721178
for i in ["_left_indicator", "_right_indicator"]:
@@ -1193,6 +1199,12 @@ def _indicator_pre_merge(
11931199

11941200
@final
11951201
def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
1202+
"""
1203+
Add an indicator column to the merge result.
1204+
1205+
This column indicates for each row of the output whether it was produced using
1206+
the left, right or both inputs.
1207+
"""
11961208
result["_left_indicator"] = result["_left_indicator"].fillna(0)
11971209
result["_right_indicator"] = result["_right_indicator"].fillna(0)
11981210

pandas/tests/extension/test_arrow.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3702,6 +3702,94 @@ def test_pow_with_all_na_float():
37023702
tm.assert_series_equal(result, expected)
37033703

37043704

3705+
def test_mul_numpy_nullable_with_pyarrow_float():
3706+
# GH#58602
3707+
left = pd.Series(range(5), dtype="Float64")
3708+
right = pd.Series(range(5), dtype="float64[pyarrow]")
3709+
3710+
expected = pd.Series([0, 1, 4, 9, 16], dtype="float64[pyarrow]")
3711+
3712+
result = left * right
3713+
tm.assert_series_equal(result, expected)
3714+
3715+
result2 = right * left
3716+
tm.assert_series_equal(result2, expected)
3717+
3718+
# while we're here, let's check __eq__
3719+
result3 = left == right
3720+
expected3 = pd.Series([True] * 5, dtype="bool[pyarrow]")
3721+
tm.assert_series_equal(result3, expected3)
3722+
3723+
result4 = right == left
3724+
tm.assert_series_equal(result4, expected3)
3725+
3726+
3727+
@pytest.mark.parametrize(
3728+
"type_name, expected_size",
3729+
[
3730+
# Integer types
3731+
("int8", 1),
3732+
("int16", 2),
3733+
("int32", 4),
3734+
("int64", 8),
3735+
("uint8", 1),
3736+
("uint16", 2),
3737+
("uint32", 4),
3738+
("uint64", 8),
3739+
# Floating point types
3740+
("float16", 2),
3741+
("float32", 4),
3742+
("float64", 8),
3743+
# Boolean
3744+
("bool_", 1),
3745+
# Date and timestamp types
3746+
("date32", 4),
3747+
("date64", 8),
3748+
("timestamp", 8),
3749+
# Time types
3750+
("time32", 4),
3751+
("time64", 8),
3752+
# Decimal types
3753+
("decimal128", 16),
3754+
("decimal256", 32),
3755+
],
3756+
)
3757+
def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size):
3758+
# GH 57948
3759+
3760+
parametric_type_map = {
3761+
"timestamp": pa.timestamp("ns"),
3762+
"time32": pa.time32("s"),
3763+
"time64": pa.time64("ns"),
3764+
"decimal128": pa.decimal128(38, 10),
3765+
"decimal256": pa.decimal256(76, 10),
3766+
}
3767+
3768+
if type_name in parametric_type_map:
3769+
arrow_type = parametric_type_map.get(type_name)
3770+
else:
3771+
arrow_type = getattr(pa, type_name)()
3772+
dtype = ArrowDtype(arrow_type)
3773+
3774+
if type_name == "bool_":
3775+
expected_size = dtype.numpy_dtype.itemsize
3776+
3777+
assert dtype.itemsize == expected_size, (
3778+
f"{type_name} expected {expected_size}, got {dtype.itemsize} "
3779+
f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})"
3780+
)
3781+
3782+
3783+
@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"])
3784+
def test_arrow_dtype_itemsize_variable_width(type_name):
3785+
# GH 57948
3786+
3787+
arrow_type = getattr(pa, type_name)()
3788+
dtype = ArrowDtype(arrow_type)
3789+
3790+
assert dtype.itemsize == dtype.numpy_dtype.itemsize
3791+
3792+
37053793
def test_cast_pontwise_result_decimal_nan():
37063794
# GH#62522 we don't want to get back null[pyarrow] here
37073795
ser = pd.Series([], dtype="float64[pyarrow]")

0 commit comments

Comments
 (0)