From f7be1ae945079c763aa7d7135b3e4e6e518c0ec2 Mon Sep 17 00:00:00 2001 From: Abhijit Chakraborty <99414748+mraabhijit@users.noreply.github.com> Date: Sat, 18 Oct 2025 23:46:00 +0530 Subject: [PATCH] FIX: itemsize wrong for date32[day][pyarrow] dtype #57948 (#62657) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Yaswanth Kumar <155723049+VYaswanthKumar@users.noreply.github.com> Co-authored-by: Navya Srivastava <143343265+Navya1707@users.noreply.github.com> Co-authored-by: krishna datta <19500807+krishna-datta@users.noreply.github.com> Co-authored-by: ZA1815 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akashisang <151737560+Akashisang@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Co-authored-by: BreezeLune <1066773178@qq.com> Co-authored-by: jbrockmendel Co-authored-by: Aokizy2 <3441854632@qq.com> Co-authored-by: aokizy <14817191+aokizy2@user.noreply.gitee.com> Co-authored-by: Sumeet Bhatnagar <69593471+nemo-1999@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/masked.py | 15 ++++- pandas/core/dtypes/dtypes.py | 31 +++++++++- pandas/tests/extension/test_arrow.py | 88 ++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 016e553cf2092..470129d6d860b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1001,6 +1001,7 @@ Numeric - Bug in :meth:`Series.dot` returning ``object`` dtype for :class:`ArrowDtype` and nullable-dtype data (:issue:`61375`) - Bug in :meth:`Series.std` and :meth:`Series.var` when using complex-valued data (:issue:`61645`) - Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`) +- Bug in arithmetic operations between objects with numpy-nullable dtype and :class:`ArrowDtype` incorrectly raising (:issue:`58602`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 57efde1a928bc..cdba53662e6fa 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -37,7 +37,10 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import BaseMaskedDtype +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + BaseMaskedDtype, +) from pandas.core.dtypes.missing import ( array_equivalent, is_valid_na_for_dtype, @@ -767,6 +770,10 @@ def _arith_method(self, other, op): pd_op = ops.get_array_op(op) other = ensure_wrapped_if_datetimelike(other) + if isinstance(other, ExtensionArray) and isinstance(other.dtype, ArrowDtype): + # GH#58602 + return NotImplemented + if op_name in {"pow", "rpow"} and isinstance(other, np.bool_): # Avoid DeprecationWarning: In future, it will be an error # for 'np.bool_' scalars to be interpreted as an index @@ -843,7 +850,11 @@ def _cmp_method(self, other, op) -> BooleanArray: mask = None - if isinstance(other, BaseMaskedArray): + if isinstance(other, ExtensionArray) and isinstance(other.dtype, ArrowDtype): + # GH#58602 + return NotImplemented + + elif isinstance(other, BaseMaskedArray): other, mask = other._data, other._mask elif is_list_like(other): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1e6761b2e1db0..6d99f9df73282 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2308,8 +2308,35 @@ def kind(self) -> str: @cache_readonly def itemsize(self) -> int: - """Return the number of bytes in this dtype""" - return self.numpy_dtype.itemsize + """ + Return the number of bytes in this dtype. + + For Arrow-backed dtypes: + - Returns the fixed-width bit size divided by 8 for standard fixed-width types. + - For boolean types, returns the NumPy itemsize. + - Falls back to the NumPy dtype itemsize for variable-width & unsupported types. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> dtype = pd.ArrowDtype(pa.int32()) + >>> dtype.itemsize + 4 + + >>> dtype = pd.ArrowDtype(pa.bool_()) + >>> dtype.itemsize # falls back to numpy dtype + 1 + """ + if pa.types.is_boolean(self.pyarrow_dtype): + return self.numpy_dtype.itemsize + + # Use pyarrow itemsize for fixed-width data types + # e.g. int32 -> 32 bits // 8 = 4 bytes + try: + return self.pyarrow_dtype.bit_width // 8 + except (ValueError, AttributeError, NotImplementedError): + return self.numpy_dtype.itemsize def construct_array_type(self) -> type_t[ArrowExtensionArray]: """ diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f39f7acdc57bf..2aa1b658fdf7b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3702,6 +3702,94 @@ def test_pow_with_all_na_float(): tm.assert_series_equal(result, expected) +def test_mul_numpy_nullable_with_pyarrow_float(): + # GH#58602 + left = pd.Series(range(5), dtype="Float64") + right = pd.Series(range(5), dtype="float64[pyarrow]") + + expected = pd.Series([0, 1, 4, 9, 16], dtype="float64[pyarrow]") + + result = left * right + tm.assert_series_equal(result, expected) + + result2 = right * left + tm.assert_series_equal(result2, expected) + + # while we're here, let's check __eq__ + result3 = left == right + expected3 = pd.Series([True] * 5, dtype="bool[pyarrow]") + tm.assert_series_equal(result3, expected3) + + result4 = right == left + tm.assert_series_equal(result4, expected3) + + +@pytest.mark.parametrize( + "type_name, expected_size", + [ + # Integer types + ("int8", 1), + ("int16", 2), + ("int32", 4), + ("int64", 8), + ("uint8", 1), + ("uint16", 2), + ("uint32", 4), + ("uint64", 8), + # Floating point types + ("float16", 2), + ("float32", 4), + ("float64", 8), + # Boolean + ("bool_", 1), + # Date and timestamp types + ("date32", 4), + ("date64", 8), + ("timestamp", 8), + # Time types + ("time32", 4), + ("time64", 8), + # Decimal types + ("decimal128", 16), + ("decimal256", 32), + ], +) +def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size): + # GH 57948 + + parametric_type_map = { + "timestamp": pa.timestamp("ns"), + "time32": pa.time32("s"), + "time64": pa.time64("ns"), + "decimal128": pa.decimal128(38, 10), + "decimal256": pa.decimal256(76, 10), + } + + if type_name in parametric_type_map: + arrow_type = parametric_type_map.get(type_name) + else: + arrow_type = getattr(pa, type_name)() + dtype = ArrowDtype(arrow_type) + + if type_name == "bool_": + expected_size = dtype.numpy_dtype.itemsize + + assert dtype.itemsize == expected_size, ( + f"{type_name} expected {expected_size}, got {dtype.itemsize} " + f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})" + ) + + +@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"]) +def test_arrow_dtype_itemsize_variable_width(type_name): + # GH 57948 + + arrow_type = getattr(pa, type_name)() + dtype = ArrowDtype(arrow_type) + + assert dtype.itemsize == dtype.numpy_dtype.itemsize + + def test_cast_pontwise_result_decimal_nan(): # GH#62522 we don't want to get back null[pyarrow] here ser = pd.Series([], dtype="float64[pyarrow]")