diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 03ad8ed162c95..0db7870255b69 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -889,6 +889,8 @@ Datetimelike - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) - Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) +- Removed the special casing for sequences of Python ``date`` objects in ``DatetimeIndex.get_indexer`` and related indexing logic. + Python ``date`` objects no longer match the corresponding midnight :class:`Timestamp` entries of a ``DatetimeIndex``; for example, ``DatetimeIndex.get_indexer`` now returns ``-1`` for them, as it does for any other non-matching label. 
(:issue:`62158`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 59ac122e4f9ea..de54ae81641cf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,9 @@ from __future__ import annotations from collections import abc -from datetime import datetime +from datetime import ( + datetime, +) import functools from itertools import zip_longest import operator @@ -39,7 +41,6 @@ no_default, ) from pandas._libs.tslibs import ( - OutOfBoundsDatetime, Timestamp, tz_compare, ) @@ -6204,11 +6205,6 @@ def _maybe_downcast_for_indexing(self, other: Index) -> tuple[Index, Index]: # standardize on UTC return self.tz_convert("UTC"), other.tz_convert("UTC") - elif self.inferred_type == "date" and isinstance(other, ABCDatetimeIndex): - try: - return type(other)(self), other - except OutOfBoundsDatetime: - return self, other elif self.inferred_type == "timedelta" and isinstance(other, ABCTimedeltaIndex): # TODO: we dont have tests that get here return type(other)(self), other @@ -6309,6 +6305,29 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return dtype.kind == "b" elif is_numeric_dtype(self.dtype): return is_numeric_dtype(dtype) + # GH#62158 + elif isinstance(dtype, ArrowDtype): + import pyarrow as pa + + pa_dtype = dtype.pyarrow_dtype + if dtype.kind != "M": + if self.dtype.kind == "b": + return dtype.kind == "b" + if is_numeric_dtype(self.dtype): + return pa.types.is_integer(pa_dtype) or pa.types.is_floating( + pa_dtype + ) + if self.dtype.kind == "m" and pa.types.is_duration(pa_dtype): + return True + return False + if self.dtype.kind != "M": + return False + if pa.types.is_date(pa_dtype): + return False + if pa.types.is_timestamp(pa_dtype): + if (pa_dtype.tz is None) ^ (getattr(self, "tz", None) is None): + return False + return True # TODO: this was written assuming we only get here with object-dtype, # which is no longer correct. Can we specialize for EA? 
return True diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 0c99b08cb30c4..3545c6a23ee6c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1880,6 +1880,30 @@ def test_add_new_column_infer_string(): tm.assert_frame_equal(df, expected) +def test_datetime_indexer_consistency_pyarrow_date32(): + # GH#62158 + pytest.importorskip("pyarrow", minversion="13.0.0") + import pyarrow as pa + + ser = Series(["2016-01-01"], dtype="date32[pyarrow]") + ser3 = ser.astype("datetime64[ns]") + dti = Index(ser3) + + # Make sure we don't treat Arrow date as timestamp + dtype = ser.dtype.pyarrow_dtype + assert not (pa.types.is_timestamp(dtype) and not pa.types.is_date(dtype)) + + with pytest.raises(KeyError): + dti.get_loc(ser[0]) + + # get_indexer returns -1 for both Arrow array and object-cast + result = dti.get_indexer(ser.values) + tm.assert_numpy_array_equal(result, np.array([-1], dtype=np.intp)) + + result_obj = dti.get_indexer(ser.values.astype(object)) + tm.assert_numpy_array_equal(result_obj, np.array([-1], dtype=np.intp)) + + class TestSetitemValidation: # This is adapted from pandas/tests/arrays/masked/test_indexing.py def _check_setitem_invalid(self, df, invalid, indexer): diff --git a/pandas/tests/frame/methods/test_asfreq.py b/pandas/tests/frame/methods/test_asfreq.py index 1c3c41e2e0299..727011ece9dc7 100644 --- a/pandas/tests/frame/methods/test_asfreq.py +++ b/pandas/tests/frame/methods/test_asfreq.py @@ -192,9 +192,10 @@ def test_asfreq_with_date_object_index(self, frame_or_series): ts2 = ts.copy() ts2.index = [x.date() for x in ts2.index] - result = ts2.asfreq("4h", method="ffill") - expected = ts.asfreq("4h", method="ffill") - tm.assert_equal(result, expected) + with pytest.raises( + TypeError, match="Cannot compare Timestamp with datetime.date" + ): + ts2.asfreq("4h", method="ffill") def test_asfreq_with_unsorted_index(self, frame_or_series): # 
GH#39805 diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index c44345273466c..0bc5c7006cf7b 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -514,10 +514,11 @@ def test_contains_nonunique(self, vals): class TestGetIndexer: def test_get_indexer_date_objs(self): + # Behavior for get_indexer with date objects changed in GH#62158. rng = date_range("1/1/2000", periods=20) result = rng.get_indexer(rng.map(lambda x: x.date())) - expected = rng.get_indexer(rng) + expected = np.full(len(rng), -1, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) def test_get_indexer(self): @@ -562,17 +563,22 @@ def test_get_indexer(self): idx.get_indexer(idx[[0]], method="nearest", tolerance="foo") @pytest.mark.parametrize( - "target", + "target, expected", [ - [date(2020, 1, 1), Timestamp("2020-01-02")], - [Timestamp("2020-01-01"), date(2020, 1, 2)], + ( + [date(2020, 1, 1), Timestamp("2020-01-02")], + np.array([-1, 1], dtype=np.intp), + ), + ( + [Timestamp("2020-01-01"), Timestamp(date(2020, 1, 2))], + np.array([0, 1], dtype=np.intp), + ), ], ) - def test_get_indexer_mixed_dtypes(self, target): + def test_get_indexer_mixed_dtypes(self, target, expected): # https://github.com/pandas-dev/pandas/issues/33741 values = DatetimeIndex([Timestamp("2020-01-01"), Timestamp("2020-01-02")]) result = values.get_indexer(target) - expected = np.array([0, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 35a9742d653db..ec31abdaf1711 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -758,6 +758,8 @@ def test_datetime_understood(self, unit): tm.assert_series_equal(result, expected) def test_align_date_objects_with_datetimeindex(self): + # GH#62158: v3.0.0 - DatetimeIndex no 
longer matches Python date labels. + # The result is always all-NaN and the union index. rng = date_range("1/1/2000", periods=20) ts = Series(np.random.default_rng(2).standard_normal(20), index=rng) @@ -767,10 +769,20 @@ def test_align_date_objects_with_datetimeindex(self): result = ts + ts2 result2 = ts2 + ts - expected = ts + ts[5:] - expected.index = expected.index._with_freq(None) - tm.assert_series_equal(result, expected) - tm.assert_series_equal(result2, expected) + + date_labels = [x.date() for x in rng[5:]] + expected_index_result = Index(list(rng) + date_labels, dtype=object) + expected_index_result2 = Index(date_labels + list(rng), dtype=object) + + # Length and index checks + assert len(result) == 35 + tm.assert_index_equal(result.index, expected_index_result) + tm.assert_index_equal(result2.index, expected_index_result2) + assert result.index.dtype == object + + # All NaN because there are no matching labels now + assert result.isna().all() + assert result2.isna().all() class TestNamePreservation: