Merge main

rhshadrach · rhshadrach · commit 9c128d9806cf · 2025-11-01T12:48:54.000-04:00
diff --git a/README.md b/README.md
@@ -179,7 +179,7 @@ If you are simply looking to start working with the pandas codebase, navigate to
 
 You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas).
 
-Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it!
+Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’... you can do something about it!
 
 Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Slack](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack).
 
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -794,6 +794,7 @@ Other Deprecations
 - Deprecated allowing ``fill_value`` that cannot be held in the original dtype (excepting NA values for integer and bool dtypes) in :meth:`Series.shift` and :meth:`DataFrame.shift` (:issue:`53802`)
 - Deprecated backward-compatibility behavior for :meth:`DataFrame.select_dtypes` matching "str" dtype when ``np.object_`` is specified (:issue:`61916`)
 - Deprecated option "future.no_silent_downcasting", as it is no longer used. In a future version accessing this option will raise (:issue:`59502`)
+- Deprecated silent casting of non-datetime 'other' to datetime in :meth:`Series.combine_first` (:issue:`62931`)
 - Deprecated slicing on a :class:`Series` or :class:`DataFrame` with a :class:`DatetimeIndex` using a ``datetime.date`` object, explicitly cast to :class:`Timestamp` instead (:issue:`35830`)
 - Deprecated the 'inplace' keyword from :meth:`Resampler.interpolate`, as passing ``True`` raises ``AttributeError`` (:issue:`58690`)
 
@@ -1031,13 +1032,15 @@ Datetimelike
 - Bug in :class:`Timestamp` constructor failing to raise when given a ``np.datetime64`` object with non-standard unit (:issue:`25611`)
 - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
 - Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
+- Bug in :func:`infer_freq` with a :class:`Series` with :class:`ArrowDtype` timestamp dtype incorrectly raising ``TypeError`` (:issue:`58403`)
 - Bug in :func:`to_datetime` where passing an ``lxml.etree._ElementUnicodeResult`` together with ``format`` raised  ``TypeError``. Now subclasses of ``str`` are handled. (:issue:`60933`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
 - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
 - Bug in :meth:`DataFrame.fillna` raising an ``AssertionError`` instead of ``OutOfBoundsDatetime`` when filling a ``datetime64[ns]`` column with an out-of-bounds timestamp. Now correctly raises ``OutOfBoundsDatetime``. (:issue:`61208`)
 - Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` casting ``datetime64`` and ``timedelta64`` columns to ``float64`` and losing precision (:issue:`60850`)
 - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`)
 - Bug in :meth:`DateOffset.rollback` (and subclass methods) with ``normalize=True`` rolling back one offset too long (:issue:`32616`)
+- Bug in :meth:`DatetimeIndex.asof` with a string key giving incorrect results (:issue:`50946`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`)
 - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`)
 - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
@@ -1241,10 +1244,13 @@ Reshaping
 - Bug in :func:`concat` with mixed integer and bool dtypes incorrectly casting the bools to integers (:issue:`45101`)
 - Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
 - Bug in :meth:`DataFrame.combine_first` not preserving the column order (:issue:`60427`)
+- Bug in :meth:`DataFrame.combine_first` with non-unique columns incorrectly raising (:issue:`29135`)
+- Bug in :meth:`DataFrame.combine` with non-unique columns incorrectly raising (:issue:`51340`)
 - Bug in :meth:`DataFrame.explode` producing incorrect result for :class:`pyarrow.large_list` type (:issue:`61091`)
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
 - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
 - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
+- Bug in :meth:`Series.combine_first` incorrectly replacing ``None`` entries with ``NaN`` (:issue:`58977`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
 - Bug in :meth:`DataFrame.unstack` raising an error with indexes containing ``NaN`` with ``sort=False`` (:issue:`61221`)
 - Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -9038,16 +9038,6 @@ def combine(
         0  0 -5.0
         1  0  4.0
 
-        However, if the same element in both dataframes is None, that None
-        is preserved
-
-        >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]})
-        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [None, 3]})
-        >>> df1.combine(df2, take_smaller, fill_value=-5)
-            A    B
-        0  0 -5.0
-        1  0  3.0
-
         Example that demonstrates the use of `overwrite` and behavior when
         the axis differ between the dataframes.
 
@@ -9106,11 +9096,14 @@ def combine(
 
         # preserve column order
         new_columns = self.columns.union(other_columns, sort=False)
+        this = this.reindex(new_columns, axis=1)
+        other = other.reindex(new_columns, axis=1)
+
         do_fill = fill_value is not None
         result = {}
-        for col in new_columns:
-            series = this[col]
-            other_series = other[col]
+        for i in range(this.shape[1]):
+            series = this.iloc[:, i]
+            other_series = other.iloc[:, i]
 
             this_dtype = series.dtype
             other_dtype = other_series.dtype
@@ -9121,7 +9114,7 @@ def combine(
             # don't overwrite columns unnecessarily
             # DO propagate if this column is not in the intersection
             if not overwrite and other_mask.all():
-                result[col] = this[col].copy()
+                result[i] = series.copy()
                 continue
 
             if do_fill:
@@ -9130,7 +9123,7 @@ def combine(
                 series[this_mask] = fill_value
                 other_series[other_mask] = fill_value
 
-            if col not in self.columns:
+            if new_columns[i] not in self.columns:
                 # If self DataFrame does not have col in other DataFrame,
                 # try to promote series, which is all NaN, as other_dtype.
                 new_dtype = other_dtype
@@ -9155,10 +9148,10 @@ def combine(
                     arr, new_dtype
                 )
 
-            result[col] = arr
+            result[i] = arr
 
-        # convert_objects just in case
-        frame_result = self._constructor(result, index=new_index, columns=new_columns)
+        frame_result = self._constructor(result, index=new_index)
+        frame_result.columns = new_columns
         return frame_result.__finalize__(self, method="combine")
 
     def combine_first(self, other: DataFrame) -> DataFrame:
@@ -9222,9 +9215,14 @@ def combiner(x: Series, y: Series):
             combined = self.combine(other, combiner, overwrite=False)
 
         dtypes = {
+            # With non-unique columns, ``dtypes[col]`` may not be a single dtype,
+            #  so check isinstance(..., (np.dtype, ExtensionDtype)) to avoid
+            #  raising, see GH#29135. Such columns are simply left un-cast.
             col: find_common_type([self.dtypes[col], other.dtypes[col]])
             for col in self.columns.intersection(other.columns)
-            if combined.dtypes[col] != self.dtypes[col]
+            if isinstance(combined.dtypes[col], (np.dtype, ExtensionDtype))
+            and isinstance(self.dtypes[col], (np.dtype, ExtensionDtype))
+            and combined.dtypes[col] != self.dtypes[col]
         }
 
         if dtypes:
@@ -13822,8 +13820,8 @@ def quantile(
         0.1  1    1
         0.5  3  100
 
-        Specifying `numeric_only=False` will also compute the quantile of
-        datetime and timedelta data.
+        Specifying `numeric_only=False` will compute the quantiles for all
+        columns.
 
         >>> df = pd.DataFrame(
         ...     {
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -4168,7 +4168,7 @@ def reindex(
         limit : int, optional
             Maximum number of consecutive labels in ``target`` to match for
             inexact matches.
-        tolerance : int or float, optional
+        tolerance : int, float, or list-like, optional
             Maximum distance between original and new labels for inexact
             matches. The values of the index at the matching locations must
             satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
@@ -5675,7 +5675,7 @@ def asof(self, label):
                 return self._na_value
         else:
             if isinstance(loc, slice):
-                loc = loc.indices(len(self))[-1]
+                return self[loc][-1]
 
         return self[loc]
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -87,7 +87,6 @@
 )
 from pandas.core.dtypes.dtypes import (
     ExtensionDtype,
-    SparseDtype,
 )
 from pandas.core.dtypes.generic import (
     ABCDataFrame,
@@ -3112,8 +3111,8 @@ def combine(
 
         Combine the Series and `other` using `func` to perform elementwise
         selection for combined Series.
-        `fill_value` is assumed when value is missing at some index
-        from one of the two objects being combined.
+        `fill_value` is assumed when value is not present at some index
+        from one of the two Series being combined.
 
         Parameters
         ----------
@@ -3254,9 +3253,6 @@ def combine_first(self, other) -> Series:
         if self.dtype == other.dtype:
             if self.index.equals(other.index):
                 return self.mask(self.isna(), other)
-            elif self._can_hold_na and not isinstance(self.dtype, SparseDtype):
-                this, other = self.align(other, join="outer")
-                return this.mask(this.isna(), other)
 
         new_index = self.index.union(other.index)
 
@@ -3271,6 +3267,16 @@ def combine_first(self, other) -> Series:
         if this.dtype.kind == "M" and other.dtype.kind != "M":
             # TODO: try to match resos?
             other = to_datetime(other)
+            warnings.warn(
+                # GH#62931
+                "Silently casting non-datetime 'other' to datetime in "
+                "Series.combine_first is deprecated and will be removed "
+                "in a future version. Explicitly cast before calling "
+                "combine_first instead.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+
         combined = concat([this, other])  # nobug
         combined = combined.reindex(new_index)
         return combined.__finalize__(self, method="combine_first")
diff --git a/pandas/tests/frame/methods/test_combine.py b/pandas/tests/frame/methods/test_combine.py
@@ -45,3 +45,19 @@ def test_combine_generic(self, float_frame):
         )
         tm.assert_frame_equal(chunk, exp)
         tm.assert_frame_equal(chunk2, exp)
+
+    def test_combine_nonunique_columns(self):
+        # GH#51340
+
+        df = pd.DataFrame({"A": range(5), "B": range(5)})
+        df.columns = ["A", "A"]
+
+        other = df.copy()
+        df.iloc[1, :] = None
+
+        def combiner(a, b):
+            return b
+
+        result = df.combine(other, combiner)
+        expected = other.astype("float64")
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
@@ -413,6 +413,18 @@ def test_combine_first_preserve_EA_precision(self, wide_val, dtype):
         expected = DataFrame({"A": [wide_val, 5, wide_val]}, dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
+    def test_combine_first_non_unique_columns(self):
+        # GH#29135
+        df1 = DataFrame([[1, np.nan], [3, 4]], columns=["P", "Q"], index=["A", "B"])
+        df2 = DataFrame(
+            [[5, 6, 7], [8, 9, np.nan]], columns=["P", "Q", "Q"], index=["A", "B"]
+        )
+        result = df1.combine_first(df2)
+        expected = DataFrame(
+            [[1, 6.0, 7.0], [3, 4.0, 4.0]], index=["A", "B"], columns=["P", "Q", "Q"]
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 @pytest.mark.parametrize(
     "scalar1, scalar2",
diff --git a/pandas/tests/indexes/datetimes/methods/test_asof.py b/pandas/tests/indexes/datetimes/methods/test_asof.py
@@ -1,6 +1,7 @@
 from datetime import timedelta
 
 from pandas import (
+    DatetimeIndex,
     Index,
     Timestamp,
     date_range,
@@ -28,3 +29,18 @@ def test_asof(self):
 
         dt = index[0].to_pydatetime()
         assert isinstance(index.asof(dt), Timestamp)
+
+    def test_asof_datetime_string(self):
+        # GH#50946
+
+        dti = date_range("2021-08-05", "2021-08-10", freq="1D")
+
+        key = "2021-08-09"
+        res = dti.asof(key)
+        exp = dti[4]
+        assert res == exp
+
+        # adding a non-midnight timestamp to the index used to cause a bug
+        dti2 = DatetimeIndex(list(dti) + ["2021-08-11 00:00:01"])
+        res = dti2.asof(key)
+        assert res == exp
diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py
@@ -2,6 +2,8 @@
 
 import numpy as np
 
+from pandas.errors import Pandas4Warning
+
 import pandas as pd
 from pandas import (
     Period,
@@ -75,9 +77,14 @@ def test_combine_first_dt64(self, unit):
         xp = to_datetime(Series(["2010", "2011"])).dt.as_unit(unit)
         tm.assert_series_equal(rs, xp)
 
+    def test_combine_first_dt64_casting_deprecation(self, unit):
+        # GH#62931
         s0 = to_datetime(Series(["2010", np.nan])).dt.as_unit(unit)
         s1 = Series([np.nan, "2011"])
-        rs = s0.combine_first(s1)
+
+        msg = "Silently casting non-datetime 'other' to datetime"
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            rs = s0.combine_first(s1)
 
         xp = Series([datetime(2010, 1, 1), "2011"], dtype=f"datetime64[{unit}]")
 
@@ -144,3 +151,12 @@ def test_combine_mixed_timezone(self):
             ),
         )
         tm.assert_series_equal(result, expected)
+
+    def test_combine_first_none_not_nan(self):
+        # GH#58977
+        s1 = Series([None, None, None], index=["a", "b", "c"])
+        s2 = Series([None, None, None], index=["b", "c", "d"])
+
+        result = s1.combine_first(s2)
+        expected = Series([None] * 4, index=["a", "b", "c", "d"])
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py
@@ -13,6 +13,7 @@
 from pandas._libs.tslibs.offsets import _get_offset
 from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG
 from pandas.compat import is_platform_windows
+import pandas.util._test_decorators as td
 
 from pandas import (
     DatetimeIndex,
@@ -542,3 +543,16 @@ def test_infer_freq_non_nano_tzaware(tz_aware_fixture):
 
     res = frequencies.infer_freq(dta)
     assert res == "B"
+
+
+@td.skip_if_no("pyarrow")
+def test_infer_freq_pyarrow():
+    # GH#58403
+    data = ["2022-01-01T10:00:00", "2022-01-01T10:00:30", "2022-01-01T10:01:00"]
+    pd_series = Series(data).astype("timestamp[s][pyarrow]")
+    pd_index = Index(data).astype("timestamp[s][pyarrow]")
+
+    assert frequencies.infer_freq(pd_index.values) == "30s"
+    assert frequencies.infer_freq(pd_series.values) == "30s"
+    assert frequencies.infer_freq(pd_index) == "30s"
+    assert frequencies.infer_freq(pd_series) == "30s"
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
@@ -37,6 +37,7 @@
 
 from pandas.core.dtypes.common import is_numeric_dtype
 from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
     DatetimeTZDtype,
     PeriodDtype,
 )
@@ -132,6 +133,14 @@ def infer_freq(
 
     if isinstance(index, ABCSeries):
         values = index._values
+
+        if isinstance(index.dtype, ArrowDtype):
+            import pyarrow as pa
+
+            if pa.types.is_timestamp(values.dtype.pyarrow_dtype):
+                # GH#58403
+                values = values._to_datetimearray()
+
         if not (
             lib.is_np_dtype(values.dtype, "mM")
             or isinstance(values.dtype, DatetimeTZDtype)