diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d11ab82294be1..0396d1704b579 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -374,6 +374,71 @@ In cases with mixed-resolution inputs, the highest resolution is used: In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype Out[2]: dtype(' Index: """ Extract combined index: return intersection or union (depending on the @@ -81,7 +81,8 @@ def get_objs_combined_axis( axis : {0 or 'index', 1 or 'outer'}, default 0 The axis to extract indexes from. sort : bool, default True - Whether the result index should come out sorted or not. + Whether the result index should come out sorted or not. NoDefault + used for deprecation of GH#57335. Returns ------- @@ -108,7 +109,7 @@ def _get_distinct_objs(objs: list[Index]) -> list[Index]: def _get_combined_index( indexes: list[Index], intersect: bool = False, - sort: bool = False, + sort: bool | lib.NoDefault = False, ) -> Index: """ Return the union or intersection of indexes. @@ -121,7 +122,8 @@ def _get_combined_index( If True, calculate the intersection between indexes. Otherwise, calculate the union. sort : bool, default False - Whether the result index should come out sorted or not. + Whether the result index should come out sorted or not. 
NoDefault + used for deprecation of GH#57335. Returns ------- @@ -138,10 +140,10 @@ def _get_combined_index( for other in indexes[1:]: index = index.intersection(other) else: - index = union_indexes(indexes, sort=False) + index = union_indexes(indexes, sort=sort if sort is lib.no_default else False) index = ensure_index(index) - if sort: + if sort and sort is not lib.no_default: index = safe_sort_index(index) return index @@ -180,7 +182,7 @@ def safe_sort_index(index: Index) -> Index: return index -def union_indexes(indexes, sort: bool | None = True) -> Index: +def union_indexes(indexes, sort: bool | lib.NoDefault = True) -> Index: """ Return the union of indexes. @@ -190,7 +192,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: ---------- indexes : list of Index or list objects sort : bool, default True - Whether the result index should come out sorted or not. + Whether the result index should come out sorted or not. NoDefault + used for deprecation of GH#57335. Returns ------- @@ -201,7 +204,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: if len(indexes) == 1: result = indexes[0] if isinstance(result, list): - if not sort: + if not sort or sort is lib.no_default: result = Index(result) else: result = Index(sorted(result)) @@ -227,7 +230,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index: raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") if num_dtis == len(indexes): - sort = True + if sort is lib.no_default: + sort = True result = indexes[0] elif num_dtis > 1: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index db7f33d5c017f..7d5d6bac9db41 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -45,7 +45,9 @@ ensure_index, get_objs_combined_axis, get_unanimous_names, + union_indexes, ) +from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.internals import concatenate_managers if TYPE_CHECKING: @@ -162,7 +164,7 @@ def 
concat( levels=None, names: list[HashableT] | None = None, verify_integrity: bool = False, - sort: bool = False, + sort: bool | lib.NoDefault = lib.no_default, copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | Series: """ @@ -405,13 +407,40 @@ def concat( "Only can inner (intersect) or outer (union) join the other axis" ) - if not is_bool(sort): + objs, keys, ndims = _clean_keys_and_objs(objs, keys) + + if sort is lib.no_default: + if axis == 0: + non_concat_axis = [ + obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name]) + for obj in objs + ] + else: + non_concat_axis = [obj.index for obj in objs] + + if ( + intersect + or any(not isinstance(index, DatetimeIndex) for index in non_concat_axis) + or all( + prev is curr for prev, curr in zip(non_concat_axis, non_concat_axis[1:]) + ) + or ( + all( + prev[-1] <= curr[0] and prev.is_monotonic_increasing + for prev, curr in zip(non_concat_axis, non_concat_axis[1:]) + if not prev.empty and not curr.empty + ) + and non_concat_axis[-1].is_monotonic_increasing + ) + ): + # Sorting or not will not impact the result. + sort = False + elif not is_bool(sort): raise ValueError( f"The 'sort' keyword only accepts boolean values; {sort} was passed." 
) - sort = bool(sort) - - objs, keys, ndims = _clean_keys_and_objs(objs, keys) + else: + sort = bool(sort) # select an object to be our result reference sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect) @@ -436,9 +465,10 @@ def concat( if len(ndims) > 1: objs = _sanitize_mixed_ndim(objs, sample, ignore_index, bm_axis) + orig_axis = axis axis = 1 - bm_axis if is_frame else 0 names = names or getattr(keys, "names", None) - return _get_result( + result = _get_result( objs, is_series, bm_axis, @@ -452,6 +482,28 @@ def concat( axis, ) + if sort is lib.no_default: + if orig_axis == 0: + non_concat_axis = [ + obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name]) + for obj in objs + ] + else: + non_concat_axis = [obj.index for obj in objs] + no_sort_result_index = union_indexes(non_concat_axis, sort=False) + orig = result.index if orig_axis == 1 else result.columns + if not no_sort_result_index.equals(orig): + msg = ( + "Sorting by default when concatenating all DatetimeIndex is " + "deprecated. In the future, pandas will respect the default " + "of `sort=False`. Specify `sort=True` or `sort=False` to " + "silence this message. If you see this warnings when not " + "directly calling concat, report a bug to pandas." 
+ ) + warnings.warn(msg, Pandas4Warning, stacklevel=find_stack_level()) + + return result + def _sanitize_mixed_ndim( objs: list[Series | DataFrame], @@ -510,7 +562,7 @@ def _get_result( bm_axis: AxisInt, ignore_index: bool, intersect: bool, - sort: bool, + sort: bool | lib.NoDefault, keys: Iterable[Hashable] | None, levels, verify_integrity: bool, @@ -620,7 +672,7 @@ def new_axes( objs: list[Series | DataFrame], bm_axis: AxisInt, intersect: bool, - sort: bool, + sort: bool | lib.NoDefault, keys: Iterable[Hashable] | None, names: list[HashableT] | None, axis: AxisInt, diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index a9afb5dbd11d7..06793769b39bb 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -915,6 +915,36 @@ def test_listlike_lambda(ops, by_row, expected): tm.assert_equal(result, expected) +def test_listlike_datetime_index_unsorted(): + # https://github.com/pandas-dev/pandas/pull/62843 + values = [datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)] + df = DataFrame({"a": [1, 2]}, index=[values[1], values[0]]) + result = df.apply([lambda x: x, lambda x: x.shift(freq="D")], by_row=False) + expected = DataFrame( + [[1.0, 2.0], [2.0, np.nan], [np.nan, 1.0]], + index=[values[1], values[0], values[2]], + columns=MultiIndex([["a"], [""]], codes=[[0, 0], [0, 0]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_dictlike_datetime_index_unsorted(): + # https://github.com/pandas-dev/pandas/pull/62843 + values = [datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)] + df = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[values[1], values[0]]) + result = df.apply( + {"a": lambda x: x, "b": lambda x: x.shift(freq="D")}, by_row=False + ) + expected = DataFrame( + { + "a": [1.0, 2.0, np.nan], + "b": [4.0, np.nan, 3.0], + }, + index=[values[1], values[0], values[2]], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( 
"ops", [ diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 37a5b0dec9f82..9ad98aa6b3090 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -794,3 +794,17 @@ def test_shift_invalid_fill_value_deprecation(self): df["a"].shift(1, fill_value=NaT) with tm.assert_produces_warning(Pandas4Warning, match=msg): df["b"].shift(1, fill_value=NaT) + + def test_shift_dt_index_multiple_periods_unsorted(self): + # https://github.com/pandas-dev/pandas/pull/62843 + values = date_range("1/1/2000", periods=4, freq="D") + df = DataFrame({"a": [1, 2]}, index=[values[1], values[0]]) + result = df.shift(periods=[1, 2], freq="D") + expected = DataFrame( + { + "a_1": [1.0, 2.0, np.nan], + "a_2": [2.0, np.nan, 1.0], + }, + index=[values[2], values[1], values[3]], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/methods/test_groupby_shift_diff.py b/pandas/tests/groupby/methods/test_groupby_shift_diff.py index 1256046d81949..60e5fd9fa1863 100644 --- a/pandas/tests/groupby/methods/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/methods/test_groupby_shift_diff.py @@ -248,3 +248,21 @@ def test_group_shift_with_multiple_periods_and_both_fill_and_freq_deprecated(): msg = "Passing a 'freq' together with a 'fill_value'" with pytest.raises(ValueError, match=msg): df.groupby("b")[["a"]].shift([1, 2], fill_value=1, freq="h") + + +def test_groupby_shift_multiple_periods_unsorted_index(): + # https://github.com/pandas-dev/pandas/pull/62843 + idx = date_range("1/1/2000", periods=4, freq="h") + df = DataFrame( + {"a": [1, 2, 3], "b": [True, True, False]}, + index=[idx[2], idx[0], idx[1]], + ) + result = df.groupby("b")[["a"]].shift([0, 1], freq="h") + expected = DataFrame( + { + "a_0": [1.0, 2.0, 3.0, np.nan], + "a_1": [3.0, np.nan, 2.0, 1.0], + }, + index=[idx[2], idx[0], idx[1], idx[3]], + ) + tm.assert_frame_equal(result, expected) diff --git 
a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 2d0eb5d14a1d9..7d0e534cb7689 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -10,7 +10,10 @@ import numpy as np import pytest -from pandas.errors import InvalidIndexError +from pandas.errors import ( + InvalidIndexError, + Pandas4Warning, +) import pandas as pd from pandas import ( @@ -434,7 +437,9 @@ def test_concat_bug_1719(self): # to join with union # these two are of different length! left = concat([ts1, ts2], join="outer", axis=1) - right = concat([ts2, ts1], join="outer", axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex is deprecated" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + right = concat([ts2, ts1], join="outer", axis=1) assert len(left) == len(right) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 0cf3192ea3a74..b1cba7ee31eac 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.errors import Pandas4Warning + import pandas as pd from pandas import ( DataFrame, @@ -69,7 +71,9 @@ def test_concat_datetime_timezone(self): idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo") df3 = DataFrame({"b": [1, 2, 3]}, index=idx3) - result = concat([df1, df3], axis=1) + msg = "Sorting by default when concatenating all DatetimeIndex" + with tm.assert_produces_warning(Pandas4Warning, match=msg): + result = concat([df1, df3], axis=1) exp_idx = DatetimeIndex( [ diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 31df52645f3f9..c866b81a5349e 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -1,3 +1,4 @@ +from datetime import datetime import re import numpy as np @@ -714,6 +715,20 
@@ def test_join_many_sort_nonunique(self, how, sort): result = df.join([df2], how=how, sort=sort) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("how", ["inner", "outer", "left", "right"]) + def test_join_many_datetime_unsorted(self, how): + # https://github.com/pandas-dev/pandas/pull/62843 + index = Index([datetime(2024, 1, 2), datetime(2024, 1, 1)]) + df = DataFrame({"a": [1, 2]}, index=index) + df2 = DataFrame({"b": [1, 2]}, index=index) + result = df.join([df2], how=how) + if how == "outer": + # Outer always sorts the index. + expected = DataFrame({"a": [2, 1], "b": [2, 1]}, index=[index[1], index[0]]) + else: + expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=index) + tm.assert_frame_equal(result, expected) + def test_join_many_mixed(self): df = DataFrame( np.random.default_rng(2).standard_normal((8, 4)), diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py index 68ca807bde145..72ea6544b1631 100644 --- a/pandas/tests/strings/test_cat.py +++ b/pandas/tests/strings/test_cat.py @@ -1,3 +1,4 @@ +from datetime import datetime import re import numpy as np @@ -343,6 +344,19 @@ def test_str_cat_align_mixed_inputs(join_type): s.str.cat([t, z], join=join_type) +def test_str_cat_datetime_index_unsorted(join_type): + # https://github.com/pandas-dev/pandas/pull/62843 + values = [datetime(2024, 1, 1), datetime(2024, 1, 2)] + s = Series(["a", "b"], index=[values[1], values[0]]) + others = Series(["c", "d"], index=[values[0], values[1]]) + result = s.str.cat(others, join=join_type) + if join_type == "outer" or join_type == "right": + expected = Series(["bc", "ad"], index=[values[0], values[1]]) + else: + expected = Series(["ad", "bc"], index=[values[1], values[0]]) + tm.assert_series_equal(result, expected) + + def test_str_cat_all_na(index_or_series, index_or_series2): # GH 24044 box = index_or_series