Skip to content

Commit 603120b

Browse files
committed
DEPR/BUG: Do not ignore sort in concat for DatetimeIndex
1 parent ea75dd7 commit 603120b

File tree

6 files changed

+149
-22
lines changed

6 files changed

+149
-22
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,63 @@ In cases with mixed-resolution inputs, the highest resolution is used:
371371
In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype
372372
Out[2]: dtype('<M8[ns]')
373373
374+
.. _whatsnew_300.api_breaking.concat_datetime_sorting:
375+
376+
:func:`concat` no longer ignores ``sort`` when all objects have a :class:`DatetimeIndex`
377+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
378+
379+
When all objects passed to :func:`concat` have a :class:`DatetimeIndex`,
380+
passing ``sort=False`` will now result in the non-concatenation axis not
381+
being sorted. Previously, the result would always be sorted along
382+
the non-concatenation axis even when ``sort=False`` was passed.
383+
384+
If you do not specify the ``sort`` argument, pandas will continue to return a
385+
sorted result but this behavior is deprecated and you will receive a warning.
386+
In order to make this less noisy for users, pandas checks if not sorting would
387+
impact the result and only warns when it would. This check can be expensive,
388+
and users can skip the check by explicitly specifying ``sort=True`` or
389+
``sort=False``.
390+
391+
This deprecation can also impact pandas' internal usage of :func:`concat`.
392+
While we have investigated uses of :func:`concat` to determine if this could lead
393+
to a change in behavior of other functions and methods in the API, it is
394+
possible some have been missed. In order to be cautious here, pandas has *not*
395+
added ``sort=False`` to any internal calls where we believe behavior should not change.
396+
If we have missed something, users will not experience a behavior change but they
397+
will receive a warning about :func:`concat` even though they are not directly
398+
calling this function. If this does occur, we ask users to open an issue so that
399+
we may address any potential behavior changes.
400+
401+
.. ipython:: python
402+
403+
idx1 = pd.date_range("2025-01-02", periods=3, freq="h")
404+
df1 = pd.DataFrame({"a": [1, 2, 3]}, index=idx1)
405+
df1
406+
407+
idx2 = pd.date_range("2025-01-01", periods=3, freq="h")
408+
df2 = pd.DataFrame({"b": [1, 2, 3]}, index=idx2)
409+
df2
410+
411+
*Old behavior*
412+
413+
.. code-block:: ipython
414+
415+
In [3]: pd.concat([df1, df2], axis=1, sort=False)
416+
Out[3]:
417+
a b
418+
2025-01-01 00:00:00 NaN 1.0
419+
2025-01-01 01:00:00 NaN 2.0
420+
2025-01-01 02:00:00 NaN 3.0
421+
2025-01-02 00:00:00 1.0 NaN
422+
2025-01-02 01:00:00 2.0 NaN
423+
2025-01-02 02:00:00 3.0 NaN
424+
425+
*New behavior*
426+
427+
.. ipython:: python
428+
429+
pd.concat([df1, df2], axis=1, sort=False)
430+
374431
.. _whatsnew_300.api_breaking.value_counts_sorting:
375432

376433
Changed behavior in :meth:`DataFrame.value_counts` and :meth:`DataFrameGroupBy.value_counts` when ``sort=False``

pandas/core/indexes/api.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def get_objs_combined_axis(
6464
objs,
6565
intersect: bool = False,
6666
axis: Axis = 0,
67-
sort: bool = True,
67+
sort: bool | lib.NoDefault = True,
6868
) -> Index:
6969
"""
7070
Extract combined index: return intersection or union (depending on the
@@ -81,7 +81,8 @@ def get_objs_combined_axis(
8181
axis : {0 or 'index', 1 or 'outer'}, default 0
8282
The axis to extract indexes from.
8383
sort : bool, default True
84-
Whether the result index should come out sorted or not.
84+
Whether the result index should come out sorted or not. NoDefault
85+
used for deprecation in GH#57335.
8586
8687
Returns
8788
-------
@@ -108,7 +109,7 @@ def _get_distinct_objs(objs: list[Index]) -> list[Index]:
108109
def _get_combined_index(
109110
indexes: list[Index],
110111
intersect: bool = False,
111-
sort: bool = False,
112+
sort: bool | lib.NoDefault = False,
112113
) -> Index:
113114
"""
114115
Return the union or intersection of indexes.
@@ -121,7 +122,8 @@ def _get_combined_index(
121122
If True, calculate the intersection between indexes. Otherwise,
122123
calculate the union.
123124
sort : bool, default False
124-
Whether the result index should come out sorted or not.
125+
Whether the result index should come out sorted or not. NoDefault
126+
used for deprecation of GH#57335
125127
126128
Returns
127129
-------
@@ -138,10 +140,10 @@ def _get_combined_index(
138140
for other in indexes[1:]:
139141
index = index.intersection(other)
140142
else:
141-
index = union_indexes(indexes, sort=False)
143+
index = union_indexes(indexes, sort=sort if sort is lib.no_default else False)
142144
index = ensure_index(index)
143145

144-
if sort:
146+
if sort and sort is not lib.no_default:
145147
index = safe_sort_index(index)
146148
return index
147149

@@ -180,7 +182,7 @@ def safe_sort_index(index: Index) -> Index:
180182
return index
181183

182184

183-
def union_indexes(indexes, sort: bool | None = True) -> Index:
185+
def union_indexes(indexes, sort: bool | None | lib.NoDefault = True) -> Index:
184186
"""
185187
Return the union of indexes.
186188
@@ -190,7 +192,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
190192
----------
191193
indexes : list of Index or list objects
192194
sort : bool, default True
193-
Whether the result index should come out sorted or not.
195+
Whether the result index should come out sorted or not. NoDefault
196+
used for deprecation of GH#57335.
194197
195198
Returns
196199
-------
@@ -201,7 +204,7 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
201204
if len(indexes) == 1:
202205
result = indexes[0]
203206
if isinstance(result, list):
204-
if not sort:
207+
if not sort or sort is lib.no_default:
205208
result = Index(result)
206209
else:
207210
result = Index(sorted(result))
@@ -227,7 +230,8 @@ def union_indexes(indexes, sort: bool | None = True) -> Index:
227230
raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex")
228231

229232
if num_dtis == len(indexes):
230-
sort = True
233+
if sort is lib.no_default:
234+
sort = True
231235
result = indexes[0]
232236

233237
elif num_dtis > 1:

pandas/core/reshape/concat.py

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@
4545
ensure_index,
4646
get_objs_combined_axis,
4747
get_unanimous_names,
48+
union_indexes,
4849
)
50+
from pandas.core.indexes.datetimes import DatetimeIndex
4951
from pandas.core.internals import concatenate_managers
5052

5153
if TYPE_CHECKING:
@@ -162,7 +164,7 @@ def concat(
162164
levels=None,
163165
names: list[HashableT] | None = None,
164166
verify_integrity: bool = False,
165-
sort: bool = False,
167+
sort: bool | lib.NoDefault = lib.no_default,
166168
copy: bool | lib.NoDefault = lib.no_default,
167169
) -> DataFrame | Series:
168170
"""
@@ -405,13 +407,40 @@ def concat(
405407
"Only can inner (intersect) or outer (union) join the other axis"
406408
)
407409

408-
if not is_bool(sort):
410+
objs, keys, ndims = _clean_keys_and_objs(objs, keys)
411+
412+
if sort is lib.no_default:
413+
if axis == 0:
414+
non_concat_axis = [
415+
obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name])
416+
for obj in objs
417+
]
418+
else:
419+
non_concat_axis = [obj.index for obj in objs]
420+
421+
if (
422+
any(not isinstance(index, DatetimeIndex) for index in non_concat_axis)
423+
or all(
424+
id(prev) == id(curr)
425+
for prev, curr in zip(non_concat_axis, non_concat_axis[1:])
426+
)
427+
or (
428+
all(
429+
prev[-1] <= curr[0] and prev.is_monotonic_increasing
430+
for prev, curr in zip(non_concat_axis, non_concat_axis[1:])
431+
if not prev.empty and not curr.empty
432+
)
433+
and non_concat_axis[-1].is_monotonic_increasing
434+
)
435+
):
436+
# Sorting or not will not impact the result.
437+
sort = False
438+
elif not is_bool(sort):
409439
raise ValueError(
410440
f"The 'sort' keyword only accepts boolean values; {sort} was passed."
411441
)
412-
sort = bool(sort)
413-
414-
objs, keys, ndims = _clean_keys_and_objs(objs, keys)
442+
else:
443+
sort = bool(sort)
415444

416445
# select an object to be our result reference
417446
sample, objs = _get_sample_object(objs, ndims, keys, names, levels, intersect)
@@ -436,9 +465,10 @@ def concat(
436465
if len(ndims) > 1:
437466
objs = _sanitize_mixed_ndim(objs, sample, ignore_index, bm_axis)
438467

468+
orig_axis = axis
439469
axis = 1 - bm_axis if is_frame else 0
440470
names = names or getattr(keys, "names", None)
441-
return _get_result(
471+
result = _get_result(
442472
objs,
443473
is_series,
444474
bm_axis,
@@ -452,6 +482,28 @@ def concat(
452482
axis,
453483
)
454484

485+
if sort is lib.no_default:
486+
if orig_axis == 0:
487+
non_concat_axis = [
488+
obj.columns if isinstance(obj, ABCDataFrame) else Index([obj.name])
489+
for obj in objs
490+
]
491+
else:
492+
non_concat_axis = [obj.index for obj in objs]
493+
no_sort_result_index = union_indexes(non_concat_axis, sort=False)
494+
orig = result.index if orig_axis == 1 else result.columns
495+
if not no_sort_result_index.equals(orig):
496+
msg = (
497+
"Sorting by default when concatenating all DatetimeIndex is "
498+
"deprecated. In the future, pandas will respect the default "
499+
"of `sort=False`. Specify `sort=True` or `sort=False` to "
500+
"silence this message. If you see this warnings when not "
501+
"directly calling concat, report a bug to pandas."
502+
)
503+
warnings.warn(msg, Pandas4Warning, stacklevel=find_stack_level())
504+
505+
return result
506+
455507

456508
def _sanitize_mixed_ndim(
457509
objs: list[Series | DataFrame],
@@ -510,7 +562,7 @@ def _get_result(
510562
bm_axis: AxisInt,
511563
ignore_index: bool,
512564
intersect: bool,
513-
sort: bool,
565+
sort: bool | lib.NoDefault,
514566
keys: Iterable[Hashable] | None,
515567
levels,
516568
verify_integrity: bool,

pandas/tests/io/pytables/test_select.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from pandas._libs.tslibs import Timestamp
55
from pandas.compat import PY312
6+
from pandas.errors import Pandas4Warning
67

78
import pandas as pd
89
from pandas import (
@@ -901,7 +902,9 @@ def test_select_as_multiple(setup_path):
901902
result = store.select_as_multiple(
902903
["df1", "df2"], where=["A>0", "B>0"], selector="df1"
903904
)
904-
expected = concat([df1, df2], axis=1)
905+
msg = "Sorting by default when concatenating all DatetimeIndex is deprecated"
906+
with tm.assert_produces_warning(Pandas4Warning, match=msg):
907+
expected = concat([df1, df2], axis=1)
905908
expected = expected[(expected.A > 0) & (expected.B > 0)]
906909
tm.assert_frame_equal(result, expected, check_freq=False)
907910
# FIXME: 2021-01-20 this is failing with freq None vs 4B on some builds
@@ -910,7 +913,9 @@ def test_select_as_multiple(setup_path):
910913
result = store.select_as_multiple(
911914
["df1", "df2"], where="index>df2.index[4]", selector="df2"
912915
)
913-
expected = concat([df1, df2], axis=1)
916+
msg = "Sorting by default when concatenating all DatetimeIndex is deprecated"
917+
with tm.assert_produces_warning(Pandas4Warning, match=msg):
918+
expected = concat([df1, df2], axis=1)
914919
expected = expected[5:]
915920
tm.assert_frame_equal(result, expected)
916921

pandas/tests/reshape/concat/test_concat.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010
import numpy as np
1111
import pytest
1212

13-
from pandas.errors import InvalidIndexError
13+
from pandas.errors import (
14+
InvalidIndexError,
15+
Pandas4Warning,
16+
)
1417

1518
import pandas as pd
1619
from pandas import (
@@ -434,7 +437,9 @@ def test_concat_bug_1719(self):
434437
# to join with union
435438
# these two are of different length!
436439
left = concat([ts1, ts2], join="outer", axis=1)
437-
right = concat([ts2, ts1], join="outer", axis=1)
440+
msg = "Sorting by default when concatenating all DatetimeIndex is deprecated"
441+
with tm.assert_produces_warning(Pandas4Warning, match=msg):
442+
right = concat([ts2, ts1], join="outer", axis=1)
438443

439444
assert len(left) == len(right)
440445

pandas/tests/reshape/concat/test_datetimes.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import numpy as np
66
import pytest
77

8+
from pandas.errors import Pandas4Warning
9+
810
import pandas as pd
911
from pandas import (
1012
DataFrame,
@@ -69,7 +71,9 @@ def test_concat_datetime_timezone(self):
6971

7072
idx3 = date_range("2011-01-01", periods=3, freq="h", tz="Asia/Tokyo")
7173
df3 = DataFrame({"b": [1, 2, 3]}, index=idx3)
72-
result = concat([df1, df3], axis=1)
74+
msg = "Sorting by default when concatenating all DatetimeIndex"
75+
with tm.assert_produces_warning(Pandas4Warning, match=msg):
76+
result = concat([df1, df3], axis=1)
7377

7478
exp_idx = DatetimeIndex(
7579
[

0 commit comments

Comments
 (0)