diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 2f7330d1e81fe..50397b97422f6 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -81,8 +81,6 @@
     PerformanceWarning,
 )
 from pandas.util._decorators import (
-    Appender,
-    Substitution,
     cache_readonly,
 )
 from pandas.util._exceptions import find_stack_level
@@ -1779,10 +1777,6 @@ class DatelikeOps(DatetimeLikeArrayMixin):
     Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex.
     """
 
-    @Substitution(
-        URL="https://docs.python.org/3/library/datetime.html"
-        "#strftime-and-strptime-behavior"
-    )
     def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
         """
         Convert to Index using specified date_format.
@@ -1790,7 +1784,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
         Return an Index of formatted strings specified by date_format, which
         supports the same string format as the python standard library. Details
         of the string format can be found in `python string format
-        doc <%(URL)s>`__.
+        doc <https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior>`__.
 
         Formats supported by the C `strftime` API but not by the python string format
         doc (such as `"%%R"`, `"%%r"`) are not officially supported and should be
@@ -2263,31 +2257,320 @@ def _round(self, freq, mode, ambiguous, nonexistent):
         result = result.view(self._ndarray.dtype)
         return self._simple_new(result, dtype=self.dtype)
 
-    @Appender((_round_doc + _round_example).format(op="round"))
     def round(
         self,
         freq,
         ambiguous: TimeAmbiguous = "raise",
         nonexistent: TimeNonexistent = "raise",
     ) -> Self:
+        """
+        Perform round operation on the data to the specified `freq`.
+
+        Parameters
+        ----------
+        freq : str or Offset
+            The frequency level to round the index to. Must be a fixed
+            frequency like 's' (second) not 'ME' (month end). See
+            :ref:`frequency aliases <timeseries.offset_aliases>` for
+            a list of possible `freq` values.
+        ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
+            Only relevant for DatetimeIndex:
+
+            - 'infer' will attempt to infer fall dst-transition hours based on
+              order
+            - bool-ndarray where True signifies a DST time, False designates
+              a non-DST time (note that this flag is only applicable for
+              ambiguous times)
+            - 'NaT' will return NaT where there are ambiguous times
+            - 'raise' will raise a ValueError if there are ambiguous
+              times.
+
+        nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+            default 'raise'
+            A nonexistent time does not exist in a particular timezone
+            where clocks moved forward due to DST.
+
+            - 'shift_forward' will shift the nonexistent time forward to the
+              closest existing time
+            - 'shift_backward' will shift the nonexistent time backward to the
+              closest existing time
+            - 'NaT' will return NaT where there are nonexistent times
+            - timedelta objects will shift nonexistent times by the timedelta
+            - 'raise' will raise a ValueError if there are
+              nonexistent times.
+
+        Returns
+        -------
+        DatetimeIndex, TimedeltaIndex, or Series
+            Index of the same type for a DatetimeIndex or TimedeltaIndex,
+            or a Series with the same index for a Series.
+
+        Raises
+        ------
+        ValueError if the `freq` cannot be converted.
+
+        See Also
+        --------
+        DatetimeIndex.floor : Perform floor operation on the data
+            to the specified `freq`.
+        DatetimeIndex.ceil : Perform ceil operation on the data
+            to the specified `freq`.
+        DatetimeIndex.snap : Snap time stamps to nearest occurring frequency.
+
+        Notes
+        -----
+        If the timestamps have a timezone, rounding will take place relative to the
+        local ("wall") time and re-localized to the same timezone. When rounding
+        near daylight savings time, use ``nonexistent`` and ``ambiguous`` to
+        control the re-localization behavior.
+
+        Examples
+        --------
+        **DatetimeIndex**
+
+        >>> rng = pd.date_range("1/1/2018 11:59:00", periods=3, freq="min")
+
+        >>> rng
+        DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00',
+                       '2018-01-01 12:01:00'],
+                      dtype='datetime64[ns]', freq='min')
+
+        >>> rng.round("h")
+        DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00',
+                       '2018-01-01 12:00:00'],
+                      dtype='datetime64[ns]', freq=None)
+
+        **Series**
+
+        >>> pd.Series(rng).dt.round("h")
+        0   2018-01-01 12:00:00
+        1   2018-01-01 12:00:00
+        2   2018-01-01 12:00:00
+        dtype: datetime64[ns]
+
+        When rounding near a daylight savings time transition, use ``ambiguous`` or
+        ``nonexistent`` to control how the timestamp should be re-localized.
+
+        >>> rng_tz = pd.DatetimeIndex(["2021-10-31 01:45:00"], tz="Europe/Amsterdam")
+
+        >>> rng_tz.round("h", ambiguous=False)
+        DatetimeIndex(['2021-10-31 02:00:00+01:00'],
+                      dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+
+        >>> rng_tz.round("h", ambiguous=True)
+        DatetimeIndex(['2021-10-31 02:00:00+02:00'],
+                      dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+        """
         return self._round(freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent)
 
-    @Appender((_round_doc + _floor_example).format(op="floor"))
     def floor(
         self,
         freq,
         ambiguous: TimeAmbiguous = "raise",
         nonexistent: TimeNonexistent = "raise",
     ) -> Self:
+        """
+        Perform floor operation on the data to the specified `freq`.
+
+        Parameters
+        ----------
+        freq : str or Offset
+            The frequency level to floor the index to. Must be a fixed
+            frequency like 's' (second) not 'ME' (month end). See
+            :ref:`frequency aliases <timeseries.offset_aliases>` for
+            a list of possible `freq` values.
+        ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
+            Only relevant for DatetimeIndex:
+
+            - 'infer' will attempt to infer fall dst-transition hours based on
+              order
+            - bool-ndarray where True signifies a DST time, False designates
+              a non-DST time (note that this flag is only applicable for
+              ambiguous times)
+            - 'NaT' will return NaT where there are ambiguous times
+            - 'raise' will raise a ValueError if there are ambiguous
+              times.
+
+        nonexistent : 'shift_forward', 'shift_backward', 'NaT', timedelta,
+            default 'raise'
+            A nonexistent time does not exist in a particular timezone
+            where clocks moved forward due to DST.
+
+            - 'shift_forward' will shift the nonexistent time forward to the
+              closest existing time
+            - 'shift_backward' will shift the nonexistent time backward to the
+              closest existing time
+            - 'NaT' will return NaT where there are nonexistent times
+            - timedelta objects will shift nonexistent times by the timedelta
+            - 'raise' will raise a ValueError if there are
+              nonexistent times.
+
+        Returns
+        -------
+        DatetimeIndex, TimedeltaIndex, or Series
+            Index of the same type for a DatetimeIndex or TimedeltaIndex,
+            or a Series with the same index for a Series.
+
+        Raises
+        ------
+        ValueError if the `freq` cannot be converted.
+
+        See Also
+        --------
+        DatetimeIndex.ceil : Perform ceil operation on the data
+            to the specified `freq`.
+        DatetimeIndex.round : Perform round operation on the data
+            to the specified `freq`.
+        DatetimeIndex.snap : Snap time stamps to nearest occurring frequency.
+
+        Notes
+        -----
+        If the timestamps have a timezone, flooring will take place relative to the
+        local ("wall") time and re-localized to the same timezone. When flooring
+        near daylight savings time, use ``nonexistent`` and ``ambiguous`` to
+        control the re-localization behavior.
+
+        Examples
+        --------
+        **DatetimeIndex**
+
+        >>> rng = pd.date_range("1/1/2018 11:59:00", periods=3, freq="min")
+
+        >>> rng
+        DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00',
+                       '2018-01-01 12:01:00'],
+                      dtype='datetime64[ns]', freq='min')
+
+        >>> rng.floor("h")
+        DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00',
+                       '2018-01-01 12:00:00'],
+                      dtype='datetime64[ns]', freq=None)
+
+        **Series**
+
+        >>> pd.Series(rng).dt.floor("h")
+        0   2018-01-01 11:00:00
+        1   2018-01-01 12:00:00
+        2   2018-01-01 12:00:00
+        dtype: datetime64[ns]
+
+        When rounding near a daylight savings time transition, use ``ambiguous`` or
+        ``nonexistent`` to control how the timestamp should be re-localized.
+
+        >>> rng_tz = pd.DatetimeIndex(["2021-10-31 03:30:00"], tz="Europe/Amsterdam")
+
+        >>> rng_tz.floor("2h", ambiguous=False)
+        DatetimeIndex(['2021-10-31 02:00:00+01:00'],
+                      dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+
+        >>> rng_tz.floor("2h", ambiguous=True)
+        DatetimeIndex(['2021-10-31 02:00:00+02:00'],
+                      dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+        """
         return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent)
 
-    @Appender((_round_doc + _ceil_example).format(op="ceil"))
     def ceil(
         self,
         freq,
         ambiguous: TimeAmbiguous = "raise",
         nonexistent: TimeNonexistent = "raise",
     ) -> Self:
+        """
+        Perform ceil operation on the data to the specified `freq`.
+
+        Parameters
+        ----------
+        freq : str or Offset
+            The frequency level to ceil the index to. Must be a fixed
+            frequency like 's' (second) not 'ME' (month end). See
+            :ref:`frequency aliases <timeseries.offset_aliases>` for
+            a list of possible `freq` values.
+        ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
+            Only relevant for DatetimeIndex:
+
+            - 'infer' will attempt to infer fall dst-transition hours based on
+              order
+            - bool-ndarray where True signifies a DST time, False designates
+              a non-DST time (note that this flag is only applicable for
+              ambiguous times)
+            - 'NaT' will return NaT where there are ambiguous times
+            - 'raise' will raise a ValueError if there are ambiguous
+              times.
+
+        nonexistent : 'shift_forward', 'shift_backward', 'NaT',
+            timedelta, default 'raise'
+            A nonexistent time does not exist in a particular timezone
+            where clocks moved forward due to DST.
+
+            - 'shift_forward' will shift the nonexistent time forward to the
+              closest existing time
+            - 'shift_backward' will shift the nonexistent time backward to the
+              closest existing time
+            - 'NaT' will return NaT where there are nonexistent times
+            - timedelta objects will shift nonexistent times by the timedelta
+            - 'raise' will raise a ValueError if there are
+              nonexistent times.
+
+        Returns
+        -------
+        DatetimeIndex, TimedeltaIndex, or Series
+            Index of the same type for a DatetimeIndex or TimedeltaIndex,
+            or a Series with the same index for a Series.
+
+        Raises
+        ------
+        ValueError if the `freq` cannot be converted.
+
+        See Also
+        --------
+        DatetimeIndex.floor : Perform floor operation on the data
+            to the specified `freq`.
+        DatetimeIndex.round : Perform round operation on the data
+            to the specified `freq`.
+        DatetimeIndex.snap : Snap time stamps to nearest occurring frequency.
+
+        Notes
+        -----
+        If the timestamps have a timezone, ceiling will take place relative to the
+        local ("wall") time and re-localized to the same timezone. When ceiling
+        near daylight savings time, use ``nonexistent`` and ``ambiguous`` to
+        control the re-localization behavior.
+
+        Examples
+        --------
+        **DatetimeIndex**
+
+        >>> rng = pd.date_range("1/1/2018 11:59:00", periods=3, freq="min")
+        >>> rng
+        DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00',
+                       '2018-01-01 12:01:00'],
+                      dtype='datetime64[ns]', freq='min')
+        >>> rng.ceil("h")
+        DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00',
+                       '2018-01-01 13:00:00'],
+                      dtype='datetime64[ns]', freq=None)
+
+        **Series**
+
+        >>> pd.Series(rng).dt.ceil("h")
+        0   2018-01-01 12:00:00
+        1   2018-01-01 12:00:00
+        2   2018-01-01 13:00:00
+        dtype: datetime64[ns]
+
+        When rounding near a daylight savings time transition, use ``ambiguous`` or
+        ``nonexistent`` to control how the timestamp should be re-localized.
+
+        >>> rng_tz = pd.DatetimeIndex(["2021-10-31 01:30:00"], tz="Europe/Amsterdam")
+
+        >>> rng_tz.ceil("h", ambiguous=False)
+        DatetimeIndex(['2021-10-31 02:00:00+01:00'],
+                      dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+
+        >>> rng_tz.ceil("h", ambiguous=True)
+        DatetimeIndex(['2021-10-31 02:00:00+02:00'],
+                      dtype='datetime64[s, Europe/Amsterdam]', freq=None)
+        """
         return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent)
 
     # --------------------------------------------------------------
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index 033d654889e91..7c8d5f1e6e583 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -2565,42 +2565,59 @@ def _get_grouper(
     def _get_time_bins(self, ax: DatetimeIndex):
         if not isinstance(ax, DatetimeIndex):
             raise TypeError(
                 "axis must be a DatetimeIndex, but got "
                 f"an instance of {type(ax).__name__}"
             )
 
         if len(ax) == 0:
-            binner = labels = DatetimeIndex(
-                data=[], freq=self.freq, name=ax.name, dtype=ax.dtype
+            empty = DatetimeIndex(data=[], freq=self.freq, name=ax.name, dtype=ax.dtype)
+            return empty, [], empty
+
+        def _calculate_bins_in_timezone(ax_to_use, tz):
+            """Build the bin edges covering ``ax_to_use`` in timezone ``tz``."""
+            first, last = _get_timestamp_range_edges(
+                ax_to_use.min(),
+                ax_to_use.max(),
+                self.freq,
+                unit=ax.unit,
+                closed=self.closed,
+                origin=self.origin,
+                offset=self.offset,
+            )
+            # GH #12037
+            # use first/last directly instead of call replace() on them
+            # because replace() will swallow the nanosecond part
+            # thus last bin maybe slightly before the end if the end contains
+            # nanosecond part and lead to `Values falls after last bin` error
+            # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback
+            # has noted that ambiguous=True provides the most sensible result
+            return date_range(
+                freq=self.freq,
+                start=first,
+                end=last,
+                tz=tz,
+                name=ax.name,
+                ambiguous=True,
+                nonexistent="shift_forward",
+                unit=ax.unit,
             )
-            return binner, [], labels
 
-        first, last = _get_timestamp_range_edges(
-            ax.min(),
-            ax.max(),
-            self.freq,
-            unit=ax.unit,
-            closed=self.closed,
-            origin=self.origin,
-            offset=self.offset,
-        )
-        # GH #12037
-        # use first/last directly instead of call replace() on them
-        # because replace() will swallow the nanosecond part
-        # thus last bin maybe slightly before the end if the end contains
-        # nanosecond part and lead to `Values falls after last bin` error
-        # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback
-        # has noted that ambiguous=True provides the most sensible result
-        binner = labels = date_range(
-            freq=self.freq,
-            start=first,
-            end=last,
-            tz=ax.tz,
-            name=ax.name,
-            ambiguous=True,
-            nonexistent="shift_forward",
-            unit=ax.unit,
-        )
+        if ax.tz is None:
+            binner = labels = _calculate_bins_in_timezone(ax, None)
+        else:
+            try:
+                binner = labels = _calculate_bins_in_timezone(ax, ax.tz)
+            except Exception as err:
+                # The failure is identified by its message rather than its type;
+                # anything unrelated to DST localization is re-raised unchanged.
+                msg = str(err).lower()
+                if "nonexistent" not in msg and "ambiguous" not in msg:
+                    raise
+                # GH 62601: retry with the axis converted to UTC, then convert
+                # the resulting bin edges back to the original timezone.
+                ax_utc = ax.tz_convert("UTC")
+                binner_utc = _calculate_bins_in_timezone(ax_utc, "UTC")
+                binner = labels = binner_utc.tz_convert(ax.tz)
 
         ax_values = ax.asi8
         binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
diff --git a/pandas/tests/resample/test_dst_handling.py b/pandas/tests/resample/test_dst_handling.py
new file mode 100644
index 0000000000000..bc7d0100efab5
--- /dev/null
+++ b/pandas/tests/resample/test_dst_handling.py
@@ -0,0 +1,242 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+)
+
+
+class TestResampleDSTAfricaCairo:
+    """DST transition tests for Africa/Cairo timezone."""
+
+    def test_resample_across_dst_transition(self):
+        df = DataFrame(
+            {"value": [1, 2]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-26 01:00:00",
+                    "2024-04-27 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 2
+        assert isinstance(result.index, DatetimeIndex)
+        assert result.index.tz is not None
+        assert not result.isna().any().any()
+
+    def test_resample_before_dst_boundary(self):
+        df = DataFrame(
+            {"value": [76.0, 42.0]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-24 00:00:00",
+                    "2024-04-25 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 2
+        assert isinstance(result.index, DatetimeIndex)
+        assert "Africa/Cairo" in str(result.index.tz)
+        assert result.iloc[0, 0] == 76.0
+        assert result.iloc[1, 0] == 42.0
+
+    @pytest.mark.parametrize("freq", ["2h", "6h", "12h"])
+    def test_resample_various_freq(self, freq):
+        df = DataFrame(
+            {"value": [1, 2, 3, 4, 5]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-25 22:00:00",
+                    "2024-04-25 23:00:00",
+                    "2024-04-26 01:00:00",
+                    "2024-04-26 02:00:00",
+                    "2024-04-26 03:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample(freq).mean()
+
+        assert isinstance(result, DataFrame)
+        assert len(result) > 0
+        assert not result.isna().all().any()
+
+    @pytest.mark.parametrize("closed", ["left", "right"])
+    @pytest.mark.parametrize("label", ["left", "right"])
+    def test_resample_closed_label_combinations(self, closed, label):
+        df = DataFrame(
+            {"value": [1, 2]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-26 01:00:00",
+                    "2024-04-27 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("D", closed=closed, label=label).mean()
+
+        assert len(result) >= 1
+        assert not result.isna().all().any()
+
+    def test_resample_nonexistent_times(self):
+        timestamps = [
+            "2024-04-25 23:00:00",
+            "2024-04-26 00:30:00",
+            "2024-04-26 01:00:00",
+        ]
+
+        df = DataFrame(
+            {"value": [1, 2, 3]},
+            index=DatetimeIndex(timestamps).tz_localize(
+                "Africa/Cairo", nonexistent="shift_forward"
+            ),
+        )
+
+        result = df.resample("h").mean()
+
+        assert len(result) > 0
+        assert isinstance(result, DataFrame)
+
+    def test_resample_empty_dataframe(self):
+        df = DataFrame({"value": []}, index=DatetimeIndex([], tz="Africa/Cairo"))
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 0
+        assert isinstance(result.index, DatetimeIndex)
+
+    def test_resample_single_point(self):
+        df = DataFrame(
+            {"value": [42.0]},
+            index=DatetimeIndex(["2024-04-26 12:00:00"]).tz_localize(
+                "Africa/Cairo", nonexistent="shift_forward"
+            ),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 1
+        assert result.iloc[0, 0] == 42.0
+
+
+class TestResampleDSTMultipleTimezones:
+    """DST handling across multiple timezones."""
+
+    @pytest.mark.parametrize(
+        "tz, start, end",
+        [
+            ("Africa/Cairo", "2024-04-26 01:00:00", "2024-04-27 00:00:00"),
+            ("Europe/London", "2024-03-31 01:00:00", "2024-04-01 00:00:00"),
+            ("America/New_York", "2024-03-10 01:00:00", "2024-03-11 00:00:00"),
+        ],
+    )
+    def test_resample_multiple_timezones(self, tz, start, end):
+        df = DataFrame(
+            {"value": [1, 2]},
+            index=DatetimeIndex([start, end]).tz_localize(
+                tz, nonexistent="shift_forward", ambiguous=True
+            ),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) >= 1
+        assert isinstance(result.index, DatetimeIndex)
+        assert result.index.tz is not None
+
+
+class TestResampleDSTEdgeCases:
+    """Edge cases around DST transitions."""
+
+    def test_resample_multiple_dst_days(self):
+        df = DataFrame(
+            {"value": [1, 2, 3, 4]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-25 23:00:00",
+                    "2024-04-26 01:00:00",
+                    "2024-04-27 00:00:00",
+                    "2024-04-28 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) >= 3
+
+    def test_resample_microsecond_precision(self):
+        df = DataFrame(
+            {"value": [1.1, 2.2]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-26 01:00:00.123456",
+                    "2024-04-27 00:00:00.654321",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 2
+
+    def test_resample_with_na_values(self):
+        df = DataFrame(
+            {"value": [1.0, np.nan, 3.0]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-25 23:00:00",
+                    "2024-04-26 01:00:00",
+                    "2024-04-26 02:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("h").mean()
+
+        assert len(result) > 0
+        assert isinstance(result, DataFrame)
+
+
+class TestResampleDSTOriginalIssues:
+    """Tests reproducing the originally reported issues."""
+
+    def test_original_issue_1(self):
+        df = DataFrame(
+            {"value": [1, 2]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-26 01:00:00",
+                    "2024-04-27 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) > 0
+        assert not result.isna().any().any()
+
+    def test_original_issue_2(self):
+        df = DataFrame(
+            {"value": [76.0, 42.0]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-24 00:00:00",
+                    "2024-04-25 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) > 0
+        assert not result.isna().any().any()