diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index f84bedda8d00c..53ae4f1c419b4 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -2576,42 +2576,71 @@ def _get_time_bins(self, ax: DatetimeIndex):
             )
 
         if len(ax) == 0:
             binner = labels = DatetimeIndex(
                 data=[], freq=self.freq, name=ax.name, dtype=ax.dtype
             )
             return binner, [], labels
 
-        first, last = _get_timestamp_range_edges(
-            ax.min(),
-            ax.max(),
-            self.freq,
-            unit=ax.unit,
-            closed=self.closed,
-            origin=self.origin,
-            offset=self.offset,
-        )
-        # GH #12037
-        # use first/last directly instead of call replace() on them
-        # because replace() will swallow the nanosecond part
-        # thus last bin maybe slightly before the end if the end contains
-        # nanosecond part and lead to `Values falls after last bin` error
-        # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback
-        # has noted that ambiguous=True provides the most sensible result
-        binner = labels = date_range(
-            freq=self.freq,
-            start=first,
-            end=last,
-            tz=ax.tz,
-            name=ax.name,
-            ambiguous=True,
-            nonexistent="shift_forward",
-            unit=ax.unit,
-        )
+        try:
+            first, last = _get_timestamp_range_edges(
+                ax.min(),
+                ax.max(),
+                self.freq,
+                unit=ax.unit,
+                closed=self.closed,
+                origin=self.origin,
+                offset=self.offset,
+            )
+            # GH #12037
+            # use first/last directly instead of call replace() on them
+            # because replace() will swallow the nanosecond part
+            # thus last bin maybe slightly before the end if the end contains
+            # nanosecond part and lead to `Values falls after last bin` error
+            # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback
+            # has noted that ambiguous=True provides the most sensible result
+            binner = labels = date_range(
+                freq=self.freq,
+                start=first,
+                end=last,
+                tz=ax.tz,
+                name=ax.name,
+                ambiguous=True,
+                nonexistent="shift_forward",
+                unit=ax.unit,
+            )
+        except ValueError as err:
+            # Only recover when a bin edge falls into a DST gap of a tz-aware
+            # axis; any other failure is a genuine error and must propagate.
+            if ax.tz is None or "nonexistent" not in str(err).lower():
+                raise
+
+            # Every instant exists in UTC, so build the bins there and convert
+            # them back to the axis timezone afterwards.
+            # NOTE(review): these bins are anchored in UTC, so their labels may
+            # not fall on local wall-clock boundaries (e.g. local midnight).
+            ax_utc = ax.tz_convert("UTC")
+            first, last = _get_timestamp_range_edges(
+                ax_utc.min(),
+                ax_utc.max(),
+                self.freq,
+                unit=ax.unit,
+                closed=self.closed,
+                origin=self.origin,
+                offset=self.offset,
+            )
+            binner = labels = date_range(
+                freq=self.freq,
+                start=first,
+                end=last,
+                tz="UTC",
+                name=ax.name,
+                unit=ax.unit,
+            ).tz_convert(ax.tz)
 
         ax_values = ax.asi8
         binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
 
         # general version, knowing nothing about relative frequencies
         bins = lib.generate_bins_dt64(
             ax_values, bin_edges, self.closed, hasnans=ax.hasnans
         )
diff --git a/pandas/tests/resample/test_dst_handling.py b/pandas/tests/resample/test_dst_handling.py
new file mode 100644
index 0000000000000..bc7d0100efab5
--- /dev/null
+++ b/pandas/tests/resample/test_dst_handling.py
@@ -0,0 +1,240 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+)
+
+
+class TestResampleDSTAfricaCairo:
+    """DST transition tests for Africa/Cairo timezone."""
+
+    def test_resample_across_dst_transition(self):
+        df = DataFrame(
+            {"value": [1, 2]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-26 01:00:00",
+                    "2024-04-27 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 2
+        assert isinstance(result.index, DatetimeIndex)
+        assert result.index.tz is not None
+        assert not result.isna().any().any()
+
+    def test_resample_before_dst_boundary(self):
+        df = DataFrame(
+            {"value": [76.0, 42.0]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-24 00:00:00",
+                    "2024-04-25 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 2
+        assert isinstance(result.index, DatetimeIndex)
+        assert "Africa/Cairo" in str(result.index.tz)
+        assert result.iloc[0, 0] == 76.0
+        assert result.iloc[1, 0] == 42.0
+
+    @pytest.mark.parametrize("freq", ["2h", "6h", "12h"])
+    def test_resample_various_freq(self, freq):
+        df = DataFrame(
+            {"value": [1, 2, 3, 4, 5]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-25 22:00:00",
+                    "2024-04-25 23:00:00",
+                    "2024-04-26 01:00:00",
+                    "2024-04-26 02:00:00",
+                    "2024-04-26 03:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample(freq).mean()
+
+        assert isinstance(result, DataFrame)
+        assert len(result) > 0
+        assert not result.isna().all().any()
+
+    def test_resample_closed_label_combinations(self):
+        df = DataFrame(
+            {"value": [1, 2]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-26 01:00:00",
+                    "2024-04-27 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        for closed in ["left", "right"]:
+            for label in ["left", "right"]:
+                result = df.resample("D", closed=closed, label=label).mean()
+                assert len(result) >= 1
+                assert not result.isna().all().any()
+
+    def test_resample_nonexistent_times(self):
+        timestamps = [
+            "2024-04-25 23:00:00",
+            "2024-04-26 00:30:00",
+            "2024-04-26 01:00:00",
+        ]
+
+        df = DataFrame(
+            {"value": [1, 2, 3]},
+            index=DatetimeIndex(timestamps).tz_localize(
+                "Africa/Cairo", nonexistent="shift_forward"
+            ),
+        )
+
+        result = df.resample("h").mean()
+
+        assert len(result) > 0
+        assert isinstance(result, DataFrame)
+
+    def test_resample_empty_dataframe(self):
+        df = DataFrame({"value": []}, index=DatetimeIndex([], tz="Africa/Cairo"))
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 0
+        assert isinstance(result.index, DatetimeIndex)
+
+    def test_resample_single_point(self):
+        df = DataFrame(
+            {"value": [42.0]},
+            index=DatetimeIndex(["2024-04-26 12:00:00"]).tz_localize(
+                "Africa/Cairo", nonexistent="shift_forward"
+            ),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 1
+        assert result.iloc[0, 0] == 42.0
+
+
+class TestResampleDSTMultipleTimezones:
+    """DST handling across multiple timezones."""
+
+    def test_resample_multiple_timezones(self):
+        timezones = [
+            ("Africa/Cairo", "2024-04-26 01:00:00", "2024-04-27 00:00:00"),
+            ("Europe/London", "2024-03-31 01:00:00", "2024-04-01 00:00:00"),
+            ("America/New_York", "2024-03-10 01:00:00", "2024-03-11 00:00:00"),
+        ]
+
+        for tz, start, end in timezones:
+            df = DataFrame(
+                {"value": [1, 2]},
+                index=DatetimeIndex([start, end]).tz_localize(
+                    tz, nonexistent="shift_forward", ambiguous=True
+                ),
+            )
+
+            result = df.resample("D").mean()
+
+            assert len(result) >= 1
+            assert isinstance(result.index, DatetimeIndex)
+            assert result.index.tz is not None
+
+
+class TestResampleDSTEdgeCases:
+    """Edge cases around DST transitions."""
+
+    def test_resample_multiple_dst_days(self):
+        df = DataFrame(
+            {"value": [1, 2, 3, 4]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-25 23:00:00",
+                    "2024-04-26 01:00:00",
+                    "2024-04-27 00:00:00",
+                    "2024-04-28 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) >= 3
+
+    def test_resample_microsecond_precision(self):
+        df = DataFrame(
+            {"value": [1.1, 2.2]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-26 01:00:00.123456",
+                    "2024-04-27 00:00:00.654321",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) == 2
+
+    def test_resample_with_na_values(self):
+        df = DataFrame(
+            {"value": [1.0, np.nan, 3.0]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-25 23:00:00",
+                    "2024-04-26 01:00:00",
+                    "2024-04-26 02:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("h").mean()
+
+        assert len(result) > 0
+        assert isinstance(result, DataFrame)
+
+
+class TestResampleDSTOriginalIssues:
+    """Tests reproducing the originally reported issues."""
+
+    def test_original_issue_1(self):
+        df = DataFrame(
+            {"value": [1, 2]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-26 01:00:00",
+                    "2024-04-27 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo", nonexistent="shift_forward"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) > 0
+        assert not result.isna().any().any()
+
+    def test_original_issue_2(self):
+        df = DataFrame(
+            {"value": [76.0, 42.0]},
+            index=DatetimeIndex(
+                [
+                    "2024-04-24 00:00:00",
+                    "2024-04-25 00:00:00",
+                ]
+            ).tz_localize("Africa/Cairo"),
+        )
+
+        result = df.resample("D").mean()
+
+        assert len(result) > 0
+        assert not result.isna().any().any()