diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 65982ecdb810c..a2c19198885d3 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1260,6 +1260,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.skew` incorrectly computing skewness for windows following outliers due to numerical instability. The calculation now properly handles catastrophic cancellation by recomputing affected windows (:issue:`47461`) - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) - Bug in :meth:`Series.resample` raising error when resampling non-nanosecond resolutions out of bounds for nanosecond precision (:issue:`57427`) +- Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` computing incorrect results due to numerical instability. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index dccd93e8dafd9..89530c6c9c46c 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -332,19 +332,13 @@ cdef float64_t calc_var( int ddof, float64_t nobs, float64_t ssqdm_x, - int64_t num_consecutive_same_value ) noexcept nogil: cdef: float64_t result # Variance is unchanged if no observation is added or removed if (nobs >= minp) and (nobs > ddof): - - # pathological case & repeatedly same values case - if nobs == 1 or num_consecutive_same_value >= nobs: - result = 0 - else: - result = ssqdm_x / (nobs - ddof) + result = ssqdm_x / (nobs - ddof) else: result = NaN @@ -357,12 +351,12 @@ cdef void add_var( float64_t *mean_x, float64_t *ssqdm_x, float64_t *compensation, - int64_t *num_consecutive_same_value, - float64_t *prev_value, + bint *numerically_unstable, ) noexcept nogil: """ add a value from the var calc """ cdef: float64_t delta, prev_mean, y, t + float64_t prev_m2 = ssqdm_x[0] # GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan` if val != val: @@ -370,14 +364,6 @@ cdef void add_var( nobs[0] = nobs[0] + 1 - # GH#42064, record num of same values to remove floating point artifacts - if val == prev_value[0]: - num_consecutive_same_value[0] += 1 - else: - # reset to 1 (include current value itself) - num_consecutive_same_value[0] = 1 - prev_value[0] = val - # Welford's method for the online variance-calculation # using Kahan summation # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance @@ -392,17 +378,23 @@ cdef void add_var( mean_x[0] = 0 ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0]) + if prev_m2 * InvCondTol > ssqdm_x[0]: + # possible catastrophic cancellation + numerically_unstable[0] = True + cdef void remove_var( float64_t val, float64_t *nobs, float64_t *mean_x, float64_t *ssqdm_x, - float64_t *compensation + float64_t *compensation, + bint *numerically_unstable, ) noexcept nogil: """ remove a value from the var calc """ cdef: float64_t delta, prev_mean, y, t + float64_t prev_m2 = ssqdm_x[0] if val == val: nobs[0] = nobs[0] - 1 if nobs[0]: @@ -416,9 +408,14 @@ cdef void remove_var( delta = t mean_x[0] = mean_x[0] - delta / nobs[0] ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0]) + + if prev_m2 * InvCondTol > ssqdm_x[0]: + # possible catastrophic cancellation + numerically_unstable[0] = True else: mean_x[0] = 0 ssqdm_x[0] = 0 + numerically_unstable[0] = False def roll_var(const float64_t[:] values, ndarray[int64_t] start, @@ -428,11 +425,12 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, """ cdef: float64_t mean_x, ssqdm_x, nobs, compensation_add, - float64_t compensation_remove, prev_value - int64_t s, e, num_consecutive_same_value + float64_t compensation_remove + int64_t s, e Py_ssize_t i, j, N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds + bint requires_recompute, numerically_unstable minp = max(minp, 1) is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( @@ -449,32 +447,35 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, # Over the first window, observations can only be added # never removed - if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: - - prev_value = values[s] - num_consecutive_same_value = 0 - - mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0 - for j in range(s, e): - add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, - &num_consecutive_same_value, &prev_value) - - else: + requires_recompute = ( + i == 0 + or not is_monotonic_increasing_bounds + or s >= end[i - 1] + ) + if not requires_recompute: # After the first window, observations can both be added # and removed # calculate deletes for j in range(start[i - 1], s): remove_var(values[j], &nobs, &mean_x, &ssqdm_x, - &compensation_remove) + &compensation_remove, &numerically_unstable) # calculate adds for j in range(end[i - 1], e): add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, - &num_consecutive_same_value, &prev_value) + &numerically_unstable) + + if requires_recompute or numerically_unstable: + + mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0 + for j in range(s, e): + add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add, + &numerically_unstable) + numerically_unstable = False - output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) if not is_monotonic_increasing_bounds: nobs = 0.0 diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 5b00aeed79db6..dff307b595a3a 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -8,9 +8,6 @@ from pandas.compat import ( IS64, - is_platform_arm, - is_platform_power, - is_platform_riscv64, ) from pandas.errors import Pandas4Warning @@ -1085,27 +1082,91 @@ def test_rolling_sem(frame_or_series): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - is_platform_arm() or is_platform_power() or is_platform_riscv64(), - reason="GH 38921", -) @pytest.mark.parametrize( - ("func", "third_value", "values"), + ("func", "values", "window", "ddof", "expected_values"), [ - ("var", 1, [5e33, 0, 0.5, 0.5, 2, 0]), - ("std", 1, [7.071068e16, 0, 0.7071068, 0.7071068, 1.414214, 0]), - ("var", 2, [5e33, 0.5, 0, 0.5, 2, 0]), - ("std", 2, [7.071068e16, 0.7071068, 0, 0.7071068, 1.414214, 0]), + ("var", [99999999999999999, 1, 1, 2, 3, 1, 1], 2, 1, [5e33, 0, 0.5, 0.5, 2, 0]), + ( + "std", + [99999999999999999, 1, 1, 2, 3, 1, 1], + 2, + 1, + [7.071068e16, 0, 0.7071068, 0.7071068, 1.414214, 0], + ), + ("var", [99999999999999999, 1, 2, 2, 3, 1, 1], 2, 1, [5e33, 0.5, 0, 0.5, 2, 0]), + ( + "std", + [99999999999999999, 1, 2, 2, 3, 1, 1], + 2, + 1, + [7.071068e16, 0.7071068, 0, 0.7071068, 1.414214, 0], + ), + ( + "std", + [1.2e03, 1.3e17, 1.5e17, 1.995e03, 1.990e03], + 2, + 1, + [9.192388e16, 1.414214e16, 1.060660e17, 3.535534e00], + ), + ( + "var", + [ + 0.00000000e00, + 0.00000000e00, + 3.16188252e-18, + 2.95781651e-16, + 2.23153542e-51, + 0.00000000e00, + 0.00000000e00, + 5.39943432e-48, + 1.38206260e-73, + 0.00000000e00, + ], + 3, + 1, + [ + 3.33250036e-036, + 2.88538519e-032, + 2.88538519e-032, + 2.91622617e-032, + 1.65991678e-102, + 9.71796366e-096, + 9.71796366e-096, + 9.71796366e-096, + ], + ), + ( + "std", + [1, -1, 0, 1, 3, 2, -2, 10000000000, 1, 2, 0, -2, 1, 3, 0, 1], + 6, + 1, + [ + 1.41421356e00, + 1.87082869e00, + 4.08248290e09, + 4.08248290e09, + 4.08248290e09, + 4.08248290e09, + 4.08248290e09, + 4.08248290e09, + 1.72240142e00, + 1.75119007e00, + 1.64316767e00, + ], + ), ], ) -def test_rolling_var_numerical_issues(func, third_value, values): - # GH: 37051 - ds = Series([99999999999999999, 1, third_value, 2, 3, 1, 1]) - result = getattr(ds.rolling(2), func)() - expected = Series([np.nan] + values) - tm.assert_series_equal(result, expected) +def test_rolling_var_correctness(func, values, window, ddof, expected_values): + # GH: 37051, 42064, 54518, 52407, 47721 + ts = Series(values) + result = getattr(ts.rolling(window=window), func)(ddof=ddof) + if result.last_valid_index(): + result = result[ + result.first_valid_index() : result.last_valid_index() + 1 + ].reset_index(drop=True) + expected = Series(expected_values) + tm.assert_series_equal(result, expected, atol=1e-55) # GH 42064 - # new `roll_var` will output 0.0 correctly tm.assert_series_equal(result == 0, expected == 0)