fix(rolling.skew): handle outliers in window

Alvaro-Kothe · Alvaro-Kothe · commit 23345cff57c8 · 2025-10-26T13:23:24.000-03:00
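This change updates the mean and the second and third central moments
online as values enter and leave the window, instead of accumulating
power sums of values that were centered once around a rounded global
mean. Additionally, it detects possible catastrophic cancellation by
checking for large relative changes in the third central moment, and
rebuilds the window from scratch when that happens.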
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -1,6 +1,7 @@
 # cython: boundscheck=False, wraparound=False, cdivision=True
 
 from libc.math cimport (
+    fabs,
     round,
     signbit,
     sqrt,
@@ -482,196 +483,149 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
 
 
 cdef float64_t calc_skew(int64_t minp, int64_t nobs,
-                         float64_t x, float64_t xx, float64_t xxx,
-                         int64_t num_consecutive_same_value
+                         float64_t mean, float64_t m2, float64_t m3,
                          ) noexcept nogil:
     cdef:
         float64_t result, dnobs
-        float64_t A, B, C, R
+        float64_t moments_ratio, correction
 
     if nobs >= minp:
         dnobs = <float64_t>nobs
-        A = x / dnobs
-        B = xx / dnobs - A * A
-        C = xxx / dnobs - A * A * A - 3 * A * B
 
         if nobs < 3:
             result = NaN
-        # GH 42064 46431
-        # uniform case, force result to be 0
-        elif num_consecutive_same_value >= nobs:
-            result = 0.0
-        # #18044: with uniform distribution, floating issue will
-        #         cause B != 0. and cause the result is a very
+        # #18044: with a degenerate distribution, floating-point error can
+        #         make m2 != 0 and cause the result to be a very
         #         large number.
         #
         #         in core/nanops.py nanskew/nankurt call the function
         #         _zero_out_fperr(m2) to fix floating error.
         #         if the variance is less than 1e-14, it could be
         #         treat as zero, here we follow the original
-        #         skew/kurt behaviour to check B <= 1e-14
-        elif B <= 1e-14:
+        #         skew/kurt check: m2 < n * (eps * mean)^2, or n * eps if |mean| <= eps
+        elif m2 < dnobs * (1e-28 * mean * mean if fabs(mean) > 1e-14 else 1e-14):
             result = NaN
         else:
-            R = sqrt(B)
-            result = ((sqrt(dnobs * (dnobs - 1.)) * C) /
-                      ((dnobs - 2) * R * R * R))
+            moments_ratio = m3 / (m2 * sqrt(m2))
+            correction = dnobs * sqrt((dnobs - 1)) / (dnobs - 2)
+            result = moments_ratio * correction
     else:
         result = NaN
 
     return result
 
 
 cdef void add_skew(float64_t val, int64_t *nobs,
-                   float64_t *x, float64_t *xx,
-                   float64_t *xxx,
-                   float64_t *compensation_x,
-                   float64_t *compensation_xx,
-                   float64_t *compensation_xxx,
-                   int64_t *num_consecutive_same_value,
-                   float64_t *prev_value,
+                   float64_t *mean, float64_t *m2,
+                   float64_t *m3,
+                   bint *numerically_unstable
                    ) noexcept nogil:
     """ add a value from the skew calc """
     cdef:
-        float64_t y, t
+        float64_t n, delta, delta_n, term1, m3_update, new_m3
 
     # Not NaN
     if val == val:
-        nobs[0] = nobs[0] + 1
+        nobs[0] += 1
+        n = <float64_t>(nobs[0])
+        delta = val - mean[0]
+        delta_n = delta / n
+        term1 = delta * delta_n * (n - 1.0)
 
-        y = val - compensation_x[0]
-        t = x[0] + y
-        compensation_x[0] = t - x[0] - y
-        x[0] = t
-        y = val * val - compensation_xx[0]
-        t = xx[0] + y
-        compensation_xx[0] = t - xx[0] - y
-        xx[0] = t
-        y = val * val * val - compensation_xxx[0]
-        t = xxx[0] + y
-        compensation_xxx[0] = t - xxx[0] - y
-        xxx[0] = t
+        m3_update = delta_n * (term1 * (n - 2.0) - 3.0 * m2[0])  # uses pre-update m2
+        new_m3 = m3[0] + m3_update
+        if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3):
+            # the update nearly cancels m3: flag precision loss for the caller
+            numerically_unstable[0] = True
 
-        # GH#42064, record num of same values to remove floating point artifacts
-        if val == prev_value[0]:
-            num_consecutive_same_value[0] += 1
-        else:
-            # reset to 1 (include current value itself)
-            num_consecutive_same_value[0] = 1
-        prev_value[0] = val
+        m3[0] = new_m3
+        m2[0] += term1
+        mean[0] += delta_n
 
 
 cdef void remove_skew(float64_t val, int64_t *nobs,
-                      float64_t *x, float64_t *xx,
-                      float64_t *xxx,
-                      float64_t *compensation_x,
-                      float64_t *compensation_xx,
-                      float64_t *compensation_xxx) noexcept nogil:
+                      float64_t *mean, float64_t *m2,
+                      float64_t *m3,
+                      bint *numerically_unstable) noexcept nogil:
     """ remove a value from the skew calc """
     cdef:
-        float64_t y, t
+        float64_t n, delta, delta_n, term1, m3_update, new_m3
 
     # Not NaN
     if val == val:
-        nobs[0] = nobs[0] - 1
+        nobs[0] -= 1
+        n = <float64_t>(nobs[0])
+        delta = val - mean[0]
+        delta_n = delta / n
+        term1 = delta_n * delta * (n + 1.0)
 
-        y = - val - compensation_x[0]
-        t = x[0] + y
-        compensation_x[0] = t - x[0] - y
-        x[0] = t
-        y = - val * val - compensation_xx[0]
-        t = xx[0] + y
-        compensation_xx[0] = t - xx[0] - y
-        xx[0] = t
-        y = - val * val * val - compensation_xxx[0]
-        t = xxx[0] + y
-        compensation_xxx[0] = t - xxx[0] - y
-        xxx[0] = t
+        m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0])  # uses pre-removal m2
+        new_m3 = m3[0] - m3_update
+        # n == 0: the last observation left, so delta / n above is inf or NaN
+        if nobs[0] <= 0 or fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3):
+            # emptied window or catastrophic cancellation: force a recompute
+            numerically_unstable[0] = True
+
+        m3[0] = new_m3
+        m2[0] -= term1
+        mean[0] -= delta_n
 
 
 def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
               ndarray[int64_t] end, int64_t minp) -> np.ndarray:
     cdef:
         Py_ssize_t i, j
-        float64_t val, min_val, mean_val, sum_val = 0
-        float64_t compensation_xxx_add, compensation_xxx_remove
-        float64_t compensation_xx_add, compensation_xx_remove
-        float64_t compensation_x_add, compensation_x_remove
-        float64_t x, xx, xxx
-        float64_t prev_value
-        int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
-        int64_t s, e, num_consecutive_same_value
-        ndarray[float64_t] output, values_copy
-        bint is_monotonic_increasing_bounds
+        float64_t val
+        float64_t mean, m2, m3
+        int64_t nobs = 0, N = len(start)
+        int64_t s, e
+        ndarray[float64_t] output
+        bint requires_recompute, numerically_unstable = False
 
     minp = max(minp, 3)
     is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
         start, end
     )
     output = np.empty(N, dtype=np.float64)
-    min_val = np.nanmin(values)
-    values_copy = np.copy(values)
 
     with nogil:
-        for i in range(0, V):
-            val = values_copy[i]
-            if val == val:
-                nobs_mean += 1
-                sum_val += val
-        mean_val = sum_val / nobs_mean
-        # Other cases would lead to imprecision for smallest values
-        if min_val - mean_val > -1e5:
-            mean_val = round(mean_val)
-            for i in range(0, V):
-                values_copy[i] = values_copy[i] - mean_val
-
         for i in range(0, N):
-
             s = start[i]
             e = end[i]
 
-            # Over the first window, observations can only be added
-            # never removed
-            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
-
-                prev_value = values[s]
-                num_consecutive_same_value = 0
-
-                compensation_xxx_add = compensation_xxx_remove = 0
-                compensation_xx_add = compensation_xx_remove = 0
-                compensation_x_add = compensation_x_remove = 0
-                x = xx = xxx = 0
-                nobs = 0
-                for j in range(s, e):
-                    val = values_copy[j]
-                    add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
-                             &compensation_xx_add, &compensation_xxx_add,
-                             &num_consecutive_same_value, &prev_value)
-
-            else:
+            requires_recompute = (
+                i == 0
+                or not is_monotonic_increasing_bounds
+                or s >= end[i - 1]
+                or numerically_unstable  # the previous full rebuild tripped the check
+            )
 
-                # After the first window, observations can both be added
-                # and removed
-                # calculate deletes
+            if not requires_recompute:  # incremental update from previous window
                 for j in range(start[i - 1], s):
-                    val = values_copy[j]
-                    remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove,
-                                &compensation_xx_remove, &compensation_xxx_remove)
+                    val = values[j]
+                    remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    val = values_copy[j]
-                    add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
-                             &compensation_xx_add, &compensation_xxx_add,
-                             &num_consecutive_same_value, &prev_value)
+                    val = values[j]
+                    add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
 
-            output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)
+            if requires_recompute or numerically_unstable:  # rebuild from scratch
+                numerically_unstable = False
+                mean = m2 = m3 = 0.0
+                nobs = 0
+
+                for j in range(s, e):
+                    val = values[j]
+                    add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable)
+
+            output[i] = calc_skew(minp, nobs, mean, m2, m3)
 
             if not is_monotonic_increasing_bounds:
                 nobs = 0
-                x = 0.0
-                xx = 0.0
-                xxx = 0.0
+                mean = 0.0
+                m2 = 0.0
+                m3 = 0.0
 
     return output
 
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1172,7 +1172,9 @@ def test_rolling_decreasing_indices(method):
     increasing = getattr(df.rolling(window=5), method)()
     decreasing = getattr(df_reverse.rolling(window=5), method)()
 
-    assert np.abs(decreasing.values[::-1][:-4] - increasing.values[4:]).max() < 1e-12
+    tm.assert_almost_equal(
+        decreasing.values[::-1][:-4], increasing.values[4:], atol=1e-12
+    )
 
 
 @pytest.mark.parametrize(
@@ -1438,13 +1440,18 @@ def test_rolling_skew_kurt_numerical_stability(method):
 
 
 @pytest.mark.parametrize(
-    "method, data, values",
+    ("method", "data", "values"),
     [
         (
             "skew",
             [3000000, 1, 1, 2, 3, 4, 999],
             [np.nan] * 3 + [2.0, 0.854563, 0.0, 1.999984],
         ),
+        (
+            "skew",
+            [1e6, -1e6, 1, 2, 3, 4, 5, 6],
+            [np.nan] * 3 + [-5.51135192e-06, -2.0, 0.0, 0.0, 0.0],
+        ),
         (
             "kurt",
             [3000000, 1, 1, 2, 3, 4, 999],