|
1 | 1 | # cython: boundscheck=False, wraparound=False, cdivision=True |
2 | 2 |
|
3 | 3 | from libc.math cimport ( |
| 4 | + fabs, |
4 | 5 | round, |
5 | 6 | signbit, |
6 | 7 | sqrt, |
@@ -482,196 +483,149 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start, |
482 | 483 |
|
483 | 484 |
|
484 | 485 | cdef float64_t calc_skew(int64_t minp, int64_t nobs, |
485 | | - float64_t x, float64_t xx, float64_t xxx, |
486 | | - int64_t num_consecutive_same_value |
| 486 | + float64_t mean, float64_t m2, float64_t m3, |
487 | 487 | ) noexcept nogil: |
488 | 488 | cdef: |
489 | 489 | float64_t result, dnobs |
490 | | - float64_t A, B, C, R |
| 490 | + float64_t moments_ratio, correction |
491 | 491 |
|
492 | 492 | if nobs >= minp: |
493 | 493 | dnobs = <float64_t>nobs |
494 | | - A = x / dnobs |
495 | | - B = xx / dnobs - A * A |
496 | | - C = xxx / dnobs - A * A * A - 3 * A * B |
497 | 494 |
|
498 | 495 | if nobs < 3: |
499 | 496 | result = NaN |
500 | | - # GH 42064 46431 |
501 | | - # uniform case, force result to be 0 |
502 | | - elif num_consecutive_same_value >= nobs: |
503 | | - result = 0.0 |
504 | | - # #18044: with uniform distribution, floating issue will |
505 | | - # cause B != 0. and cause the result is a very |
| 497 | + # #18044: with degenerate distribution, floating issue will |
| 498 | + # cause m2 != 0. and cause the result is a very |
506 | 499 | # large number. |
507 | 500 | # |
508 | 501 | # in core/nanops.py nanskew/nankurt call the function |
509 | 502 | # _zero_out_fperr(m2) to fix floating error. |
510 | 503 | # if the variance is less than 1e-14, it could be |
511 | 504 | # treat as zero, here we follow the original |
512 | | - # skew/kurt behaviour to check B <= 1e-14 |
513 | | - elif B <= 1e-14: |
| 505 | + # skew/kurt behaviour to check m2 < n * (eps * eps * mean * mean) |
| 506 | + elif m2 < dnobs * (1e-28 * mean * mean if fabs(mean) > 1e-14 else 1e-14): |
514 | 507 | result = NaN |
515 | 508 | else: |
516 | | - R = sqrt(B) |
517 | | - result = ((sqrt(dnobs * (dnobs - 1.)) * C) / |
518 | | - ((dnobs - 2) * R * R * R)) |
| 509 | + moments_ratio = m3 / (m2 * sqrt(m2)) |
| 510 | + correction = dnobs * sqrt((dnobs - 1)) / (dnobs - 2) |
| 511 | + result = moments_ratio * correction |
519 | 512 | else: |
520 | 513 | result = NaN |
521 | 514 |
|
522 | 515 | return result |
523 | 516 |
|
524 | 517 |
|
525 | 518 | cdef void add_skew(float64_t val, int64_t *nobs, |
526 | | - float64_t *x, float64_t *xx, |
527 | | - float64_t *xxx, |
528 | | - float64_t *compensation_x, |
529 | | - float64_t *compensation_xx, |
530 | | - float64_t *compensation_xxx, |
531 | | - int64_t *num_consecutive_same_value, |
532 | | - float64_t *prev_value, |
| 519 | + float64_t *mean, float64_t *m2, |
| 520 | + float64_t *m3, |
| 521 | + bint *numerically_unstable |
533 | 522 | ) noexcept nogil: |
534 | 523 | """ add a value from the skew calc """ |
535 | 524 | cdef: |
536 | | - float64_t y, t |
| 525 | + float64_t n, delta, delta_n, term1, m3_update, new_m3 |
537 | 526 |
|
538 | 527 | # Not NaN |
539 | 528 | if val == val: |
540 | | - nobs[0] = nobs[0] + 1 |
| 529 | + nobs[0] += 1 |
| 530 | + n = <float64_t>(nobs[0]) |
| 531 | + delta = val - mean[0] |
| 532 | + delta_n = delta / n |
| 533 | + term1 = delta * delta_n * (n - 1.0) |
541 | 534 |
|
542 | | - y = val - compensation_x[0] |
543 | | - t = x[0] + y |
544 | | - compensation_x[0] = t - x[0] - y |
545 | | - x[0] = t |
546 | | - y = val * val - compensation_xx[0] |
547 | | - t = xx[0] + y |
548 | | - compensation_xx[0] = t - xx[0] - y |
549 | | - xx[0] = t |
550 | | - y = val * val * val - compensation_xxx[0] |
551 | | - t = xxx[0] + y |
552 | | - compensation_xxx[0] = t - xxx[0] - y |
553 | | - xxx[0] = t |
| 535 | + m3_update = delta_n * (term1 * (n - 2.0) - 3.0 * m2[0]) |
| 536 | + new_m3 = m3[0] + m3_update |
| 537 | + if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): |
| 538 | + # possible catastrophic cancellation |
| 539 | + numerically_unstable[0] = True |
554 | 540 |
|
555 | | - # GH#42064, record num of same values to remove floating point artifacts |
556 | | - if val == prev_value[0]: |
557 | | - num_consecutive_same_value[0] += 1 |
558 | | - else: |
559 | | - # reset to 1 (include current value itself) |
560 | | - num_consecutive_same_value[0] = 1 |
561 | | - prev_value[0] = val |
| 541 | + m3[0] = new_m3 |
| 542 | + m2[0] += term1 |
| 543 | + mean[0] += delta_n |
562 | 544 |
|
563 | 545 |
|
564 | 546 | cdef void remove_skew(float64_t val, int64_t *nobs, |
565 | | - float64_t *x, float64_t *xx, |
566 | | - float64_t *xxx, |
567 | | - float64_t *compensation_x, |
568 | | - float64_t *compensation_xx, |
569 | | - float64_t *compensation_xxx) noexcept nogil: |
| 547 | + float64_t *mean, float64_t *m2, |
| 548 | + float64_t *m3, |
| 549 | + bint *numerically_unstable) noexcept nogil: |
570 | 550 | """ remove a value from the skew calc """ |
571 | 551 | cdef: |
572 | | - float64_t y, t |
| 552 | + float64_t n, delta, delta_n, term1, m3_update, new_m3 |
573 | 553 |
|
574 | 554 | # Not NaN |
575 | 555 | if val == val: |
576 | | - nobs[0] = nobs[0] - 1 |
| 556 | + nobs[0] -= 1 |
| 557 | + n = <float64_t>(nobs[0]) |
| 558 | + delta = val - mean[0] |
| 559 | + delta_n = delta / n |
| 560 | + term1 = delta_n * delta * (n + 1.0) |
577 | 561 |
|
578 | | - y = - val - compensation_x[0] |
579 | | - t = x[0] + y |
580 | | - compensation_x[0] = t - x[0] - y |
581 | | - x[0] = t |
582 | | - y = - val * val - compensation_xx[0] |
583 | | - t = xx[0] + y |
584 | | - compensation_xx[0] = t - xx[0] - y |
585 | | - xx[0] = t |
586 | | - y = - val * val * val - compensation_xxx[0] |
587 | | - t = xxx[0] + y |
588 | | - compensation_xxx[0] = t - xxx[0] - y |
589 | | - xxx[0] = t |
| 562 | + m3_update = delta_n * (term1 * (n + 2.0) - 3.0 * m2[0]) |
| 563 | + new_m3 = m3[0] - m3_update |
| 564 | + |
| 565 | + if fabs(m3_update) + fabs(m3[0]) > 1e10 * fabs(new_m3): |
| 566 | + # possible catastrophic cancellation |
| 567 | + numerically_unstable[0] = True |
| 568 | + |
| 569 | + m3[0] = new_m3 |
| 570 | + m2[0] -= term1 |
| 571 | + mean[0] -= delta_n |
590 | 572 |
|
591 | 573 |
|
592 | 574 | def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, |
593 | 575 | ndarray[int64_t] end, int64_t minp) -> np.ndarray: |
594 | 576 | cdef: |
595 | 577 | Py_ssize_t i, j |
596 | | - float64_t val, min_val, mean_val, sum_val = 0 |
597 | | - float64_t compensation_xxx_add, compensation_xxx_remove |
598 | | - float64_t compensation_xx_add, compensation_xx_remove |
599 | | - float64_t compensation_x_add, compensation_x_remove |
600 | | - float64_t x, xx, xxx |
601 | | - float64_t prev_value |
602 | | - int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0 |
603 | | - int64_t s, e, num_consecutive_same_value |
604 | | - ndarray[float64_t] output, values_copy |
605 | | - bint is_monotonic_increasing_bounds |
| 578 | + float64_t val |
| 579 | + float64_t mean, m2, m3 |
| 580 | + int64_t nobs = 0, N = len(start) |
| 581 | + int64_t s, e |
| 582 | + ndarray[float64_t] output |
| 583 | + bint requires_recompute, numerically_unstable = False |
606 | 584 |
|
607 | 585 | minp = max(minp, 3) |
608 | 586 | is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds( |
609 | 587 | start, end |
610 | 588 | ) |
611 | 589 | output = np.empty(N, dtype=np.float64) |
612 | | - min_val = np.nanmin(values) |
613 | | - values_copy = np.copy(values) |
614 | 590 |
|
615 | 591 | with nogil: |
616 | | - for i in range(0, V): |
617 | | - val = values_copy[i] |
618 | | - if val == val: |
619 | | - nobs_mean += 1 |
620 | | - sum_val += val |
621 | | - mean_val = sum_val / nobs_mean |
622 | | - # Other cases would lead to imprecision for smallest values |
623 | | - if min_val - mean_val > -1e5: |
624 | | - mean_val = round(mean_val) |
625 | | - for i in range(0, V): |
626 | | - values_copy[i] = values_copy[i] - mean_val |
627 | | - |
628 | 592 | for i in range(0, N): |
629 | | - |
630 | 593 | s = start[i] |
631 | 594 | e = end[i] |
632 | 595 |
|
633 | | - # Over the first window, observations can only be added |
634 | | - # never removed |
635 | | - if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: |
636 | | - |
637 | | - prev_value = values[s] |
638 | | - num_consecutive_same_value = 0 |
639 | | - |
640 | | - compensation_xxx_add = compensation_xxx_remove = 0 |
641 | | - compensation_xx_add = compensation_xx_remove = 0 |
642 | | - compensation_x_add = compensation_x_remove = 0 |
643 | | - x = xx = xxx = 0 |
644 | | - nobs = 0 |
645 | | - for j in range(s, e): |
646 | | - val = values_copy[j] |
647 | | - add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, |
648 | | - &compensation_xx_add, &compensation_xxx_add, |
649 | | - &num_consecutive_same_value, &prev_value) |
650 | | - |
651 | | - else: |
| 596 | + requires_recompute = ( |
| 597 | + i == 0 |
| 598 | + or not is_monotonic_increasing_bounds |
| 599 | + or s >= end[i - 1] |
| 600 | + or numerically_unstable |
| 601 | + ) |
652 | 602 |
|
653 | | - # After the first window, observations can both be added |
654 | | - # and removed |
655 | | - # calculate deletes |
| 603 | + if not requires_recompute: |
656 | 604 | for j in range(start[i - 1], s): |
657 | | - val = values_copy[j] |
658 | | - remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove, |
659 | | - &compensation_xx_remove, &compensation_xxx_remove) |
| 605 | + val = values[j] |
| 606 | + remove_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) |
660 | 607 |
|
661 | 608 | # calculate adds |
662 | 609 | for j in range(end[i - 1], e): |
663 | | - val = values_copy[j] |
664 | | - add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, |
665 | | - &compensation_xx_add, &compensation_xxx_add, |
666 | | - &num_consecutive_same_value, &prev_value) |
| 610 | + val = values[j] |
| 611 | + add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) |
667 | 612 |
|
668 | | - output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value) |
| 613 | + if requires_recompute or numerically_unstable: |
| 614 | + numerically_unstable = False |
| 615 | + mean = m2 = m3 = 0.0 |
| 616 | + nobs = 0 |
| 617 | + |
| 618 | + for j in range(s, e): |
| 619 | + val = values[j] |
| 620 | + add_skew(val, &nobs, &mean, &m2, &m3, &numerically_unstable) |
| 621 | + |
| 622 | + output[i] = calc_skew(minp, nobs, mean, m2, m3) |
669 | 623 |
|
670 | 624 | if not is_monotonic_increasing_bounds: |
671 | 625 | nobs = 0 |
672 | | - x = 0.0 |
673 | | - xx = 0.0 |
674 | | - xxx = 0.0 |
| 626 | + mean = 0.0 |
| 627 | + m2 = 0.0 |
| 628 | + m3 = 0.0 |
675 | 629 |
|
676 | 630 | return output |
677 | 631 |
|
|
0 commit comments