Skip to content

Commit d1c1e83

Browse files
committed
Implemented two-pass Welford's method for improved numeric stability
1 parent 28fb765 commit d1c1e83

File tree

3 files changed

+47
-52
lines changed

3 files changed

+47
-52
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ Other enhancements
170170
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
171171
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
172172
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
173+
- :meth:`DataFrame.corr` now uses a two-pass algorithm (the means are computed first, then the sums of squared deviations from them) to improve numerical stability and precision for very large or very small values (:issue:`59652`)
173174
- :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`)
174175
- :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
175176
- Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`)

pandas/_libs/algos.pyx

Lines changed: 37 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -345,8 +345,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
345345
float64_t[:, ::1] result
346346
uint8_t[:, :] mask
347347
int64_t nobs = 0
348-
float64_t xx, xy, meanx, meany, divisor, number, v1sq, v2sq, val
349-
float64_t* vx, vy
348+
float64_t vx, vy, meanx, meany, divisor, ssqdmx, ssqdmy, cxy, val
349+
float64_t sumx, sumy
350350

351351
N, K = (<object>mat).shape
352352
if minp is None:
@@ -358,55 +358,43 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
358358
mask = np.isfinite(mat).view(np.uint8)
359359

360360
with nogil:
361-
# for xi in range(K):
362-
# for yi in range(xi + 1):
361+
for xi in range(K):
362+
for yi in range(xi+1):
363363
# Welford's method for the variance-calculation
364364
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
365-
nobs = v1sq = v2sq = covxy = meanx = meany = 0
366-
for i in range(N):
367-
vx = ptr(mat[i,:])
368-
vy = mat[i,:]
369-
meanx = vx.mean()
370-
meany = vy.mean()
371-
for j in range(vx):
372-
xx = (vx[j]- meanx)
373-
yy = (vy[j]- meany)
374-
nobs += 1
375-
number += xx*yy
376-
v1sq += xx*xx
377-
v2sq += yy*yy
378-
val = number/sqrt(v1sq * v2sq)
379-
print(val)
380-
381-
# for i in range(N):
382-
# if mask[i, xi] and mask[i, yi]:
383-
# vx = mat[i, xi]
384-
# vy = mat[i, yi]
385-
# nobs += 1
386-
# dx = vx - meanx
387-
# dy = vy - meany
388-
# meanx += 1. / nobs * dx
389-
# meany += 1. / nobs * dy
390-
# ssqdmx += (vx - meanx) * dx
391-
# ssqdmy += (vy - meany) * dy
392-
# covxy += (vx - meanx) * dy
393-
394-
# if nobs < minpv:
395-
# result[xi, yi] = result[yi, xi] = NaN
396-
# else:esult[xi, yi] = result[yi, xi] = val
397-
# divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
398-
399-
# # clip `covxy / divisor` to ensure coeff is within bounds
400-
# if divisor != 0:
401-
# val = covxy / divisor
402-
# if not cov:
403-
# if val > 1.0:
404-
# val = 1.0
405-
# elif val < -1.0:
406-
# val = -1.0
407-
# result[xi, yi] = result[yi, xi] = val
408-
# else:
409-
# result[xi, yi] = result[yi, xi] = NaN
365+
# Two-pass algorithm for improved numeric stability: the first pass computes the means, the second accumulates deviations from them
366+
nobs = ssqdmx = ssqdmy = cxy = meanx = meany = 0
367+
sumx = sumy = 0
368+
for i in range(N):
369+
if mask[i, xi] and mask[i, yi]:
370+
sumx += mat[i, xi]
371+
sumy += mat[i, yi]
372+
nobs += 1
373+
if nobs < minpv:
374+
result[xi, yi] = result[yi, xi] = NaN
375+
continue
376+
meanx = sumx / nobs
377+
meany = sumy / nobs
378+
for i in range(N):
379+
if mask[i, xi] and mask[i, yi]:
380+
vx = mat[i, xi] - meanx
381+
vy = mat[i, yi] - meany
382+
cxy += vx * vy
383+
ssqdmx += vx * vx
384+
ssqdmy += vy * vy
385+
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
386+
387+
# clip `cxy / divisor` to ensure coeff is within bounds
388+
if divisor != 0:
389+
val = cxy / divisor
390+
if not cov:
391+
if val > 1.0:
392+
val = 1.0
393+
elif val < -1.0:
394+
val = -1.0
395+
result[xi, yi] = result[yi, xi] = val
396+
else:
397+
result[xi, yi] = result[yi, xi] = NaN
410398

411399
return result.base
412400

pandas/tests/frame/methods/test_cov_corr.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -210,11 +210,17 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method):
210210
@pytest.mark.parametrize("length", [2, 20, 200, 2000])
211211
def test_corr_for_constant_columns(self, length):
212212
# GH: 37448
213+
# Now matches numpy behavior: all-NaN only for length 2, all 1.0 for longer inputs
213214
df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
214215
result = df.corr()
215-
expected = DataFrame(
216-
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
217-
)
216+
if length == 2:
217+
expected = DataFrame(
218+
{"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
219+
)
220+
else:
221+
expected = DataFrame(
222+
{"A": [1., 1.], "B": [1., 1.]}, index=["A", "B"]
223+
)
218224
tm.assert_frame_equal(result, expected)
219225

220226
def test_calc_corr_small_numbers(self):

0 commit comments

Comments
 (0)