Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ Other enhancements
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
- :meth:`DataFrame.corr` now uses two pass Welford's Method to improve numerical stability with precision for very large/small values (:issue:`59652`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it exists. You are simply using two-pass.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Must have conflated the two, will update

- :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`)
- :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
- Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`)
Expand Down
51 changes: 31 additions & 20 deletions pandas/_libs/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,9 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
float64_t[:, ::1] result
uint8_t[:, :] mask
int64_t nobs = 0
float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy, val
float64_t vx, vy, meanx, meany, divisor, ssqdmx, ssqdmy, cxy, val
float64_t ref_x, ref_y, dx, dy
bint ref_set

N, K = (<object>mat).shape
if minp is None:
Expand All @@ -358,39 +360,48 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):

with nogil:
for xi in range(K):
for yi in range(xi + 1):
for yi in range(xi+1):
# Welford's method for the variance-calculation
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
# Changed to Welford's two-pass for improved numeric stability
nobs = ssqdmx = ssqdmy = cxy = meanx = meany = 0
ref_set = False
for i in range(N):
if mask[i, xi] and mask[i, yi]:
nobs += 1
vx = mat[i, xi]
vy = mat[i, yi]
nobs += 1
if not ref_set:
ref_x = vx
ref_y = vy
ref_set = True

vx -= ref_x
vy -= ref_y
dx = vx - meanx
dy = vy - meany
meanx += 1. / nobs * dx
meany += 1. / nobs * dy
meanx += dx / nobs
meany += dy / nobs
cxy += dx * (vy - meany)
ssqdmx += (vx - meanx) * dx
ssqdmy += (vy - meany) * dy
covxy += (vx - meanx) * dy

if nobs < minpv:
result[xi, yi] = result[yi, xi] = NaN
continue

divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
# clip `cxy / divisor` to ensure coeff is within bounds
if divisor != 0:
val = cxy / divisor
if not cov:
if val > 1.0:
val = 1.0
elif val < -1.0:
val = -1.0
result[xi, yi] = result[yi, xi] = val
else:
divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)

# clip `covxy / divisor` to ensure coeff is within bounds
if divisor != 0:
val = covxy / divisor
if not cov:
if val > 1.0:
val = 1.0
elif val < -1.0:
val = -1.0
result[xi, yi] = result[yi, xi] = val
else:
result[xi, yi] = result[yi, xi] = NaN
result[xi, yi] = result[yi, xi] = NaN

return result.base

Expand Down
48 changes: 48 additions & 0 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,3 +493,51 @@ def test_cov_with_missing_values(self):
result2 = df.dropna().cov()
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)

pair_cases = [
np.array(
[[30.0, 30.100000381469727], [116.80000305175781, 116.8000030517578]],
dtype=np.longdouble,
),
np.array(
[[-30.0, 30.100000381469727], [116.80000305175781, -116.8000030517578]],
dtype=np.longdouble,
),
np.array([[1e-8, 3.42e-8], [2e-9, 3e-8]], dtype=np.longdouble),
np.array([[1e12, 1e-8], [1e12 + 1e-3, 2e-8]], dtype=np.longdouble),
np.array([[0.0, 1e-12], [1e-14, 0.0]], dtype=np.longdouble),
]

@pytest.mark.parametrize("values", pair_cases)
def test_pair_correlation(self, values):
df = DataFrame(values.T, dtype=np.longdouble)
result = df.corr(method="pearson")
expected = DataFrame(np.corrcoef(values[0], values[1]), dtype=np.float64)
tm.assert_frame_equal(result, expected)

multi_cases = [
np.array(
[[1e12, 1e-8, 5.5], [1e12 + 1e-3, 2e-8, 5.50000001]], dtype=np.longdouble
),
np.array(
[
[1e12, 1e12 + 1e-3, 1e12 + 2e-3],
[1e12 + 2e-3, 1e12 + 3e-3, 1e12 + 4e-3],
[1e12 + 1e-2, 1e12 + 1e-2, 1e12 + 1e-2],
],
dtype=np.longdouble,
),
np.array([[1e-8, 2e-8], [2e-8, 3e-8], [0.0, 1e-12]], dtype=np.longdouble),
]

@pytest.mark.parametrize("values", multi_cases)
def test_multi_correlation(self, values):
df = DataFrame(values.T, dtype=np.longdouble)
result = df.corr(method="pearson")
expected = DataFrame(
np.corrcoef(values),
index=range(values.shape[0]),
columns=range(values.shape[0]),
dtype=np.float64,
)
tm.assert_frame_equal(result, expected)
Loading