pandas-dev · eicchen · Oct 18, 2025 · Oct 19, 2025 · Oct 19, 2025 · Oct 19, 2025
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -170,6 +170,7 @@ Other enhancements
 - :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
 - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
 - :meth:`DataFrame.agg` called with ``axis=1`` and a ``func`` which relabels the result index now raises a ``NotImplementedError`` (:issue:`58807`).
+- :meth:`DataFrame.corr` now uses two pass Welford's Method to improve numerical stability with precision for very large/small values (:issue:`59652`)
 - :meth:`Index.get_loc` now accepts also subclasses of ``tuple`` as keys (:issue:`57922`)
 - :meth:`Styler.set_tooltips` provides alternative method to storing tooltips by using title attribute of td elements. (:issue:`56981`)
 - Added missing parameter ``weights`` in :meth:`DataFrame.plot.kde` for the estimation of the PDF (:issue:`59337`)

diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -345,7 +345,9 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
         float64_t[:, ::1] result
         uint8_t[:, :] mask
         int64_t nobs = 0
-        float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy, val
+        float64_t vx, vy, meanx, meany, divisor, ssqdmx, ssqdmy, cxy, val
+        float64_t ref_x, ref_y, dx, dy
+        bint ref_set
 
     N, K = (<object>mat).shape
     if minp is None:
@@ -358,39 +360,48 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
 
     with nogil:
         for xi in range(K):
-            for yi in range(xi + 1):
+            for yi in range(xi+1):
                 # Welford's method for the variance-calculation
                 # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-                nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
+                # Changed to Welford's two-pass for improved numeric stability
+                nobs = ssqdmx = ssqdmy = cxy = meanx = meany = 0
+                ref_set = False
                 for i in range(N):
                     if mask[i, xi] and mask[i, yi]:
+                        nobs += 1
                         vx = mat[i, xi]
                         vy = mat[i, yi]
-                        nobs += 1
+                        if not ref_set:
+                            ref_x = vx
+                            ref_y = vy
+                            ref_set = True
+
+                        vx -= ref_x
+                        vy -= ref_y
                         dx = vx - meanx
                         dy = vy - meany
-                        meanx += 1. / nobs * dx
-                        meany += 1. / nobs * dy
+                        meanx += dx / nobs
+                        meany += dy / nobs
+                        cxy += dx * (vy - meany)
                         ssqdmx += (vx - meanx) * dx
                         ssqdmy += (vy - meany) * dy
-                        covxy += (vx - meanx) * dy
 
                 if nobs < minpv:
                     result[xi, yi] = result[yi, xi] = NaN
+                    continue
+
+                divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
+                # clip `cxy / divisor` to ensure coeff is within bounds
+                if divisor != 0:
+                    val = cxy / divisor
+                    if not cov:
+                        if val > 1.0:
+                            val = 1.0
+                        elif val < -1.0:
+                            val = -1.0
+                    result[xi, yi] = result[yi, xi] = val
                 else:
-                    divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
-
-                    # clip `covxy / divisor` to ensure coeff is within bounds
-                    if divisor != 0:
-                        val = covxy / divisor
-                        if not cov:
-                            if val > 1.0:
-                                val = 1.0
-                            elif val < -1.0:
-                                val = -1.0
-                        result[xi, yi] = result[yi, xi] = val
-                    else:
-                        result[xi, yi] = result[yi, xi] = NaN
+                    result[xi, yi] = result[yi, xi] = NaN
 
     return result.base
 

diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -493,3 +493,51 @@ def test_cov_with_missing_values(self):
         result2 = df.dropna().cov()
         tm.assert_frame_equal(result1, expected)
         tm.assert_frame_equal(result2, expected)
+
+    pair_cases = [
+        np.array(
+            [[30.0, 30.100000381469727], [116.80000305175781, 116.8000030517578]],
+            dtype=np.longdouble,
+        ),
+        np.array(
+            [[-30.0, 30.100000381469727], [116.80000305175781, -116.8000030517578]],
+            dtype=np.longdouble,
+        ),
+        np.array([[1e-8, 3.42e-8], [2e-9, 3e-8]], dtype=np.longdouble),
+        np.array([[1e12, 1e-8], [1e12 + 1e-3, 2e-8]], dtype=np.longdouble),
+        np.array([[0.0, 1e-12], [1e-14, 0.0]], dtype=np.longdouble),
+    ]
+
+    @pytest.mark.parametrize("values", pair_cases)
+    def test_pair_correlation(self, values):
+        df = DataFrame(values.T, dtype=np.longdouble)
+        result = df.corr(method="pearson")
+        expected = DataFrame(np.corrcoef(values[0], values[1]), dtype=np.float64)
+        tm.assert_frame_equal(result, expected)
+
+    multi_cases = [
+        np.array(
+            [[1e12, 1e-8, 5.5], [1e12 + 1e-3, 2e-8, 5.50000001]], dtype=np.longdouble
+        ),
+        np.array(
+            [
+                [1e12, 1e12 + 1e-3, 1e12 + 2e-3],
+                [1e12 + 2e-3, 1e12 + 3e-3, 1e12 + 4e-3],
+                [1e12 + 1e-2, 1e12 + 1e-2, 1e12 + 1e-2],
+            ],
+            dtype=np.longdouble,
+        ),
+        np.array([[1e-8, 2e-8], [2e-8, 3e-8], [0.0, 1e-12]], dtype=np.longdouble),
+    ]
+
+    @pytest.mark.parametrize("values", multi_cases)
+    def test_multi_correlation(self, values):
+        df = DataFrame(values.T, dtype=np.longdouble)
+        result = df.corr(method="pearson")
+        expected = DataFrame(
+            np.corrcoef(values),
+            index=range(values.shape[0]),
+            columns=range(values.shape[0]),
+            dtype=np.float64,
+        )
+        tm.assert_frame_equal(result, expected)