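Fix corr numerical stability: use a single-pass Welford update on values shifted by the first valid observation; expect NaN for constant columns; add pairwise and multi-column test cases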

eicchen · eicchen · commit 611a35157849 · 2025-10-25T15:42:35.000-05:00
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -346,7 +346,8 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
         uint8_t[:, :] mask
         int64_t nobs = 0
         float64_t vx, vy, meanx, meany, divisor, ssqdmx, ssqdmy, cxy, val
-        float64_t sumx, sumy
+        float64_t ref_x, ref_y, dx, dy
+        bint ref_set
 
     N, K = (<object>mat).shape
     if minp is None:
@@ -364,27 +365,33 @@ def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
                 # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-                # Changed to Welford's two-pass for improved numeric stability
+                # Shifted single-pass Welford update for improved numeric stability
                 nobs = ssqdmx = ssqdmy = cxy = meanx = meany = 0
-                sumx = sumy = 0
+                ref_set = False
                 for i in range(N):
                     if mask[i, xi] and mask[i, yi]:
-                        sumx += mat[i, xi]
-                        sumy += mat[i, yi]
                         nobs += 1
+                        vx = mat[i, xi]
+                        vy = mat[i, yi]
+                        if not ref_set:
+                            ref_x = vx
+                            ref_y = vy
+                            ref_set = True
+
+                        vx -= ref_x
+                        vy -= ref_y
+                        dx = vx - meanx
+                        dy = vy - meany
+                        meanx += dx / nobs
+                        meany += dy / nobs
+                        cxy += dx * (vy - meany)
+                        ssqdmx += (vx - meanx) * dx
+                        ssqdmy += (vy - meany) * dy
+
                 if nobs < minpv:
                     result[xi, yi] = result[yi, xi] = NaN
                     continue
-                meanx = sumx / nobs
-                meany = sumy / nobs
-                for i in range(N):
-                    if mask[i, xi] and mask[i, yi]:
-                        vx = mat[i, xi] - meanx
-                        vy = mat[i, yi] - meany
-                        cxy += vx * vy
-                        ssqdmx += vx * vx
-                        ssqdmy += vy * vy
-                divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
 
-                # clip `covxy / divisor` to ensure coeff is within bounds
+                divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
+                # clip `cxy / divisor` to ensure coeff is within bounds
                 if divisor != 0:
                     val = cxy / divisor
                     if not cov:
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
@@ -210,15 +210,11 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method):
     @pytest.mark.parametrize("length", [2, 20, 200, 2000])
     def test_corr_for_constant_columns(self, length):
         # GH: 37448
-        # now matches numpy behavior
         df = DataFrame(length * [[0.4, 0.1]], columns=["A", "B"])
         result = df.corr()
-        if length == 2:
-            expected = DataFrame(
-                {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
-            )
-        else:
-            expected = DataFrame({"A": [1.0, 1.0], "B": [1.0, 1.0]}, index=["A", "B"])
+        expected = DataFrame(
+            {"A": [np.nan, np.nan], "B": [np.nan, np.nan]}, index=["A", "B"]
+        )
         tm.assert_frame_equal(result, expected)
 
     def test_calc_corr_small_numbers(self):
@@ -498,11 +494,50 @@ def test_cov_with_missing_values(self):
         tm.assert_frame_equal(result1, expected)
         tm.assert_frame_equal(result2, expected)
 
-    def test_close_corr(self):
-        values = np.array(
-            [[30.0, 30.100000381469727], [116.80000305175781, 116.8000030517578]]
-        )
-        df = DataFrame(values.T)
+    pair_cases = [
+        np.array(
+            [[30.0, 30.100000381469727], [116.80000305175781, 116.8000030517578]],
+            dtype=np.longdouble,
+        ),
+        np.array(
+            [[-30.0, 30.100000381469727], [116.80000305175781, -116.8000030517578]],
+            dtype=np.longdouble,
+        ),
+        np.array([[1e-8, 3.42e-8], [2e-9, 3e-8]], dtype=np.longdouble),
+        np.array([[1e12, 1e-8], [1e12 + 1e-3, 2e-8]], dtype=np.longdouble),
+        np.array([[0.0, 1e-12], [1e-14, 0.0]], dtype=np.longdouble),
+    ]
+
+    @pytest.mark.parametrize("values", pair_cases)
+    def test_pair_correlation(self, values):
+        df = DataFrame(values.T, dtype=np.longdouble)
         result = df.corr(method="pearson")
-        expected = DataFrame(np.corrcoef(values[0], values[1]))
+        expected = DataFrame(np.corrcoef(values[0], values[1]), dtype=np.float64)
+        tm.assert_frame_equal(result, expected)
+
+    multi_cases = [
+        np.array(
+            [[1e12, 1e-8, 5.5], [1e12 + 1e-3, 2e-8, 5.50000001]], dtype=np.longdouble
+        ),
+        np.array(
+            [
+                [1e12, 1e12 + 1e-3, 1e12 + 2e-3],
+                [1e12 + 2e-3, 1e12 + 3e-3, 1e12 + 4e-3],
+                [1e12 + 1e-2, 1e12 + 1e-2, 1e12 + 1e-2],
+            ],
+            dtype=np.longdouble,
+        ),
+        np.array([[1e-8, 2e-8], [2e-8, 3e-8], [0.0, 1e-12]], dtype=np.longdouble),
+    ]
+
+    @pytest.mark.parametrize("values", multi_cases)
+    def test_multi_correlation(self, values):
+        df = DataFrame(values.T, dtype=np.longdouble)
+        result = df.corr(method="pearson")
+        expected = DataFrame(
+            np.corrcoef(values),
+            index=range(values.shape[0]),
+            columns=range(values.shape[0]),
+            dtype=np.float64,
+        )
         tm.assert_frame_equal(result, expected)