Skip to content

Commit 1f8c628

Browse files
init commit kendall spearman ordinal cats
1 parent 81f8d5d commit 1f8c628

File tree

4 files changed

+83
-2
lines changed

4 files changed

+83
-2
lines changed

pandas/core/frame.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11633,6 +11633,11 @@ def corr(
1163311633
data = self._get_numeric_data() if numeric_only else self
1163411634
cols = data.columns
1163511635
idx = cols.copy()
11636+
11637+
if method in ("spearman", "kendall"):
11638+
data = data._transform_ord_cat_cols_to_coded_cols()
11639+
11640+
1163611641
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
1163711642

1163811643
if method == "pearson":
@@ -11926,7 +11931,8 @@ def corrwith(
1192611931
correl = num / dom
1192711932

1192811933
elif method in ["kendall", "spearman"] or callable(method):
11929-
11934+
left = left._convert_ordered_cat_to_code()
11935+
right = right._convert_ordered_cat_to_code()
1193011936
def c(x):
1193111937
return nanops.nancorr(x[0], x[1], method=method)
1193211938

@@ -11957,6 +11963,25 @@ def c(x):
1195711963

1195811964
return correl
1195911965

11966+
def _transform_ord_cat_cols_to_coded_cols(self) -> DataFrame:
    """
    Replace ordered categorical columns with their integer category codes.

    Columns that are not ordered categoricals (including unordered
    categoricals) are left untouched. A code of ``-1``, which marks a
    missing value in a categorical, is replaced by ``NaN``.

    Returns
    -------
    DataFrame
        ``self`` itself when there is no ordered categorical column,
        otherwise a shallow copy in which every ordered categorical
        column has been replaced by its codes.
    """
    # Local import keeps this helper self-contained.
    from pandas.core.dtypes.dtypes import CategoricalDtype

    # Work with positions rather than labels so that frames with
    # duplicated column labels are handled correctly.
    ordered_positions = [
        pos
        for pos, dtype in enumerate(self.dtypes)
        if isinstance(dtype, CategoricalDtype) and dtype.ordered
    ]
    if not ordered_positions:
        return self

    # Shallow copy: only the converted columns are replaced, the caller's
    # frame keeps its categorical columns.
    data = self.copy(deep=False)
    for pos in ordered_positions:
        codes = self.iloc[:, pos].cat.codes
        data.isetitem(pos, codes.replace(-1, np.nan))
    return data
11984+
1196011985
# ----------------------------------------------------------------------
1196111986
# ndarray-like stats methods
1196211987

pandas/core/series.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2687,6 +2687,12 @@ def corr(
26872687
this, other = self.align(other, join="inner")
26882688
if len(this) == 0:
26892689
return np.nan
2690+
2691+
if method in ("spearman", "kendall"):
2692+
if this.dtype == "category" and this.cat.ordered:
2693+
this = this.cat.codes.replace(-1, np.nan)
2694+
if other.dtype == "category" and other.cat.ordered:
2695+
other = other.cat.codes.replace(-1, np.nan)
26902696

26912697
this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
26922698
other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)

pandas/tests/series/methods/test_cov_corr.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
)
1212
import pandas._testing as tm
1313

14-
1514
class TestSeriesCov:
1615
def test_cov(self, datetime_series):
1716
# full overlap
@@ -184,3 +183,31 @@ def test_corr_callable_method(self, datetime_series):
184183
df = pd.DataFrame([s1, s2])
185184
expected = pd.DataFrame([{0: 1.0, 1: 0}, {0: 0, 1: 1.0}])
186185
tm.assert_almost_equal(df.transpose().corr(method=my_corr), expected)
186+
187+
@pytest.mark.parametrize("method", ["kendall", "spearman"])
def test_corr_rank_ordered_categorical(self, method):
    # Rank correlation on an ordered categorical must equal what scipy
    # computes on the categorical's integer codes.
    stats = pytest.importorskip("scipy.stats")
    scipy_corr = {
        "kendall": stats.kendalltau,
        "spearman": stats.spearmanr,
    }[method]

    ser_ord_cat = pd.Series(
        pd.Categorical(
            ["low", "m", "h", "vh"],
            categories=["low", "m", "h", "vh"],
            ordered=True,
        )
    )
    ser_ord_cat_codes = ser_ord_cat.cat.codes.replace(-1, np.nan)
    ser_ord_int = pd.Series([0, 1, 2, 3])
    ser_ord_float = pd.Series([2.0, 3.0, 4.5, 6.5])

    # (argument given to Series.corr, equivalent argument given to scipy)
    cases = [
        (ser_ord_int, ser_ord_int),
        (ser_ord_float, ser_ord_float),
        (ser_ord_cat, ser_ord_cat_codes),
    ]
    for other, other_for_scipy in cases:
        corr_calc = ser_ord_cat.corr(other, method=method)
        corr_expected = scipy_corr(ser_ord_cat_codes, other_for_scipy)[0]
        tm.assert_almost_equal(corr_calc, corr_expected)
213+

test_corr.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
"""Manual smoke check for Spearman correlation with ordered categoricals.

Run this file directly to print the results. Nothing is executed on import,
so test collection that imports the file does not trigger the computations.
"""
import pandas as pd


def build_ordered_frame(levels):
    """Return a frame with numeric column 'a' and ordered categorical 'b'.

    ``levels`` supplies both the values of 'b' and, in the same order, its
    category order, so the codes of 'b' are 0, 1, 2, 3.
    """
    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": levels})
    df["b"] = df["b"].astype("category").cat.set_categories(levels, ordered=True)
    return df


def main():
    """Print Spearman correlations for the three sample inputs."""
    # Ordered categorical with numeric categories.
    df = build_ordered_frame([4, 3, 2, 1])
    crr = df.corr(method="spearman")
    print(crr)

    # Ordered categorical with string categories.
    df = build_ordered_frame(["vh", "h", "m", "l"])
    print(df)
    print(df.dtypes)
    crr = df.corr(method="spearman")
    print(crr)

    # Series-level correlation of an ordered categorical with itself.
    ser_ord_cat = pd.Series(
        pd.Categorical(
            ["vh", "h", "m", "low"],
            categories=["vh", "h", "m", "low"],
            ordered=True,
        )
    )
    print(ser_ord_cat)
    crr = ser_ord_cat.corr(ser_ord_cat, method="spearman")
    print(crr)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)