Skip to content

Commit 679fd45

Browse files
TST: groupby.sum large integers: add regression tests
1 parent aa4ba3e commit 679fd45

File tree

1 file changed

+135
-0
lines changed

1 file changed

+135
-0
lines changed
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import numpy as np
2+
import pandas as pd
3+
import pytest
4+
5+
6+
def test_groupby_sum_single_group_large_int64_matches_df_sum():
    """A single-group int64 ``groupby.sum`` must equal the plain column sum.

    The expected total ``2**60 + 14`` is not exactly representable as a
    float64, so a float round-trip inside the reduction would change it.
    """
    df = pd.DataFrame({"gb": ["A", "A"], "val": pd.Series([14, 2**60], dtype="int64")})
    # Sanity-check the fixture itself before testing the reduction.
    assert df["val"].dtype == "int64"

    result = df.groupby("gb")["val"].sum()

    # Dtype first: a float result is the failure mode this test guards against.
    assert result.dtype == "int64"
    assert result.iloc[0] == df["val"].sum()
    # Also compare as exact Python ints; a mixed float/int NumPy comparison
    # may coerce to float64 and could mask lost low-order bits.
    assert int(result.iloc[0]) == 2**60 + 14
13+
14+
15+
def test_groupby_sum_multi_groups_matches_series_sum_int64():
16+
vals = pd.Series([2**60, 14, 2**60 + 3, 7], dtype="int64")
17+
gb = pd.Series(["A", "A", "B", "B"])
18+
df = pd.DataFrame({"gb": gb, "val": vals})
19+
20+
got = df.groupby("gb")["val"].sum()
21+
exp = pd.Series(
22+
{"A": vals.iloc[:2].sum(), "B": vals.iloc[2:].sum()},
23+
dtype="int64",
24+
)
25+
exp.index.name = "gb"
26+
exp.name = "val" # <- this aligns the Series name with got
27+
28+
pd.testing.assert_series_equal(got, exp)
29+
30+
31+
@pytest.mark.parametrize(
    "dtype, big, small",
    [
        ("int64", 2**60, 123),
        ("uint64", np.uint64(2**60), np.uint64(123)),
    ],
)
def test_groupby_sum_preserves_dtype_no_float_cast(dtype, big, small):
    """``groupby.sum`` keeps the integer dtype and the exact integer value.

    Parameters
    ----------
    dtype : str
        Integer dtype name used for the value column and expected of the result.
    big, small : int or numpy.uint64
        Group "A" holds ``big`` and ``small``; group "B" holds ``big`` alone.
        ``big + small`` (``2**60 + 123``) is not exactly representable as float64.
    """
    df = pd.DataFrame(
        {"gb": ["A", "A", "B"], "val": pd.Series([big, small, big], dtype=dtype)}
    )

    out = df.groupby("gb")["val"].sum()

    assert out.dtype.name == dtype
    assert out.loc["A"] == pd.Series([big, small], dtype=dtype).sum()
    assert out.loc["B"] == big
    # Exact Python-int comparison, independent of pandas' own reductions.
    assert int(out.loc["A"]) == int(big) + int(small)
    assert int(out.loc["B"]) == int(big)
46+
47+
48+
def test_groupby_sum_nullable_uint64_min_count_behavior():
    """Nullable ``UInt64`` ``groupby.sum`` honors ``min_count`` and keeps its dtype.

    The single group holds one missing and one valid value, so
    ``min_count=2`` must produce ``pd.NA`` while ``min_count=1`` must
    return the valid value unchanged.
    """
    s = pd.Series([pd.NA, np.uint64(2**60)], dtype="UInt64")
    df = pd.DataFrame({"gb": ["A", "A"], "val": s})
    grouped = df.groupby("gb")["val"]

    # Only one non-missing value is present, fewer than min_count=2.
    out_na = grouped.sum(min_count=2)
    assert out_na.dtype.name == "UInt64"
    assert out_na.iloc[0] is pd.NA

    out_ok = grouped.sum(min_count=1)
    assert out_ok.dtype.name == "UInt64"
    assert out_ok.iloc[0] == np.uint64(2**60)
    assert int(out_ok.iloc[0]) == 2**60
59+
60+
61+
def test_groupby_sum_nullable_all_na_respects_min_count():
    """An all-missing ``Int64`` group sums to ``pd.NA`` when ``min_count=1``.

    The contrasting ``min_count=0`` case is checked as well, where the
    empty sum is expected to be ``0`` rather than missing.
    """
    s = pd.Series([pd.NA, pd.NA], dtype="Int64")
    df = pd.DataFrame({"gb": ["A", "A"], "val": s})
    grouped = df.groupby("gb")["val"]

    out = grouped.sum(min_count=1)
    assert out.dtype.name == "Int64"
    assert out.iloc[0] is pd.NA

    # With no minimum required, the sum over zero valid values is 0.
    out_zero = grouped.sum(min_count=0)
    assert out_zero.dtype.name == "Int64"
    assert out_zero.iloc[0] == 0
67+
68+
69+
def test_groupby_sum_dataframe_multiple_integer_columns_preserve_dtypes():
    """Frame-level ``groupby.sum`` keeps int64/uint64 dtypes and exact values.

    Group "A" totals are ``2**60 + 5`` (int64) and ``2**54 + 10`` (uint64);
    neither is exactly representable as float64.
    """
    df = pd.DataFrame(
        {
            "gb": ["A", "A", "B"],
            "i64": pd.Series([2**60, 5, 7], dtype="int64"),
            "u64": pd.Series(
                [np.uint64(10), np.uint64(2**54), np.uint64(3)],
                dtype="uint64",
            ),
        }
    )

    got = df.groupby("gb")[["i64", "u64"]].sum()

    # Expected totals are written as exact literals so the check does not
    # depend on any other pandas reduction.
    exp = pd.DataFrame(
        {
            "i64": np.array([2**60 + 5, 7], dtype="int64"),
            "u64": np.array([2**54 + 10, 3], dtype="uint64"),
        },
        index=pd.Index(["A", "B"], name="gb"),  # groupby names the index after the key
    )

    pd.testing.assert_frame_equal(got, exp)
    assert got["i64"].dtype == "int64"
    assert got["u64"].dtype == "uint64"
109+
110+
111+
def test_groupby_sum_dataframe_nullable_integers_min_count_by_column():
    """Nullable ``Int64``/``UInt64`` columns apply ``min_count`` per column and group.

    Every (group, column) cell has exactly one non-missing value, so
    ``min_count=2`` must give ``pd.NA`` everywhere and ``min_count=1`` must
    return that single value.
    """
    df = pd.DataFrame(
        {
            "gb": ["A", "A", "A", "B"],
            "I": pd.Series([pd.NA, 2**60, pd.NA, 5], dtype="Int64"),
            "U": pd.Series([pd.NA, np.uint64(7), pd.NA, np.uint64(2)], dtype="UInt64"),
        }
    )
    grouped = df.groupby("gb")[["I", "U"]]

    out_na = grouped.sum(min_count=2)
    for group in ("A", "B"):
        for column in ("I", "U"):
            assert out_na.loc[group, column] is pd.NA
    assert out_na["I"].dtype.name == "Int64"
    assert out_na["U"].dtype.name == "UInt64"

    out_ok = grouped.sum(min_count=1)
    assert out_ok["I"].dtype.name == "Int64"
    assert out_ok["U"].dtype.name == "UInt64"
    assert out_ok.loc["A", "I"] == 2**60
    assert out_ok.loc["A", "U"] == np.uint64(7)
    assert out_ok.loc["B", "I"] == 5
    assert out_ok.loc["B", "U"] == np.uint64(2)

0 commit comments

Comments
 (0)