diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 259470a4f1513..bf5c99e7d938b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1149,6 +1149,8 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) +- Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` where the end of window was not indexed correctly. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`) +- Bug in :meth:`DataFrameGroupBy.agg` where ``numeric_only=True`` was ignored when passing a list of aggregation functions (:issue:`49352`) Reshaping ^^^^^^^^^ @@ -1202,6 +1204,7 @@ Other - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) - Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) +- Bug in :meth:`DataFrame.agg` where ``numeric_only=True`` was ignored when passing a list of aggregation functions, causing non-numeric columns to be included or raising ``TypeError`` (:issue:`49352`) - Bug in :meth:`DataFrame.apply` raising ``RecursionError`` when passing ``func=list[int]``. (:issue:`61565`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. 
(:issue:`58041`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 468f24a07cb4a..87d9ec13e2dc4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -837,6 +837,43 @@ def agg_or_apply_list_like( if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") + # GH#49352 - Handle numeric_only with list of functions + # When numeric_only=True is passed with a list of functions, filter + # to numeric columns before processing to avoid TypeError on non-numeric Series + if op_name == "agg" and kwargs.get("numeric_only", False): + # Check if obj is a DataFrame (not Series) with 2 dimensions + if isinstance(obj, ABCDataFrame) and obj.ndim == 2: + # Filter to numeric columns before processing + numeric_obj = obj.select_dtypes(include="number") + + # Only proceed if we have numeric columns + if not numeric_obj.empty: + # Create kwargs without numeric_only to avoid + # passing it to Series methods + kwargs_filtered = { + k: v for k, v in kwargs.items() if k != "numeric_only" + } + + # Compute with filtered object and cleaned kwargs + keys, results = self.compute_list_like( + op_name, numeric_obj, kwargs_filtered + ) + result = self.wrap_results_list_like(keys, results) + return result + else: + # No numeric columns - return empty result + from pandas import DataFrame + + # Get function names for index + if isinstance(self.func, list): + keys = self.func # type: ignore[assignment] + elif isinstance(self.func, dict): + keys = list(self.func.keys()) + else: + keys = [] + + return DataFrame(index=keys) + keys, results = self.compute_list_like(op_name, obj, kwargs) result = self.wrap_results_list_like(keys, results) return result @@ -1629,6 +1666,19 @@ def agg_or_apply_list_like( else: selected_obj = obj._obj_with_exclusions + # GH#49352 - Handle numeric_only with list of functions for GroupBy + # Filter to numeric columns before processing to avoid TypeError + if op_name == "agg" and kwargs.get("numeric_only", False): + # 
For GroupBy, filter the selected object to numeric columns + if selected_obj.ndim == 2: + numeric_obj = selected_obj.select_dtypes(include="number") + + if not numeric_obj.empty: + # Update selected_obj to filtered numeric columns + selected_obj = numeric_obj + # Remove numeric_only from kwargs to avoid passing to Series methods + kwargs = {k: v for k, v in kwargs.items() if k != "numeric_only"} + # Only set as_index=True on groupby objects, not Window or Resample # that inherit from this class. with com.temp_setattr( diff --git a/pandas/tests/apply/test_frame_apply_numeric_only.py b/pandas/tests/apply/test_frame_apply_numeric_only.py new file mode 100644 index 0000000000000..1f76b62fdab13 --- /dev/null +++ b/pandas/tests/apply/test_frame_apply_numeric_only.py @@ -0,0 +1,224 @@ +""" +Tests for DataFrame.agg with numeric_only parameter and list of functions. +This tests the fix for GH#49352. +""" + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +class TestFrameAggNumericOnly: + """Tests for DataFrame.agg with numeric_only parameter and list of functions.""" + + def test_agg_list_numeric_only_mixed_dtypes(self): + """GH#49352 - Main test case from the issue.""" + df = DataFrame( + { + "A": [1, 2, 3, 4, 5], + "B": [10.5, 20.5, 30.5, 40.5, 50.5], + "C": ["a", "b", "c", "d", "e"], + } + ) + result = df.agg(["min", "max", "mean"], numeric_only=True) + expected = DataFrame( + {"A": [1.0, 5.0, 3.0], "B": [10.5, 50.5, 30.5]}, + index=["min", "max", "mean"], + ) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_all_numeric(self): + """Should work when all columns are numeric.""" + df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + {"A": [6.0, 2.0], "B": [60.0, 20.0]}, index=["sum", "mean"] + ) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_no_numeric(self): + 
"""Should return empty DataFrame when no numeric columns.""" + df = DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) + result = df.agg(["min", "max"], numeric_only=True) + expected = DataFrame(index=["min", "max"]) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "funcs,expected_index", + [ + (["sum", "mean"], ["sum", "mean"]), + ([np.sum, np.mean], ["sum", "mean"]), + (["sum", np.mean], ["sum", "mean"]), + ([np.sum, "mean"], ["sum", "mean"]), + ], + ) + def test_agg_list_numeric_only_various_function_types(self, funcs, expected_index): + """Test with different combinations of string and numpy functions.""" + df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30], "C": ["a", "b", "c"]}) + result = df.agg(funcs, numeric_only=True) + expected = DataFrame({"A": [6.0, 2.0], "B": [60.0, 20.0]}, index=expected_index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "funcs", + [ + ["min", "max"], + ["sum", "mean", "std"], + ["min", "max", "mean", "median"], + ], + ) + def test_agg_list_numeric_only_different_function_counts(self, funcs): + """Test with different numbers of functions.""" + df = DataFrame( + { + "A": [1, 2, 3, 4, 5], + "B": [10, 20, 30, 40, 50], + "C": ["a", "b", "c", "d", "e"], + } + ) + result = df.agg(funcs, numeric_only=True) + + # Verify structure + assert isinstance(result, DataFrame) + assert list(result.columns) == ["A", "B"] + assert list(result.index) == funcs + assert result.shape == (len(funcs), 2) + + @pytest.mark.parametrize( + "data,expected_cols", + [ + # Only integers + ({"A": [1, 2, 3], "B": [4, 5, 6], "C": ["x", "y", "z"]}, ["A", "B"]), + # Only floats + ({"A": [1.1, 2.2], "B": [3.3, 4.4], "C": ["x", "y"]}, ["A", "B"]), + # Mix of int and float + ({"int": [1, 2], "float": [1.5, 2.5], "str": ["a", "b"]}, ["int", "float"]), + # Single numeric column + ({"num": [1, 2, 3], "text": ["a", "b", "c"]}, ["num"]), + ], + ) + def test_agg_list_numeric_only_various_dtypes(self, data, expected_cols): 
+ """Test with various numeric dtype combinations.""" + df = DataFrame(data) + result = df.agg(["sum", "mean"], numeric_only=True) + + assert isinstance(result, DataFrame) + assert list(result.columns) == expected_cols + assert list(result.index) == ["sum", "mean"] + + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_agg_list_numeric_only_parameter_values(self, numeric_only): + """Test with different numeric_only parameter values.""" + df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) + + if numeric_only is None: + result = df.agg(["sum", "mean"]) + else: + result = df.agg(["sum", "mean"], numeric_only=numeric_only) + + expected = DataFrame({"A": [6, 2.0], "B": [60, 20.0]}, index=["sum", "mean"]) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_false_with_strings(self): + """Verify numeric_only=False works with min/max on strings.""" + df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}) + result = df.agg(["min", "max"], numeric_only=False) + expected = DataFrame({"A": [1, 3], "B": ["a", "c"]}, index=["min", "max"]) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_preserves_column_order(self): + """Test that column order is preserved.""" + df = DataFrame( + { + "Z": [1, 2, 3], + "A": [10, 20, 30], + "M": [100, 200, 300], + "text": ["a", "b", "c"], + } + ) + result = df.agg(["sum", "mean"], numeric_only=True) + + assert list(result.columns) == ["Z", "A", "M"] + + @pytest.mark.parametrize("single_func", ["sum", "mean", "min", "max"]) + def test_agg_single_function_still_works(self, single_func): + """Verify that single function (not a list) still works.""" + df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30], "C": ["a", "b", "c"]}) + result = df.agg(single_func, numeric_only=True) + + assert isinstance(result, pd.Series) + assert "A" in result.index + assert "B" in result.index + assert "C" not in result.index + + def test_agg_list_numeric_only_with_int_and_float(self): + """Test that 
both int and float columns are included.""" + df = DataFrame( + { + "int_col": [1, 2, 3], + "float_col": [1.5, 2.5, 3.5], + "str_col": ["a", "b", "c"], + } + ) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + {"int_col": [6.0, 2.0], "float_col": [7.5, 2.5]}, index=["sum", "mean"] + ) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_single_row(self): + """Test with single row DataFrame.""" + df = DataFrame({"A": [1], "B": [10], "C": ["x"]}) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + {"A": [1.0, 1.0], "B": [10.0, 10.0]}, index=["sum", "mean"] + ) + tm.assert_frame_equal(result, expected) + + # ========== NEW TESTS - Additional Edge Cases ========== + + def test_agg_list_numeric_only_with_nans(self): + """Test DataFrame with NaN values.""" + df = DataFrame( + {"A": [1, np.nan, 3], "B": [10, 20, np.nan], "C": ["x", "y", "z"]} + ) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + {"A": [4.0, 2.0], "B": [30.0, 15.0]}, index=["sum", "mean"] + ) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_with_datetime(self): + """Test that datetime columns are excluded with numeric_only=True.""" + df = DataFrame( + { + "num": [1, 2, 3], + "date": pd.date_range("2020-01-01", periods=3), + "text": ["a", "b", "c"], + } + ) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame({"num": [6.0, 2.0]}, index=["sum", "mean"]) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_large_dataframe(self): + """Test with a larger DataFrame for performance verification.""" + rng = np.random.default_rng(42) + df = DataFrame( + { + "A": rng.integers(1, 100, 1000), + "B": rng.standard_normal(1000), + "C": ["text"] * 1000, + } + ) + + result = df.agg(["sum", "mean", "std"], numeric_only=True) + + # Just verify structure, not exact values due to randomness + assert isinstance(result, DataFrame) + assert 
list(result.columns) == ["A", "B"] + assert list(result.index) == ["sum", "mean", "std"] + assert result.shape == (3, 2) diff --git a/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py b/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py new file mode 100644 index 0000000000000..efe1735df78db --- /dev/null +++ b/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py @@ -0,0 +1,275 @@ +""" +Tests for GroupBy.agg with numeric_only parameter and list of functions. +This tests the GroupBy part of the fix for GH#49352. +""" + +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +class TestGroupByAggNumericOnly: + """Tests for GroupBy.agg with numeric_only parameter and list of functions.""" + + def test_groupby_agg_list_numeric_only_basic(self): + """GH#49352 - Basic GroupBy aggregation with mixed dtypes.""" + df = DataFrame( + { + "key": ["A", "B", "A", "B", "A"], + "num1": [1, 2, 3, 4, 5], + "num2": [10, 20, 30, 40, 50], + "text": ["a", "b", "c", "d", "e"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + [[9, 3.0, 90, 30.0], [6, 3.0, 60, 30.0]], + index=pd.Index(["A", "B"], name="key"), + columns=pd.MultiIndex.from_product([["num1", "num2"], ["sum", "mean"]]), + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_agg_list_numeric_only_all_numeric(self): + """GroupBy with all numeric columns.""" + df = DataFrame( + { + "key": ["X", "Y", "X", "Y"], + "val1": [1, 2, 3, 4], + "val2": [10, 20, 30, 40], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + [[4, 2.0, 40, 20.0], [6, 3.0, 60, 30.0]], + index=pd.Index(["X", "Y"], name="key"), + columns=pd.MultiIndex.from_product([["val1", "val2"], ["sum", "mean"]]), + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "funcs,expected_func_names", + [ + (["sum", "mean"], ["sum", "mean"]), + 
([np.sum, np.mean], ["sum", "mean"]), + (["sum", np.mean], ["sum", "mean"]), + (["min", "max", "mean"], ["min", "max", "mean"]), + ], + ) + def test_groupby_agg_list_numeric_only_various_functions( + self, funcs, expected_func_names + ): + """Test GroupBy with different function combinations.""" + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "val": [1, 2, 3, 4], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key").agg(funcs, numeric_only=True) + + assert isinstance(result, DataFrame) + assert result.columns.levels[0].tolist() == ["val"] + assert result.columns.levels[1].tolist() == expected_func_names + assert result.shape == (2, len(funcs)) + + @pytest.mark.parametrize( + "group_cols", + [ + ["key1"], + ["key1", "key2"], + ], + ) + def test_groupby_agg_list_numeric_only_multiple_groups(self, group_cols): + """Test GroupBy with single and multiple grouping columns.""" + df = DataFrame( + { + "key1": ["A", "A", "B", "B"], + "key2": ["X", "Y", "X", "Y"], + "val": [1, 2, 3, 4], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby(group_cols).agg(["sum", "mean"], numeric_only=True) + + assert isinstance(result, DataFrame) + assert result.columns.levels[0].tolist() == ["val"] + assert result.columns.levels[1].tolist() == ["sum", "mean"] + + @pytest.mark.parametrize( + "data,expected_cols", + [ + # Int and float + ( + { + "key": ["A", "A", "B"], + "int": [1, 2, 3], + "float": [1.5, 2.5, 3.5], + "str": ["x", "y", "z"], + }, + ["int", "float"], + ), + # Only int + ( + {"key": ["A", "B", "A"], "num": [1, 2, 3], "text": ["a", "b", "c"]}, + ["num"], + ), + # Multiple numeric + ( + { + "key": ["A", "A"], + "n1": [1, 2], + "n2": [3, 4], + "n3": [5, 6], + "str": ["x", "y"], + }, + ["n1", "n2", "n3"], + ), + ], + ) + def test_groupby_agg_list_numeric_only_various_dtypes(self, data, expected_cols): + """Test GroupBy with various numeric column combinations.""" + df = DataFrame(data) + result = df.groupby("key").agg(["sum", "mean"], 
numeric_only=True) + + assert isinstance(result, DataFrame) + assert result.columns.levels[0].tolist() == expected_cols + assert result.columns.levels[1].tolist() == ["sum", "mean"] + + def test_groupby_agg_list_numeric_only_mixed_int_float(self): + """Test that both int and float columns are included in GroupBy.""" + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "int_col": [1, 2, 3, 4], + "float_col": [1.5, 2.5, 3.5, 4.5], + "str_col": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + [[3, 1.5, 4.0, 2.0], [7, 3.5, 8.0, 4.0]], + index=pd.Index(["A", "B"], name="key"), + columns=pd.MultiIndex.from_product( + [["int_col", "float_col"], ["sum", "mean"]] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_agg_list_numeric_only_preserves_column_order(self): + """Test that GroupBy preserves column order.""" + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "z_col": [1, 2, 3, 4], + "a_col": [10, 20, 30, 40], + "m_col": [100, 200, 300, 400], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key").agg(["sum"], numeric_only=True) + + assert result.columns.levels[0].tolist() == ["z_col", "a_col", "m_col"] + + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_groupby_agg_list_numeric_only_parameter_values(self, numeric_only): + """Test GroupBy with numeric_only=True and False.""" + df = DataFrame({"key": ["A", "A", "B", "B"], "val": [1, 2, 3, 4]}) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=numeric_only) + expected = DataFrame( + [[3, 1.5], [7, 3.5]], + index=pd.Index(["A", "B"], name="key"), + columns=pd.MultiIndex.from_product([["val"], ["sum", "mean"]]), + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_agg_list_numeric_only_single_group(self): + """Test GroupBy with a single group.""" + df = DataFrame( + { + "key": ["A", "A", "A"], + "val1": [1, 2, 3], + "val2": [10, 20, 30], + "text": ["x", "y", "z"], 
+ } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + [[6, 2.0, 60, 20.0]], + index=pd.Index(["A"], name="key"), + columns=pd.MultiIndex.from_product([["val1", "val2"], ["sum", "mean"]]), + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_agg_list_numeric_only_many_groups(self): + """Test GroupBy with many groups.""" + df = DataFrame( + { + "key": ["A", "B", "C", "D", "E"], + "val": [1, 2, 3, 4, 5], + "text": ["a", "b", "c", "d", "e"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) + + assert isinstance(result, DataFrame) + assert len(result) == 5 + assert result.columns.levels[0].tolist() == ["val"] + assert result.columns.levels[1].tolist() == ["sum", "mean"] + + # ========== NEW TESTS - Additional Edge Cases ========== + + def test_groupby_agg_list_numeric_only_with_nans(self): + """Test GroupBy with NaN values.""" + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "val": [1, np.nan, 3, 4], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) + + assert isinstance(result, DataFrame) + assert result.loc["A", ("val", "sum")] == 1.0 + assert result.loc["B", ("val", "sum")] == 7.0 + + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_agg_list_numeric_only_as_index(self, as_index): + """Test GroupBy with as_index parameter.""" + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "val": [1, 2, 3, 4], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key", as_index=as_index).agg( + ["sum", "mean"], numeric_only=True + ) + + if as_index: + assert result.index.name == "key" + else: + assert "key" in result.columns + + def test_groupby_agg_list_numeric_only_datetime_column(self): + """Test GroupBy with datetime columns excluded.""" + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "val": [1, 2, 3, 4], + "date": pd.date_range("2020-01-01", periods=4), + "text": ["a", "b", "c", 
"d"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + [[3, 1.5], [7, 3.5]], + index=pd.Index(["A", "B"], name="key"), + columns=pd.MultiIndex.from_product([["val"], ["sum", "mean"]]), + ) + tm.assert_frame_equal(result, expected)