From cbfe34a87b9d2df533710f05cacd414fade62dcb Mon Sep 17 00:00:00 2001 From: Aqib Ali Date: Wed, 22 Oct 2025 17:15:44 +0530 Subject: [PATCH 1/3] BUG: Fix numeric_only ignored with list of functions in agg (#49352) --- doc/source/whatsnew/v3.0.0.rst | 3 +- pandas/core/apply.py | 38 +++ .../apply/test_frame_apply_numeric_only.py | 242 ++++++++++++++++++ .../aggregate/test_aggregate_numeric_only.py | 228 +++++++++++++++++ 4 files changed, 510 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/apply/test_frame_apply_numeric_only.py create mode 100644 pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index eb938a7140e29..16ad5ee7586f9 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1144,6 +1144,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) - Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` where the end of window was not indexed correctly. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`) +- Bug in :meth:`DataFrameGroupBy.agg` where `numeric_only=True` was ignored when passing a list of aggregation functions (:issue:`49352`) Reshaping ^^^^^^^^^ @@ -1236,7 +1237,7 @@ Other - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`) - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) - +- Bug in :meth:`DataFrame.agg` where `numeric_only=True` was ignored when passing a list of aggregation functions, causing non-numeric columns to be included or raising TypeError (:issue:49352) .. ***DO NOT USE THIS SECTION*** - diff --git a/pandas/core/apply.py b/pandas/core/apply.py index b305cbfaa3a1e..13935447a5caa 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -835,6 +835,31 @@ def agg_or_apply_list_like( if getattr(obj, "axis", 0) == 1: raise NotImplementedError("axis other than 0 is not supported") + # GH#49352 - Handle numeric_only with list of functions + # When numeric_only=True is passed with a list of functions, filter + # to numeric columns before processing to avoid TypeError on non-numeric Series + if op_name == "agg" and kwargs.get("numeric_only", False): + # Check if obj is a DataFrame (not Series) with 2 dimensions + if isinstance(obj, ABCDataFrame) and obj.ndim == 2: + # Filter to numeric columns before processing + numeric_obj = obj.select_dtypes(include="number") + + # Only proceed if we have numeric columns + if not numeric_obj.empty: + # Create kwargs without numeric_only to avoid passing it to Series methods + kwargs_filtered = {k: v for k, v in kwargs.items() if k != "numeric_only"} + + # Compute with filtered object and cleaned kwargs + keys, results = self.compute_list_like(op_name, numeric_obj, kwargs_filtered) + result = self.wrap_results_list_like(keys, results) + return result + else: + # No numeric columns - return empty result + from pandas import DataFrame + # Get function names for index + keys = list(self.func) if is_list_like(self.func) else [] + return DataFrame(index=keys) + keys, results = self.compute_list_like(op_name, obj, kwargs) result = self.wrap_results_list_like(keys, results) return result @@ -1627,6 +1652,19 @@ def agg_or_apply_list_like( else: selected_obj = obj._obj_with_exclusions + # GH#49352 - Handle numeric_only with list of functions for GroupBy + # Filter to numeric columns before processing to avoid TypeError + if op_name == "agg" and kwargs.get("numeric_only", False): + # For GroupBy, filter the selected object to numeric columns + if selected_obj.ndim == 2: + numeric_obj = selected_obj.select_dtypes(include="number") + + if not numeric_obj.empty: + # Update selected_obj to filtered numeric columns + selected_obj = numeric_obj + # Remove numeric_only from kwargs to avoid passing to Series methods + kwargs = {k: v for k, v in kwargs.items() if k != "numeric_only"} + # Only set as_index=True on groupby objects, not Window or Resample # that inherit from this class. with com.temp_setattr( diff --git a/pandas/tests/apply/test_frame_apply_numeric_only.py b/pandas/tests/apply/test_frame_apply_numeric_only.py new file mode 100644 index 0000000000000..f69ec679382a3 --- /dev/null +++ b/pandas/tests/apply/test_frame_apply_numeric_only.py @@ -0,0 +1,242 @@ +""" +Tests for DataFrame.agg with numeric_only parameter and list of functions. +This tests the fix for GH#49352. +""" + +import numpy as np +import pytest +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +class TestFrameAggNumericOnly: + """Tests for DataFrame.agg with numeric_only parameter and list of functions.""" + + def test_agg_list_numeric_only_mixed_dtypes(self): + """GH#49352 - Main test case from the issue.""" + df = DataFrame({ + 'A': [1, 2, 3, 4, 5], + 'B': [10.5, 20.5, 30.5, 40.5, 50.5], + 'C': ['a', 'b', 'c', 'd', 'e'] + }) + result = df.agg(['min', 'max', 'mean'], numeric_only=True) + expected = DataFrame({ + 'A': [1.0, 5.0, 3.0], + 'B': [10.5, 50.5, 30.5] + }, index=['min', 'max', 'mean']) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_all_numeric(self): + """Should work when all columns are numeric.""" + df = DataFrame({ + 'A': [1, 2, 3], + 'B': [10, 20, 30] + }) + result = df.agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame({ + 'A': [6.0, 2.0], + 'B': [60.0, 20.0] + }, index=['sum', 'mean']) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_no_numeric(self): + """Should return empty DataFrame when no numeric columns.""" + df = DataFrame({ + 'A': ['a', 'b', 'c'], + 'B': ['x', 'y', 'z'] + }) + result = df.agg(['min', 'max'], numeric_only=True) + expected = DataFrame(index=['min', 'max']) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("funcs,expected_index", [ + (['sum', 'mean'], ['sum', 'mean']), + ([np.sum, np.mean], ['sum', 'mean']), + (['sum', np.mean], ['sum', 'mean']), + ([np.sum, 'mean'], ['sum', 'mean']), + ]) + def test_agg_list_numeric_only_various_function_types(self, funcs, expected_index): + """Test with different combinations of string and numpy functions.""" + df = DataFrame({ + 'A': [1, 2, 3], + 'B': [10, 20, 30], + 'C': ['a', 'b', 'c'] + }) + result = df.agg(funcs, numeric_only=True) + expected = DataFrame({ + 'A': [6.0, 2.0], + 'B': [60.0, 20.0] + }, index=expected_index) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("funcs", [ + ['min', 'max'], + ['sum', 'mean', 'std'], + ['min', 'max', 'mean', 'median'], + ]) + def test_agg_list_numeric_only_different_function_counts(self, funcs): + """Test with different numbers of functions.""" + df = DataFrame({ + 'A': [1, 2, 3, 4, 5], + 'B': [10, 20, 30, 40, 50], + 'C': ['a', 'b', 'c', 'd', 'e'] + }) + result = df.agg(funcs, numeric_only=True) + + # Verify structure + assert isinstance(result, DataFrame) + assert list(result.columns) == ['A', 'B'] + assert list(result.index) == funcs + assert result.shape == (len(funcs), 2) + + @pytest.mark.parametrize("data,expected_cols", [ + # Only integers + ({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['x', 'y', 'z']}, ['A', 'B']), + # Only floats + ({'A': [1.1, 2.2], 'B': [3.3, 4.4], 'C': ['x', 'y']}, ['A', 'B']), + # Mix of int and float + ({'int': [1, 2], 'float': [1.5, 2.5], 'str': ['a', 'b']}, ['int', 'float']), + # Single numeric column + ({'num': [1, 2, 3], 'text': ['a', 'b', 'c']}, ['num']), + ]) + def test_agg_list_numeric_only_various_dtypes(self, data, expected_cols): + """Test with various numeric dtype combinations.""" + df = DataFrame(data) + result = df.agg(['sum', 'mean'], numeric_only=True) + + assert isinstance(result, DataFrame) + assert list(result.columns) == expected_cols + assert list(result.index) == ['sum', 'mean'] + + @pytest.mark.parametrize("numeric_only", [True, False, None]) + def test_agg_list_numeric_only_parameter_values(self, numeric_only): + """Test with different numeric_only parameter values.""" + df = DataFrame({ + 'A': [1, 2, 3], + 'B': [10, 20, 30] + }) + + if numeric_only is None: + result = df.agg(['sum', 'mean']) + else: + result = df.agg(['sum', 'mean'], numeric_only=numeric_only) + + expected = DataFrame({ + 'A': [6, 2.0], + 'B': [60, 20.0] + }, index=['sum', 'mean']) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_false_with_strings(self): + """Verify numeric_only=False works with min/max on strings.""" + df = DataFrame({ + 'A': [1, 2, 3], + 'B': ['a', 'b', 'c'] + }) + result = df.agg(['min', 'max'], numeric_only=False) + expected = DataFrame({ + 'A': [1, 3], + 'B': ['a', 'c'] + }, index=['min', 'max']) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_preserves_column_order(self): + """Test that column order is preserved.""" + df = DataFrame({ + 'Z': [1, 2, 3], + 'A': [10, 20, 30], + 'M': [100, 200, 300], + 'text': ['a', 'b', 'c'] + }) + result = df.agg(['sum', 'mean'], numeric_only=True) + + assert list(result.columns) == ['Z', 'A', 'M'] + + @pytest.mark.parametrize("single_func", ['sum', 'mean', 'min', 'max']) + def test_agg_single_function_still_works(self, single_func): + """Verify that single function (not a list) still works.""" + df = DataFrame({ + 'A': [1, 2, 3], + 'B': [10, 20, 30], + 'C': ['a', 'b', 'c'] + }) + result = df.agg(single_func, numeric_only=True) + + assert isinstance(result, pd.Series) + assert 'A' in result.index + assert 'B' in result.index + assert 'C' not in result.index + + def test_agg_list_numeric_only_with_int_and_float(self): + """Test that both int and float columns are included.""" + df = DataFrame({ + 'int_col': [1, 2, 3], + 'float_col': [1.5, 2.5, 3.5], + 'str_col': ['a', 'b', 'c'] + }) + result = df.agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame({ + 'int_col': [6.0, 2.0], + 'float_col': [7.5, 2.5] + }, index=['sum', 'mean']) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_single_row(self): + """Test with single row DataFrame.""" + df = DataFrame({ + 'A': [1], + 'B': [10], + 'C': ['x'] + }) + result = df.agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame({ + 'A': [1.0, 1.0], + 'B': [10.0, 10.0] + }, index=['sum', 'mean']) + tm.assert_frame_equal(result, expected) + + # ========== NEW TESTS - Additional Edge Cases ========== + + def test_agg_list_numeric_only_with_nans(self): + """Test DataFrame with NaN values.""" + df = DataFrame({ + 'A': [1, np.nan, 3], + 'B': [10, 20, np.nan], + 'C': ['x', 'y', 'z'] + }) + result = df.agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame({ + 'A': [4.0, 2.0], + 'B': [30.0, 15.0] + }, index=['sum', 'mean']) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_with_datetime(self): + """Test that datetime columns are excluded with numeric_only=True.""" + df = DataFrame({ + 'num': [1, 2, 3], + 'date': pd.date_range('2020-01-01', periods=3), + 'text': ['a', 'b', 'c'] + }) + result = df.agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame({ + 'num': [6.0, 2.0] + }, index=['sum', 'mean']) + tm.assert_frame_equal(result, expected) + + def test_agg_list_numeric_only_large_dataframe(self): + """Test with a larger DataFrame for performance verification.""" + np.random.seed(42) + df = DataFrame({ + 'A': np.random.randint(1, 100, 1000), + 'B': np.random.randn(1000), + 'C': ['text'] * 1000 + }) + result = df.agg(['sum', 'mean', 'std'], numeric_only=True) + + # Just verify structure, not exact values due to randomness + assert isinstance(result, DataFrame) + assert list(result.columns) == ['A', 'B'] + assert list(result.index) == ['sum', 'mean', 'std'] + assert result.shape == (3, 2) diff --git a/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py b/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py new file mode 100644 index 0000000000000..dcad9cf6f283e --- /dev/null +++ b/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py @@ -0,0 +1,228 @@ +""" +Tests for GroupBy.agg with numeric_only parameter and list of functions. +This tests the GroupBy part of the fix for GH#49352. +""" + +import numpy as np +import pytest +import pandas as pd +from pandas import DataFrame +import pandas._testing as tm + + +class TestGroupByAggNumericOnly: + """Tests for GroupBy.agg with numeric_only parameter and list of functions.""" + + def test_groupby_agg_list_numeric_only_basic(self): + """GH#49352 - Basic GroupBy aggregation with mixed dtypes.""" + df = DataFrame({ + 'key': ['A', 'B', 'A', 'B', 'A'], + 'num1': [1, 2, 3, 4, 5], + 'num2': [10, 20, 30, 40, 50], + 'text': ['a', 'b', 'c', 'd', 'e'] + }) + result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame( + [[9, 3.0, 90, 30.0], + [6, 3.0, 60, 30.0]], + index=pd.Index(['A', 'B'], name='key'), + columns=pd.MultiIndex.from_product([['num1', 'num2'], ['sum', 'mean']]) + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_agg_list_numeric_only_all_numeric(self): + """GroupBy with all numeric columns.""" + df = DataFrame({ + 'key': ['X', 'Y', 'X', 'Y'], + 'val1': [1, 2, 3, 4], + 'val2': [10, 20, 30, 40] + }) + result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame( + [[4, 2.0, 40, 20.0], + [6, 3.0, 60, 30.0]], + index=pd.Index(['X', 'Y'], name='key'), + columns=pd.MultiIndex.from_product([['val1', 'val2'], ['sum', 'mean']]) + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("funcs,expected_func_names", [ + (['sum', 'mean'], ['sum', 'mean']), + ([np.sum, np.mean], ['sum', 'mean']), + (['sum', np.mean], ['sum', 'mean']), + (['min', 'max', 'mean'], ['min', 'max', 'mean']), + ]) + def test_groupby_agg_list_numeric_only_various_functions(self, funcs, expected_func_names): + """Test GroupBy with different function combinations.""" + df = DataFrame({ + 'key': ['A', 'A', 'B', 'B'], + 'val': [1, 2, 3, 4], + 'text': ['a', 'b', 'c', 'd'] + }) + result = df.groupby('key').agg(funcs, numeric_only=True) + + assert isinstance(result, DataFrame) + assert result.columns.levels[0].tolist() == ['val'] + assert result.columns.levels[1].tolist() == expected_func_names + assert result.shape == (2, len(funcs)) + + @pytest.mark.parametrize("group_cols", [ + ['key1'], + ['key1', 'key2'], + ]) + def test_groupby_agg_list_numeric_only_multiple_groups(self, group_cols): + """Test GroupBy with single and multiple grouping columns.""" + df = DataFrame({ + 'key1': ['A', 'A', 'B', 'B'], + 'key2': ['X', 'Y', 'X', 'Y'], + 'val': [1, 2, 3, 4], + 'text': ['a', 'b', 'c', 'd'] + }) + result = df.groupby(group_cols).agg(['sum', 'mean'], numeric_only=True) + + assert isinstance(result, DataFrame) + assert result.columns.levels[0].tolist() == ['val'] + assert result.columns.levels[1].tolist() == ['sum', 'mean'] + + @pytest.mark.parametrize("data,expected_cols", [ + # Int and float + ({'key': ['A', 'A', 'B'], 'int': [1, 2, 3], 'float': [1.5, 2.5, 3.5], 'str': ['x', 'y', 'z']}, + ['int', 'float']), + # Only int + ({'key': ['A', 'B', 'A'], 'num': [1, 2, 3], 'text': ['a', 'b', 'c']}, + ['num']), + # Multiple numeric + ({'key': ['A', 'A'], 'n1': [1, 2], 'n2': [3, 4], 'n3': [5, 6], 'str': ['x', 'y']}, + ['n1', 'n2', 'n3']), + ]) + def test_groupby_agg_list_numeric_only_various_dtypes(self, data, expected_cols): + """Test GroupBy with various numeric column combinations.""" + df = DataFrame(data) + result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + + assert isinstance(result, DataFrame) + assert result.columns.levels[0].tolist() == expected_cols + assert result.columns.levels[1].tolist() == ['sum', 'mean'] + + def test_groupby_agg_list_numeric_only_mixed_int_float(self): + """Test that both int and float columns are included in GroupBy.""" + df = DataFrame({ + 'key': ['A', 'A', 'B', 'B'], + 'int_col': [1, 2, 3, 4], + 'float_col': [1.5, 2.5, 3.5, 4.5], + 'str_col': ['a', 'b', 'c', 'd'] + }) + result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame( + [[3, 1.5, 4.0, 2.0], + [7, 3.5, 8.0, 4.0]], + index=pd.Index(['A', 'B'], name='key'), + columns=pd.MultiIndex.from_product([['int_col', 'float_col'], ['sum', 'mean']]) + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_agg_list_numeric_only_preserves_column_order(self): + """Test that GroupBy preserves column order.""" + df = DataFrame({ + 'key': ['A', 'A', 'B', 'B'], + 'z_col': [1, 2, 3, 4], + 'a_col': [10, 20, 30, 40], + 'm_col': [100, 200, 300, 400], + 'text': ['a', 'b', 'c', 'd'] + }) + result = df.groupby('key').agg(['sum'], numeric_only=True) + + assert result.columns.levels[0].tolist() == ['z_col', 'a_col', 'm_col'] + + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_groupby_agg_list_numeric_only_parameter_values(self, numeric_only): + """Test GroupBy with numeric_only=True and False.""" + df = DataFrame({ + 'key': ['A', 'A', 'B', 'B'], + 'val': [1, 2, 3, 4] + }) + result = df.groupby('key').agg(['sum', 'mean'], numeric_only=numeric_only) + expected = DataFrame( + [[3, 1.5], + [7, 3.5]], + index=pd.Index(['A', 'B'], name='key'), + columns=pd.MultiIndex.from_product([['val'], ['sum', 'mean']]) + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_agg_list_numeric_only_single_group(self): + """Test GroupBy with a single group.""" + df = DataFrame({ + 'key': ['A', 'A', 'A'], + 'val1': [1, 2, 3], + 'val2': [10, 20, 30], + 'text': ['x', 'y', 'z'] + }) + result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame( + [[6, 2.0, 60, 20.0]], + index=pd.Index(['A'], name='key'), + columns=pd.MultiIndex.from_product([['val1', 'val2'], ['sum', 'mean']]) + ) + tm.assert_frame_equal(result, expected) + + def test_groupby_agg_list_numeric_only_many_groups(self): + """Test GroupBy with many groups.""" + df = DataFrame({ + 'key': ['A', 'B', 'C', 'D', 'E'], + 'val': [1, 2, 3, 4, 5], + 'text': ['a', 'b', 'c', 'd', 'e'] + }) + result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + + assert isinstance(result, DataFrame) + assert len(result) == 5 + assert result.columns.levels[0].tolist() == ['val'] + assert result.columns.levels[1].tolist() == ['sum', 'mean'] + + # ========== NEW TESTS - Additional Edge Cases ========== + + def test_groupby_agg_list_numeric_only_with_nans(self): + """Test GroupBy with NaN values.""" + df = DataFrame({ + 'key': ['A', 'A', 'B', 'B'], + 'val': [1, np.nan, 3, 4], + 'text': ['a', 'b', 'c', 'd'] + }) + result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + + assert isinstance(result, DataFrame) + assert result.loc['A', ('val', 'sum')] == 1.0 + assert result.loc['B', ('val', 'sum')] == 7.0 + + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_agg_list_numeric_only_as_index(self, as_index): + """Test GroupBy with as_index parameter.""" + df = DataFrame({ + 'key': ['A', 'A', 'B', 'B'], + 'val': [1, 2, 3, 4], + 'text': ['a', 'b', 'c', 'd'] + }) + result = df.groupby('key', as_index=as_index).agg(['sum', 'mean'], numeric_only=True) + + if as_index: + assert result.index.name == 'key' + else: + assert 'key' in result.columns + + def test_groupby_agg_list_numeric_only_datetime_column(self): + """Test GroupBy with datetime columns excluded.""" + df = DataFrame({ + 'key': ['A', 'A', 'B', 'B'], + 'val': [1, 2, 3, 4], + 'date': pd.date_range('2020-01-01', periods=4), + 'text': ['a', 'b', 'c', 'd'] + }) + result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + expected = DataFrame( + [[3, 1.5], + [7, 3.5]], + index=pd.Index(['A', 'B'], name='key'), + columns=pd.MultiIndex.from_product([['val'], ['sum', 'mean']]) + ) + tm.assert_frame_equal(result, expected) \ No newline at end of file From cc4b0d4f91caad06cda4da22e1e07568670c0586 Mon Sep 17 00:00:00 2001 From: Aqib Ali Date: Thu, 23 Oct 2025 10:53:22 +0530 Subject: [PATCH 2/3] Fix pre-commit.ci errors --- doc/source/whatsnew/v3.0.0.rst | 5 +- pandas/core/apply.py | 9 +- .../apply/test_frame_apply_numeric_only.py | 277 +++++++-------- .../aggregate/test_aggregate_numeric_only.py | 319 ++++++++++-------- 4 files changed, 322 insertions(+), 288 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 81a41f0a015f2..bf5c99e7d938b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1150,7 +1150,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) - Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` where the end of window was not indexed correctly. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`) -- Bug in :meth:`DataFrameGroupBy.agg` where `numeric_only=True` was ignored when passing a list of aggregation functions (:issue:`49352`) +- Bug in :meth:`DataFrameGroupBy.agg` where ``numeric_only=True`` was ignored when passing a list of aggregation functions (:issue:`49352`) Reshaping ^^^^^^^^^ @@ -1204,6 +1204,7 @@ Other - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) - Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) +- Bug in :meth:`DataFrame.agg` where ``numeric_only=True`` was ignored when passing a list of aggregation functions, causing non-numeric columns to be included or raising TypeError (:issue:`49352`) - Bug in :meth:`DataFrame.apply` raising ``RecursionError`` when passing ``func=list[int]``. (:issue:`61565`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) @@ -1243,7 +1244,7 @@ Other - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`) - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`) -- Bug in :meth:`DataFrame.agg` where `numeric_only=True` was ignored when passing a list of aggregation functions, causing non-numeric columns to be included or raising TypeError (:issue:49352) + .. ***DO NOT USE THIS SECTION*** - diff --git a/pandas/core/apply.py b/pandas/core/apply.py index eaec3d4ef3a82..f2baf1ad533eb 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -849,15 +849,20 @@ def agg_or_apply_list_like( # Only proceed if we have numeric columns if not numeric_obj.empty: # Create kwargs without numeric_only to avoid passing it to Series methods - kwargs_filtered = {k: v for k, v in kwargs.items() if k != "numeric_only"} + kwargs_filtered = { + k: v for k, v in kwargs.items() if k != "numeric_only" + } # Compute with filtered object and cleaned kwargs - keys, results = self.compute_list_like(op_name, numeric_obj, kwargs_filtered) + keys, results = self.compute_list_like( + op_name, numeric_obj, kwargs_filtered + ) result = self.wrap_results_list_like(keys, results) return result else: # No numeric columns - return empty result from pandas import DataFrame + # Get function names for index keys = list(self.func) if is_list_like(self.func) else [] return DataFrame(index=keys) diff --git a/pandas/tests/apply/test_frame_apply_numeric_only.py b/pandas/tests/apply/test_frame_apply_numeric_only.py index f69ec679382a3..257d6dbc1db08 100644 --- a/pandas/tests/apply/test_frame_apply_numeric_only.py +++ b/pandas/tests/apply/test_frame_apply_numeric_only.py @@ -5,6 +5,7 @@ import numpy as np import pytest + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -15,228 +16,208 @@ class TestFrameAggNumericOnly: def test_agg_list_numeric_only_mixed_dtypes(self): """GH#49352 - Main test case from the issue.""" - df = DataFrame({ - 'A': [1, 2, 3, 4, 5], - 'B': [10.5, 20.5, 30.5, 40.5, 50.5], - 'C': ['a', 'b', 'c', 'd', 'e'] - }) - result = df.agg(['min', 'max', 'mean'], numeric_only=True) - expected = DataFrame({ - 'A': [1.0, 5.0, 3.0], - 'B': [10.5, 50.5, 30.5] - }, index=['min', 'max', 'mean']) + df = DataFrame( + { + "A": [1, 2, 3, 4, 5], + "B": [10.5, 20.5, 30.5, 40.5, 50.5], + "C": ["a", "b", "c", "d", "e"], + } + ) + result = df.agg(["min", "max", "mean"], numeric_only=True) + expected = DataFrame( + {"A": [1.0, 5.0, 3.0], "B": [10.5, 50.5, 30.5]}, + index=["min", "max", "mean"], + ) tm.assert_frame_equal(result, expected) def test_agg_list_numeric_only_all_numeric(self): """Should work when all columns are numeric.""" - df = DataFrame({ - 'A': [1, 2, 3], - 'B': [10, 20, 30] - }) - result = df.agg(['sum', 'mean'], numeric_only=True) - expected = DataFrame({ - 'A': [6.0, 2.0], - 'B': [60.0, 20.0] - }, index=['sum', 'mean']) + df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + {"A": [6.0, 2.0], "B": [60.0, 20.0]}, index=["sum", "mean"] + ) tm.assert_frame_equal(result, expected) def test_agg_list_numeric_only_no_numeric(self): """Should return empty DataFrame when no numeric columns.""" - df = DataFrame({ - 'A': ['a', 'b', 'c'], - 'B': ['x', 'y', 'z'] - }) - result = df.agg(['min', 'max'], numeric_only=True) - expected = DataFrame(index=['min', 'max']) + df = DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) + result = df.agg(["min", "max"], numeric_only=True) + expected = DataFrame(index=["min", "max"]) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("funcs,expected_index", [ - (['sum', 'mean'], ['sum', 'mean']), - ([np.sum, np.mean], ['sum', 'mean']), - (['sum', np.mean], ['sum', 'mean']), - ([np.sum, 'mean'], ['sum', 'mean']), - ]) + @pytest.mark.parametrize( + "funcs,expected_index", + [ + (["sum", "mean"], ["sum", "mean"]), + ([np.sum, np.mean], ["sum", "mean"]), + (["sum", np.mean], ["sum", "mean"]), + ([np.sum, "mean"], ["sum", "mean"]), + ], + ) def test_agg_list_numeric_only_various_function_types(self, funcs, expected_index): """Test with different combinations of string and numpy functions.""" - df = DataFrame({ - 'A': [1, 2, 3], - 'B': [10, 20, 30], - 'C': ['a', 'b', 'c'] - }) + df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30], "C": ["a", "b", "c"]}) result = df.agg(funcs, numeric_only=True) - expected = DataFrame({ - 'A': [6.0, 2.0], - 'B': [60.0, 20.0] - }, index=expected_index) + expected = DataFrame({"A": [6.0, 2.0], "B": [60.0, 20.0]}, index=expected_index) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("funcs", [ - ['min', 'max'], - ['sum', 'mean', 'std'], - ['min', 'max', 'mean', 'median'], - ]) + @pytest.mark.parametrize( + "funcs", + [ + ["min", "max"], + ["sum", "mean", "std"], + ["min", "max", "mean", "median"], + ], + ) def test_agg_list_numeric_only_different_function_counts(self, funcs): """Test with different numbers of functions.""" - df = DataFrame({ - 'A': [1, 2, 3, 4, 5], - 'B': [10, 20, 30, 40, 50], - 'C': ['a', 'b', 'c', 'd', 'e'] - }) + df = DataFrame( + { + "A": [1, 2, 3, 4, 5], + "B": [10, 20, 30, 40, 50], + "C": ["a", "b", "c", "d", "e"], + } + ) result = df.agg(funcs, numeric_only=True) # Verify structure assert isinstance(result, DataFrame) - assert list(result.columns) == ['A', 'B'] + assert list(result.columns) == ["A", "B"] assert list(result.index) == funcs assert result.shape == (len(funcs), 2) - @pytest.mark.parametrize("data,expected_cols", [ - # Only integers - ({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['x', 'y', 'z']}, ['A', 'B']), - # Only floats - ({'A': [1.1, 2.2], 'B': [3.3, 4.4], 'C': ['x', 'y']}, ['A', 'B']), - # Mix of int and float - ({'int': [1, 2], 'float': [1.5, 2.5], 'str': ['a', 'b']}, ['int', 'float']), - # Single numeric column - ({'num': [1, 2, 3], 'text': ['a', 'b', 'c']}, ['num']), - ]) + @pytest.mark.parametrize( + "data,expected_cols", + [ + # Only integers + ({"A": [1, 2, 3], "B": [4, 5, 6], "C": ["x", "y", "z"]}, ["A", "B"]), + # Only floats + ({"A": [1.1, 2.2], "B": [3.3, 4.4], "C": ["x", "y"]}, ["A", "B"]), + # Mix of int and float + ({"int": [1, 2], "float": [1.5, 2.5], "str": ["a", "b"]}, ["int", "float"]), + # Single numeric column + ({"num": [1, 2, 3], "text": ["a", "b", "c"]}, ["num"]), + ], + ) def test_agg_list_numeric_only_various_dtypes(self, data, expected_cols): """Test with various numeric dtype combinations.""" df = DataFrame(data) - result = df.agg(['sum', 'mean'], numeric_only=True) + result = df.agg(["sum", "mean"], numeric_only=True) assert isinstance(result, DataFrame) assert list(result.columns) == expected_cols - assert list(result.index) == ['sum', 'mean'] + assert list(result.index) == ["sum", "mean"] @pytest.mark.parametrize("numeric_only", [True, False, None]) def test_agg_list_numeric_only_parameter_values(self, numeric_only): """Test with different numeric_only parameter values.""" - df = DataFrame({ - 'A': [1, 2, 3], - 'B': [10, 20, 30] - }) + df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) if numeric_only is None: - result = df.agg(['sum', 'mean']) + result = df.agg(["sum", "mean"]) else: - result = df.agg(['sum', 'mean'], numeric_only=numeric_only) + result = df.agg(["sum", "mean"], numeric_only=numeric_only) - expected = DataFrame({ - 'A': [6, 2.0], - 'B': [60, 20.0] - }, index=['sum', 'mean']) + expected = DataFrame({"A": [6, 2.0], "B": [60, 20.0]}, index=["sum", "mean"]) tm.assert_frame_equal(result, expected) def test_agg_list_numeric_only_false_with_strings(self): """Verify numeric_only=False works with min/max on strings.""" - df = DataFrame({ - 'A': [1, 2, 3], - 'B': ['a', 'b', 'c'] - }) - result = df.agg(['min', 'max'], numeric_only=False) - expected = DataFrame({ - 'A': [1, 3], - 'B': ['a', 'c'] - }, index=['min', 'max']) + df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}) + result = df.agg(["min", "max"], numeric_only=False) + expected = DataFrame({"A": [1, 3], "B": ["a", "c"]}, index=["min", "max"]) tm.assert_frame_equal(result, expected) def test_agg_list_numeric_only_preserves_column_order(self): """Test that column order is preserved.""" - df = DataFrame({ - 'Z': [1, 2, 3], - 'A': [10, 20, 30], - 'M': [100, 200, 300], - 'text': ['a', 'b', 'c'] - }) - result = df.agg(['sum', 'mean'], numeric_only=True) - - assert list(result.columns) == ['Z', 'A', 'M'] - - @pytest.mark.parametrize("single_func", ['sum', 'mean', 'min', 'max']) + df = DataFrame( + { + "Z": [1, 2, 3], + "A": [10, 20, 30], + "M": [100, 200, 300], + "text": ["a", "b", "c"], + } + ) + result = df.agg(["sum", "mean"], numeric_only=True) + + assert list(result.columns) == ["Z", "A", "M"] + + @pytest.mark.parametrize("single_func", ["sum", "mean", "min", "max"]) def test_agg_single_function_still_works(self, single_func): """Verify that single function (not a list) still works.""" - df = DataFrame({ - 'A': [1, 2, 3], - 'B': [10, 20, 30], - 'C': ['a', 'b', 'c'] - }) + df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30], "C": ["a", "b", "c"]}) result = df.agg(single_func, numeric_only=True) assert isinstance(result, pd.Series) - assert 'A' in result.index - assert 'B' in result.index - assert 'C' not in result.index + assert "A" in result.index + assert "B" in result.index + assert "C" not in result.index def test_agg_list_numeric_only_with_int_and_float(self): """Test that both int and float columns are included.""" - df = DataFrame({ - 'int_col': [1, 2, 3], - 'float_col': [1.5, 2.5, 3.5], - 'str_col': ['a', 'b', 'c'] - }) - result = df.agg(['sum', 'mean'], numeric_only=True) - expected = DataFrame({ - 'int_col': [6.0, 2.0], - 'float_col': [7.5, 2.5] - }, index=['sum', 'mean']) + df = DataFrame( + { + "int_col": [1, 2, 3], + "float_col": [1.5, 2.5, 3.5], + "str_col": ["a", "b", "c"], + } + ) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + {"int_col": [6.0, 2.0], "float_col": [7.5, 2.5]}, index=["sum", "mean"] + ) tm.assert_frame_equal(result, expected) def test_agg_list_numeric_only_single_row(self): """Test with single row DataFrame.""" - df = DataFrame({ - 'A': [1], - 'B': [10], - 'C': ['x'] - }) - result = df.agg(['sum', 'mean'], numeric_only=True) - expected = DataFrame({ - 'A': [1.0, 1.0], - 'B': [10.0, 10.0] - }, index=['sum', 'mean']) + df = DataFrame({"A": [1], "B": [10], "C": ["x"]}) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + {"A": [1.0, 1.0], "B": [10.0, 10.0]}, index=["sum", "mean"] + ) tm.assert_frame_equal(result, expected) # ========== NEW TESTS - Additional Edge Cases ========== def test_agg_list_numeric_only_with_nans(self): """Test DataFrame with NaN values.""" - df = DataFrame({ - 'A': [1, np.nan, 3], - 'B': [10, 20, np.nan], - 'C': ['x', 'y', 'z'] - }) - result = df.agg(['sum', 'mean'], numeric_only=True) - expected = DataFrame({ - 'A': [4.0, 2.0], - 'B': [30.0, 15.0] - }, index=['sum', 'mean']) + df = DataFrame( + {"A": [1, np.nan, 3], "B": [10, 20, np.nan], "C": ["x", "y", "z"]} + ) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame( + {"A": [4.0, 2.0], "B": [30.0, 15.0]}, index=["sum", "mean"] + ) tm.assert_frame_equal(result, expected) def test_agg_list_numeric_only_with_datetime(self): """Test that datetime columns are excluded with numeric_only=True.""" - df = DataFrame({ - 'num': [1, 2, 3], - 'date': pd.date_range('2020-01-01', periods=3), - 'text': ['a', 'b', 'c'] - }) - result = df.agg(['sum', 'mean'], numeric_only=True) - expected = DataFrame({ - 'num': [6.0, 2.0] - }, index=['sum', 'mean']) + df = DataFrame( + { + "num": [1, 2, 3], + "date": pd.date_range("2020-01-01", periods=3), + "text": ["a", "b", "c"], + } + ) + result = df.agg(["sum", "mean"], numeric_only=True) + expected = DataFrame({"num": [6.0, 2.0]}, index=["sum", "mean"]) tm.assert_frame_equal(result, expected) def test_agg_list_numeric_only_large_dataframe(self): """Test with a larger DataFrame for performance verification.""" np.random.seed(42) - df = DataFrame({ - 'A': np.random.randint(1, 100, 1000), - 'B': np.random.randn(1000), - 'C': ['text'] * 1000 - }) - result = df.agg(['sum', 'mean', 'std'], numeric_only=True) + df = DataFrame( + { + "A": np.random.randint(1, 100, 1000), + "B": np.random.randn(1000), + "C": ["text"] * 1000, + } + ) + result = df.agg(["sum", "mean", "std"], numeric_only=True) # Just verify structure, not exact values due to randomness assert isinstance(result, DataFrame) - assert list(result.columns) == ['A', 'B'] - assert list(result.index) == ['sum', 'mean', 'std'] + assert list(result.columns) == ["A", "B"] + assert list(result.index) == ["sum", "mean", "std"] assert result.shape == (3, 2) diff --git a/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py b/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py index dcad9cf6f283e..efe1735df78db 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py +++ b/pandas/tests/groupby/aggregate/test_aggregate_numeric_only.py @@ -5,6 +5,7 @@ import numpy as np import pytest + import pandas as pd from pandas import DataFrame import pandas._testing as tm @@ -15,214 +16,260 @@ class TestGroupByAggNumericOnly: def test_groupby_agg_list_numeric_only_basic(self): """GH#49352 - Basic GroupBy aggregation with mixed dtypes.""" - df = DataFrame({ - 'key': ['A', 'B', 'A', 'B', 'A'], - 'num1': [1, 2, 3, 4, 5], - 'num2': [10, 20, 30, 40, 50], - 'text': ['a', 'b', 'c', 'd', 'e'] - }) - result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + df = DataFrame( + { + "key": ["A", "B", "A", "B", "A"], + "num1": [1, 2, 3, 4, 5], + "num2": [10, 20, 30, 40, 50], + "text": ["a", "b", "c", "d", "e"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) expected = DataFrame( - [[9, 3.0, 90, 30.0], - [6, 3.0, 60, 30.0]], - index=pd.Index(['A', 'B'], name='key'), - columns=pd.MultiIndex.from_product([['num1', 'num2'], ['sum', 'mean']]) + [[9, 3.0, 90, 30.0], [6, 3.0, 60, 30.0]], + index=pd.Index(["A", "B"], name="key"), + columns=pd.MultiIndex.from_product([["num1", "num2"], ["sum", "mean"]]), ) tm.assert_frame_equal(result, expected) def test_groupby_agg_list_numeric_only_all_numeric(self): """GroupBy with all numeric columns.""" - df = DataFrame({ - 'key': ['X', 'Y', 'X', 'Y'], - 'val1': [1, 2, 3, 4], - 'val2': [10, 20, 30, 40] - }) - result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + df = DataFrame( + { + "key": ["X", "Y", "X", "Y"], + "val1": [1, 2, 3, 4], + "val2": [10, 20, 30, 40], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) expected = DataFrame( - [[4, 2.0, 40, 20.0], - [6, 3.0, 60, 30.0]], - index=pd.Index(['X', 'Y'], name='key'), - columns=pd.MultiIndex.from_product([['val1', 'val2'], ['sum', 'mean']]) + [[4, 2.0, 40, 20.0], [6, 3.0, 60, 30.0]], + index=pd.Index(["X", "Y"], name="key"), + columns=pd.MultiIndex.from_product([["val1", "val2"], ["sum", "mean"]]), ) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize("funcs,expected_func_names", [ - (['sum', 'mean'], ['sum', 'mean']), - ([np.sum, np.mean], ['sum', 'mean']), - (['sum', np.mean], ['sum', 'mean']), - (['min', 'max', 'mean'], ['min', 'max', 'mean']), - ]) - def test_groupby_agg_list_numeric_only_various_functions(self, funcs, expected_func_names): + @pytest.mark.parametrize( + "funcs,expected_func_names", + [ + (["sum", "mean"], ["sum", "mean"]), + ([np.sum, np.mean], ["sum", "mean"]), + (["sum", np.mean], ["sum", "mean"]), + (["min", "max", "mean"], ["min", "max", "mean"]), + ], + ) + def test_groupby_agg_list_numeric_only_various_functions( + self, funcs, expected_func_names + ): """Test GroupBy with different function combinations.""" - df = DataFrame({ - 'key': ['A', 'A', 'B', 'B'], - 'val': [1, 2, 3, 4], - 'text': ['a', 'b', 'c', 'd'] - }) - result = df.groupby('key').agg(funcs, numeric_only=True) + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "val": [1, 2, 3, 4], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key").agg(funcs, numeric_only=True) assert isinstance(result, DataFrame) - assert result.columns.levels[0].tolist() == ['val'] + assert result.columns.levels[0].tolist() == ["val"] assert result.columns.levels[1].tolist() == expected_func_names assert result.shape == (2, len(funcs)) - @pytest.mark.parametrize("group_cols", [ - ['key1'], - ['key1', 'key2'], - ]) + @pytest.mark.parametrize( + "group_cols", + [ + ["key1"], + ["key1", "key2"], + ], + ) def test_groupby_agg_list_numeric_only_multiple_groups(self, group_cols): """Test GroupBy with single and multiple grouping columns.""" - df = DataFrame({ - 'key1': ['A', 'A', 'B', 'B'], - 'key2': ['X', 'Y', 'X', 'Y'], - 'val': [1, 2, 3, 4], - 'text': ['a', 'b', 'c', 'd'] - }) - result = df.groupby(group_cols).agg(['sum', 'mean'], numeric_only=True) + df = DataFrame( + { + "key1": ["A", "A", "B", "B"], + "key2": ["X", "Y", "X", "Y"], + "val": [1, 2, 3, 4], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby(group_cols).agg(["sum", "mean"], numeric_only=True) assert isinstance(result, DataFrame) - assert result.columns.levels[0].tolist() == ['val'] - assert result.columns.levels[1].tolist() == ['sum', 'mean'] - - @pytest.mark.parametrize("data,expected_cols", [ - # Int and float - ({'key': ['A', 'A', 'B'], 'int': [1, 2, 3], 'float': [1.5, 2.5, 3.5], 'str': ['x', 'y', 'z']}, - ['int', 'float']), - # Only int - ({'key': ['A', 'B', 'A'], 'num': [1, 2, 3], 'text': ['a', 'b', 'c']}, - ['num']), - # Multiple numeric - ({'key': ['A', 'A'], 'n1': [1, 2], 'n2': [3, 4], 'n3': [5, 6], 'str': ['x', 'y']}, - ['n1', 'n2', 'n3']), - ]) + assert result.columns.levels[0].tolist() == ["val"] + assert result.columns.levels[1].tolist() == ["sum", "mean"] + + @pytest.mark.parametrize( + "data,expected_cols", + [ + # Int and float + ( + { + "key": ["A", "A", "B"], + "int": [1, 2, 3], + "float": [1.5, 2.5, 3.5], + "str": ["x", "y", "z"], + }, + ["int", "float"], + ), + # Only int + ( + {"key": ["A", "B", "A"], "num": [1, 2, 3], "text": ["a", "b", "c"]}, + ["num"], + ), + # Multiple numeric + ( + { + "key": ["A", "A"], + "n1": [1, 2], + "n2": [3, 4], + "n3": [5, 6], + "str": ["x", "y"], + }, + ["n1", "n2", "n3"], + ), + ], + ) def test_groupby_agg_list_numeric_only_various_dtypes(self, data, expected_cols): """Test GroupBy with various numeric column combinations.""" df = DataFrame(data) - result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) assert isinstance(result, DataFrame) assert result.columns.levels[0].tolist() == expected_cols - assert result.columns.levels[1].tolist() == ['sum', 'mean'] + assert result.columns.levels[1].tolist() == ["sum", "mean"] def test_groupby_agg_list_numeric_only_mixed_int_float(self): """Test that both int and float columns are included in GroupBy.""" - df = DataFrame({ - 'key': ['A', 'A', 'B', 'B'], - 'int_col': [1, 2, 3, 4], - 'float_col': [1.5, 2.5, 3.5, 4.5], - 'str_col': ['a', 'b', 'c', 'd'] - }) - result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "int_col": [1, 2, 3, 4], + "float_col": [1.5, 2.5, 3.5, 4.5], + "str_col": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) expected = DataFrame( - [[3, 1.5, 4.0, 2.0], - [7, 3.5, 8.0, 4.0]], - index=pd.Index(['A', 'B'], name='key'), - columns=pd.MultiIndex.from_product([['int_col', 'float_col'], ['sum', 'mean']]) + [[3, 1.5, 4.0, 2.0], [7, 3.5, 8.0, 4.0]], + index=pd.Index(["A", "B"], name="key"), + columns=pd.MultiIndex.from_product( + [["int_col", "float_col"], ["sum", "mean"]] + ), ) tm.assert_frame_equal(result, expected) def test_groupby_agg_list_numeric_only_preserves_column_order(self): """Test that GroupBy preserves column order.""" - df = DataFrame({ - 'key': ['A', 'A', 'B', 'B'], - 'z_col': [1, 2, 3, 4], - 'a_col': [10, 20, 30, 40], - 'm_col': [100, 200, 300, 400], - 'text': ['a', 'b', 'c', 'd'] - }) - result = df.groupby('key').agg(['sum'], numeric_only=True) + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "z_col": [1, 2, 3, 4], + "a_col": [10, 20, 30, 40], + "m_col": [100, 200, 300, 400], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key").agg(["sum"], numeric_only=True) - assert result.columns.levels[0].tolist() == ['z_col', 'a_col', 'm_col'] + assert result.columns.levels[0].tolist() == ["z_col", "a_col", "m_col"] @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_agg_list_numeric_only_parameter_values(self, numeric_only): """Test GroupBy with numeric_only=True and False.""" - df = DataFrame({ - 'key': ['A', 'A', 'B', 'B'], - 'val': [1, 2, 3, 4] - }) - result = df.groupby('key').agg(['sum', 'mean'], numeric_only=numeric_only) + df = DataFrame({"key": ["A", "A", "B", "B"], "val": [1, 2, 3, 4]}) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=numeric_only) expected = DataFrame( - [[3, 1.5], - [7, 3.5]], - index=pd.Index(['A', 'B'], name='key'), - columns=pd.MultiIndex.from_product([['val'], ['sum', 'mean']]) + [[3, 1.5], [7, 3.5]], + index=pd.Index(["A", "B"], name="key"), + columns=pd.MultiIndex.from_product([["val"], ["sum", "mean"]]), ) tm.assert_frame_equal(result, expected) def test_groupby_agg_list_numeric_only_single_group(self): """Test GroupBy with a single group.""" - df = DataFrame({ - 'key': ['A', 'A', 'A'], - 'val1': [1, 2, 3], - 'val2': [10, 20, 30], - 'text': ['x', 'y', 'z'] - }) - result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + df = DataFrame( + { + "key": ["A", "A", "A"], + "val1": [1, 2, 3], + "val2": [10, 20, 30], + "text": ["x", "y", "z"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) expected = DataFrame( [[6, 2.0, 60, 20.0]], - index=pd.Index(['A'], name='key'), - columns=pd.MultiIndex.from_product([['val1', 'val2'], ['sum', 'mean']]) + index=pd.Index(["A"], name="key"), + columns=pd.MultiIndex.from_product([["val1", "val2"], ["sum", "mean"]]), ) tm.assert_frame_equal(result, expected) def test_groupby_agg_list_numeric_only_many_groups(self): """Test GroupBy with many groups.""" - df = DataFrame({ - 'key': ['A', 'B', 'C', 'D', 'E'], - 'val': [1, 2, 3, 4, 5], - 'text': ['a', 'b', 'c', 'd', 'e'] - }) - result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + df = DataFrame( + { + "key": ["A", "B", "C", "D", "E"], + "val": [1, 2, 3, 4, 5], + "text": ["a", "b", "c", "d", "e"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) assert isinstance(result, DataFrame) assert len(result) == 5 - assert result.columns.levels[0].tolist() == ['val'] - assert result.columns.levels[1].tolist() == ['sum', 'mean'] + assert result.columns.levels[0].tolist() == ["val"] + assert result.columns.levels[1].tolist() == ["sum", "mean"] # ========== NEW TESTS - Additional Edge Cases ========== def test_groupby_agg_list_numeric_only_with_nans(self): """Test GroupBy with NaN values.""" - df = DataFrame({ - 'key': ['A', 'A', 'B', 'B'], - 'val': [1, np.nan, 3, 4], - 'text': ['a', 'b', 'c', 'd'] - }) - result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "val": [1, np.nan, 3, 4], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) assert isinstance(result, DataFrame) - assert result.loc['A', ('val', 'sum')] == 1.0 - assert result.loc['B', ('val', 'sum')] == 7.0 + assert result.loc["A", ("val", "sum")] == 1.0 + assert result.loc["B", ("val", "sum")] == 7.0 @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_agg_list_numeric_only_as_index(self, as_index): """Test GroupBy with as_index parameter.""" - df = DataFrame({ - 'key': ['A', 'A', 'B', 'B'], - 'val': [1, 2, 3, 4], - 'text': ['a', 'b', 'c', 'd'] - }) - result = df.groupby('key', as_index=as_index).agg(['sum', 'mean'], numeric_only=True) + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "val": [1, 2, 3, 4], + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key", as_index=as_index).agg( + ["sum", "mean"], numeric_only=True + ) if as_index: - assert result.index.name == 'key' + assert result.index.name == "key" else: - assert 'key' in result.columns + assert "key" in result.columns def test_groupby_agg_list_numeric_only_datetime_column(self): """Test GroupBy with datetime columns excluded.""" - df = DataFrame({ - 'key': ['A', 'A', 'B', 'B'], - 'val': [1, 2, 3, 4], - 'date': pd.date_range('2020-01-01', periods=4), - 'text': ['a', 'b', 'c', 'd'] - }) - result = df.groupby('key').agg(['sum', 'mean'], numeric_only=True) + df = DataFrame( + { + "key": ["A", "A", "B", "B"], + "val": [1, 2, 3, 4], + "date": pd.date_range("2020-01-01", periods=4), + "text": ["a", "b", "c", "d"], + } + ) + result = df.groupby("key").agg(["sum", "mean"], numeric_only=True) expected = DataFrame( - [[3, 1.5], - [7, 3.5]], - index=pd.Index(['A', 'B'], name='key'), - columns=pd.MultiIndex.from_product([['val'], ['sum', 'mean']]) + [[3, 1.5], [7, 3.5]], + index=pd.Index(["A", "B"], name="key"), + columns=pd.MultiIndex.from_product([["val"], ["sum", "mean"]]), ) - tm.assert_frame_equal(result, expected) \ No newline at end of file + tm.assert_frame_equal(result, expected) From 0b244f2a255412b657536b965fb635ca5e2b5792 Mon Sep 17 00:00:00 2001 From: Aqib Ali Date: Thu, 23 Oct 2025 13:36:19 +0530 Subject: [PATCH 3/3] Fix pre-commit.ci errors: line length and NumPy API deprecation, proper type checking with isinstance and type ignore --- pandas/core/apply.py | 11 +++++++++-- pandas/tests/apply/test_frame_apply_numeric_only.py | 7 ++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index f2baf1ad533eb..87d9ec13e2dc4 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -848,7 +848,8 @@ def agg_or_apply_list_like( # Only proceed if we have numeric columns if not numeric_obj.empty: - # Create kwargs without numeric_only to avoid passing it to Series methods + # Create kwargs without numeric_only to avoid + # passing it to Series methods kwargs_filtered = { k: v for k, v in kwargs.items() if k != "numeric_only" } @@ -864,7 +865,13 @@ def agg_or_apply_list_like( from pandas import DataFrame # Get function names for index - keys = list(self.func) if is_list_like(self.func) else [] + if isinstance(self.func, list): + keys = self.func # type: ignore[assignment] + elif isinstance(self.func, dict): + keys = list(self.func.keys()) + else: + keys = [] + return DataFrame(index=keys) keys, results = self.compute_list_like(op_name, obj, kwargs) diff --git a/pandas/tests/apply/test_frame_apply_numeric_only.py b/pandas/tests/apply/test_frame_apply_numeric_only.py index 257d6dbc1db08..1f76b62fdab13 100644 --- a/pandas/tests/apply/test_frame_apply_numeric_only.py +++ b/pandas/tests/apply/test_frame_apply_numeric_only.py @@ -206,14 +206,15 @@ def test_agg_list_numeric_only_with_datetime(self): def test_agg_list_numeric_only_large_dataframe(self): """Test with a larger DataFrame for performance verification.""" - np.random.seed(42) + rng = np.random.default_rng(42) df = DataFrame( { - "A": np.random.randint(1, 100, 1000), - "B": np.random.randn(1000), + "A": rng.integers(1, 100, 1000), + "B": rng.standard_normal(1000), "C": ["text"] * 1000, } ) + result = df.agg(["sum", "mean", "std"], numeric_only=True) # Just verify structure, not exact values due to randomness