Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1149,6 +1149,8 @@ Groupby/resample/rolling
- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
- Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
- Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` where the end of the window was not indexed correctly. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`)
- Bug in :meth:`DataFrameGroupBy.agg` where ``numeric_only=True`` was ignored when passing a list of aggregation functions (:issue:`49352`)

Reshaping
^^^^^^^^^
Expand Down Expand Up @@ -1202,6 +1204,7 @@ Other
- Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`)
- Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`)
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
- Bug in :meth:`DataFrame.agg` where ``numeric_only=True`` was ignored when passing a list of aggregation functions, causing non-numeric columns to be included or raising ``TypeError`` (:issue:`49352`)
- Bug in :meth:`DataFrame.apply` raising ``RecursionError`` when passing ``func=list[int]``. (:issue:`61565`)
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
Expand Down
50 changes: 50 additions & 0 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,6 +837,43 @@ def agg_or_apply_list_like(
if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")

# GH#49352 - Handle numeric_only with list of functions
# When numeric_only=True is passed with a list of functions, filter
# to numeric columns before processing to avoid TypeError on non-numeric Series
if op_name == "agg" and kwargs.get("numeric_only", False):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only two of the five paths in DataFrameGroupBy.aggregate hit this code, thus introducing inconsistencies. To accept a PR, we would need to do this consistently throughout the op. Take a look at the code in pandas.core.groupby.generic.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right, I need to ensure the filtering is applied consistently across all code paths in DataFrameGroupBy.aggregate.
I'll update the PR.

# Check if obj is a DataFrame (not Series) with 2 dimensions
if isinstance(obj, ABCDataFrame) and obj.ndim == 2:
# Filter to numeric columns before processing
numeric_obj = obj.select_dtypes(include="number")

# Only proceed if we have numeric columns
if not numeric_obj.empty:
# Create kwargs without numeric_only to avoid
# passing it to Series methods
kwargs_filtered = {
k: v for k, v in kwargs.items() if k != "numeric_only"
}

# Compute with filtered object and cleaned kwargs
keys, results = self.compute_list_like(
op_name, numeric_obj, kwargs_filtered
)
result = self.wrap_results_list_like(keys, results)
return result
else:
# No numeric columns - return empty result
from pandas import DataFrame

# Get function names for index
if isinstance(self.func, list):
keys = self.func # type: ignore[assignment]
elif isinstance(self.func, dict):
keys = list(self.func.keys())
else:
keys = []

return DataFrame(index=keys)

keys, results = self.compute_list_like(op_name, obj, kwargs)
result = self.wrap_results_list_like(keys, results)
return result
Expand Down Expand Up @@ -1629,6 +1666,19 @@ def agg_or_apply_list_like(
else:
selected_obj = obj._obj_with_exclusions

# GH#49352 - Handle numeric_only with list of functions for GroupBy
# Filter to numeric columns before processing to avoid TypeError
if op_name == "agg" and kwargs.get("numeric_only", False):
# For GroupBy, filter the selected object to numeric columns
if selected_obj.ndim == 2:
numeric_obj = selected_obj.select_dtypes(include="number")

if not numeric_obj.empty:
# Update selected_obj to filtered numeric columns
selected_obj = numeric_obj
# Remove numeric_only from kwargs to avoid passing to Series methods
kwargs = {k: v for k, v in kwargs.items() if k != "numeric_only"}

# Only set as_index=True on groupby objects, not Window or Resample
# that inherit from this class.
with com.temp_setattr(
Expand Down
224 changes: 224 additions & 0 deletions pandas/tests/apply/test_frame_apply_numeric_only.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
"""
Tests for DataFrame.agg with numeric_only parameter and list of functions.
This tests the fix for GH#49352.
"""

import numpy as np
import pytest

import pandas as pd
from pandas import DataFrame
import pandas._testing as tm


class TestFrameAggNumericOnly:
    """
    Tests for ``DataFrame.agg`` with a list of functions and ``numeric_only``.

    GH#49352: with ``numeric_only=True`` the frame is expected to be reduced
    to its numeric columns before the listed functions are applied, so the
    result should equal aggregating the numeric subset directly.
    """

    def test_agg_list_numeric_only_mixed_dtypes(self):
        """GH#49352 - Main test case from the issue."""
        df = DataFrame(
            {
                "A": [1, 2, 3, 4, 5],
                "B": [10.5, 20.5, 30.5, 40.5, 50.5],
                "C": ["a", "b", "c", "d", "e"],
            }
        )
        result = df.agg(["min", "max", "mean"], numeric_only=True)
        expected = DataFrame(
            {"A": [1.0, 5.0, 3.0], "B": [10.5, 50.5, 30.5]},
            index=["min", "max", "mean"],
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_list_numeric_only_all_numeric(self):
        """Should work when all columns are numeric."""
        df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]})
        result = df.agg(["sum", "mean"], numeric_only=True)
        expected = DataFrame(
            {"A": [6.0, 2.0], "B": [60.0, 20.0]}, index=["sum", "mean"]
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_list_numeric_only_no_numeric(self):
        """Should return empty DataFrame when no numeric columns."""
        df = DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]})
        result = df.agg(["min", "max"], numeric_only=True)
        # No columns survive the numeric filter; only the function labels remain.
        expected = DataFrame(index=["min", "max"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "funcs,expected_index",
        [
            (["sum", "mean"], ["sum", "mean"]),
            ([np.sum, np.mean], ["sum", "mean"]),
            (["sum", np.mean], ["sum", "mean"]),
            ([np.sum, "mean"], ["sum", "mean"]),
        ],
    )
    def test_agg_list_numeric_only_various_function_types(self, funcs, expected_index):
        """Test with different combinations of string and numpy functions."""
        df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30], "C": ["a", "b", "c"]})
        result = df.agg(funcs, numeric_only=True)
        expected = DataFrame({"A": [6.0, 2.0], "B": [60.0, 20.0]}, index=expected_index)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "funcs",
        [
            ["min", "max"],
            ["sum", "mean", "std"],
            ["min", "max", "mean", "median"],
        ],
    )
    def test_agg_list_numeric_only_different_function_counts(self, funcs):
        """Test with different numbers of functions."""
        df = DataFrame(
            {
                "A": [1, 2, 3, 4, 5],
                "B": [10, 20, 30, 40, 50],
                "C": ["a", "b", "c", "d", "e"],
            }
        )
        result = df.agg(funcs, numeric_only=True)

        # Labels: one row per function, numeric columns only.
        assert list(result.columns) == ["A", "B"]
        assert list(result.index) == funcs

        # Values: must match aggregating the numeric subset directly, so a
        # result with the right shape but wrong numbers is not accepted.
        expected = df[["A", "B"]].agg(funcs)
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "data,expected_cols",
        [
            # Only integers
            ({"A": [1, 2, 3], "B": [4, 5, 6], "C": ["x", "y", "z"]}, ["A", "B"]),
            # Only floats
            ({"A": [1.1, 2.2], "B": [3.3, 4.4], "C": ["x", "y"]}, ["A", "B"]),
            # Mix of int and float
            ({"int": [1, 2], "float": [1.5, 2.5], "str": ["a", "b"]}, ["int", "float"]),
            # Single numeric column
            ({"num": [1, 2, 3], "text": ["a", "b", "c"]}, ["num"]),
        ],
    )
    def test_agg_list_numeric_only_various_dtypes(self, data, expected_cols):
        """Test with various numeric dtype combinations."""
        df = DataFrame(data)
        result = df.agg(["sum", "mean"], numeric_only=True)

        assert list(result.columns) == expected_cols
        assert list(result.index) == ["sum", "mean"]

        # Compare values against the explicitly selected numeric columns.
        expected = df[expected_cols].agg(["sum", "mean"])
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("numeric_only", [True, False, None])
    def test_agg_list_numeric_only_parameter_values(self, numeric_only):
        """Test with different numeric_only parameter values."""
        df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]})

        # ``None`` stands for "keyword not passed at all".
        kwargs = {} if numeric_only is None else {"numeric_only": numeric_only}
        result = df.agg(["sum", "mean"], **kwargs)

        expected = DataFrame({"A": [6, 2.0], "B": [60, 20.0]}, index=["sum", "mean"])
        tm.assert_frame_equal(result, expected)

    def test_agg_list_numeric_only_false_with_strings(self):
        """Verify numeric_only=False works with min/max on strings."""
        df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})
        result = df.agg(["min", "max"], numeric_only=False)
        expected = DataFrame({"A": [1, 3], "B": ["a", "c"]}, index=["min", "max"])
        tm.assert_frame_equal(result, expected)

    def test_agg_list_numeric_only_preserves_column_order(self):
        """Test that column order is preserved."""
        # Column labels are deliberately not in sorted order.
        df = DataFrame(
            {
                "Z": [1, 2, 3],
                "A": [10, 20, 30],
                "M": [100, 200, 300],
                "text": ["a", "b", "c"],
            }
        )
        result = df.agg(["sum", "mean"], numeric_only=True)

        assert list(result.columns) == ["Z", "A", "M"]

    @pytest.mark.parametrize("single_func", ["sum", "mean", "min", "max"])
    def test_agg_single_function_still_works(self, single_func):
        """Verify that single function (not a list) still works."""
        df = DataFrame({"A": [1, 2, 3], "B": [10, 20, 30], "C": ["a", "b", "c"]})
        result = df.agg(single_func, numeric_only=True)

        # A scalar function reduces to a Series over the numeric columns only;
        # check the values too, not just which labels are present.
        expected = df[["A", "B"]].agg(single_func)
        tm.assert_series_equal(result, expected)
        assert "C" not in result.index

    def test_agg_list_numeric_only_with_int_and_float(self):
        """Test that both int and float columns are included."""
        df = DataFrame(
            {
                "int_col": [1, 2, 3],
                "float_col": [1.5, 2.5, 3.5],
                "str_col": ["a", "b", "c"],
            }
        )
        result = df.agg(["sum", "mean"], numeric_only=True)
        expected = DataFrame(
            {"int_col": [6.0, 2.0], "float_col": [7.5, 2.5]}, index=["sum", "mean"]
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_list_numeric_only_single_row(self):
        """Test with single row DataFrame."""
        df = DataFrame({"A": [1], "B": [10], "C": ["x"]})
        result = df.agg(["sum", "mean"], numeric_only=True)
        expected = DataFrame(
            {"A": [1.0, 1.0], "B": [10.0, 10.0]}, index=["sum", "mean"]
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_list_numeric_only_with_nans(self):
        """Test DataFrame with NaN values."""
        df = DataFrame(
            {"A": [1, np.nan, 3], "B": [10, 20, np.nan], "C": ["x", "y", "z"]}
        )
        result = df.agg(["sum", "mean"], numeric_only=True)
        expected = DataFrame(
            {"A": [4.0, 2.0], "B": [30.0, 15.0]}, index=["sum", "mean"]
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_list_numeric_only_with_datetime(self):
        """Test that datetime columns are excluded with numeric_only=True."""
        df = DataFrame(
            {
                "num": [1, 2, 3],
                "date": pd.date_range("2020-01-01", periods=3),
                "text": ["a", "b", "c"],
            }
        )
        result = df.agg(["sum", "mean"], numeric_only=True)
        expected = DataFrame({"num": [6.0, 2.0]}, index=["sum", "mean"])
        tm.assert_frame_equal(result, expected)

    def test_agg_list_numeric_only_large_dataframe(self):
        """Test with a larger DataFrame (1000 rows) and three functions."""
        # The generator is seeded, so the data is reproducible; the expected
        # values are derived from the numeric subset rather than hard-coded.
        rng = np.random.default_rng(42)
        df = DataFrame(
            {
                "A": rng.integers(1, 100, 1000),
                "B": rng.standard_normal(1000),
                "C": ["text"] * 1000,
            }
        )

        result = df.agg(["sum", "mean", "std"], numeric_only=True)

        assert list(result.columns) == ["A", "B"]
        assert list(result.index) == ["sum", "mean", "std"]

        expected = df[["A", "B"]].agg(["sum", "mean", "std"])
        tm.assert_frame_equal(result, expected)
Loading
Loading