@@ -147,7 +147,9 @@ def test_unsupported_dtype(c_parser_only, match, kwargs):
 
 @td.skip_if_32bit
 @pytest.mark.slow
-def test_precise_conversion(c_parser_only):
+# test numbers between 1 and 2
+@pytest.mark.parametrize("num", np.linspace(1.0, 2.0, num=21))
+def test_precise_conversion(c_parser_only, num):
     parser = c_parser_only
 
     normal_errors = []
@@ -156,27 +158,23 @@ def test_precise_conversion(c_parser_only):
     def error(val: float, actual_val: Decimal) -> Decimal:
         return abs(Decimal(f"{val:.100}") - actual_val)
 
-    # test numbers between 1 and 2
-    for num in np.linspace(1.0, 2.0, num=500):
-        # 25 decimal digits of precision
-        text = f"a\n{num:.25}"
+    # 25 decimal digits of precision
+    text = f"a\n{num:.25}"
 
-        normal_val = float(
-            parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
-        )
-        precise_val = float(
-            parser.read_csv(StringIO(text), float_precision="high")["a"][0]
-        )
-        roundtrip_val = float(
-            parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
-        )
-        actual_val = Decimal(text[2:])
+    normal_val = float(
+        parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
+    )
+    precise_val = float(parser.read_csv(StringIO(text), float_precision="high")["a"][0])
+    roundtrip_val = float(
+        parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
+    )
+    actual_val = Decimal(text[2:])
 
-        normal_errors.append(error(normal_val, actual_val))
-        precise_errors.append(error(precise_val, actual_val))
+    normal_errors.append(error(normal_val, actual_val))
+    precise_errors.append(error(precise_val, actual_val))
 
-        # round-trip should match float()
-        assert roundtrip_val == float(text[2:])
+    # round-trip should match float()
+    assert roundtrip_val == float(text[2:])
 
     assert sum(precise_errors) <= sum(normal_errors)
     assert max(precise_errors) <= max(normal_errors)
@@ -287,7 +285,8 @@ def test_tokenize_CR_with_quoting(c_parser_only):
 
 
 @pytest.mark.slow
-def test_grow_boundary_at_cap(c_parser_only):
+@pytest.mark.parametrize("count", [3 * 2**n for n in range(6)])
+def test_grow_boundary_at_cap(c_parser_only, count):
     # See gh-12494
     #
     # Cause of error was that the C parser
@@ -296,19 +295,18 @@ def test_grow_boundary_at_cap(c_parser_only):
     # to capacity, which would later cause a
     # buffer overflow error when checking the
     # EOF terminator of the CSV stream.
+    # 3 * 2^n commas was observed to break the parser
     parser = c_parser_only
 
-    def test_empty_header_read(count):
-        with StringIO("," * count) as s:
-            expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
-            df = parser.read_csv(s)
-            tm.assert_frame_equal(df, expected)
-
-    for cnt in range(1, 101):
-        test_empty_header_read(cnt)
+    with StringIO("," * count) as s:
+        expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
+        df = parser.read_csv(s)
+        tm.assert_frame_equal(df, expected)
 
 
-def test_parse_trim_buffers(c_parser_only):
+@pytest.mark.slow
+@pytest.mark.parametrize("encoding", [None, "utf-8"])
+def test_parse_trim_buffers(c_parser_only, encoding):
     # This test is part of a bugfix for gh-13703. It attempts to
     # to stress the system memory allocator, to cause it to move the
     # stream buffer and either let the OS reclaim the region, or let
@@ -319,6 +317,9 @@ def test_parse_trim_buffers(c_parser_only):
     # times it fails due to memory corruption, which causes the
     # loaded DataFrame to differ from the expected one.
 
+    # Also force 'utf-8' encoding, so that `_string_convert` would take
+    # a different execution branch.
+
    parser = c_parser_only
 
     # Generate a large mixed-type CSV file on-the-fly (one record is
@@ -374,25 +375,16 @@ def test_parse_trim_buffers(c_parser_only):
     )
 
     # Iterate over the CSV file in chunks of `chunksize` lines
-    with parser.read_csv(
-        StringIO(csv_data), header=None, dtype=object, chunksize=chunksize
-    ) as chunks_:
-        result = concat(chunks_, axis=0, ignore_index=True)
-
-    # Check for data corruption if there was no segfault
-    tm.assert_frame_equal(result, expected)
-
-    # This extra test was added to replicate the fault in gh-5291.
-    # Force 'utf-8' encoding, so that `_string_convert` would take
-    # a different execution branch.
     with parser.read_csv(
         StringIO(csv_data),
         header=None,
         dtype=object,
         chunksize=chunksize,
-        encoding="utf_8",
+        encoding=encoding,
     ) as chunks_:
         result = concat(chunks_, axis=0, ignore_index=True)
+
+    # Check for data corruption if there was no segfault
     tm.assert_frame_equal(result, expected)
 
 
0 commit comments