Add --max-failures <n> option in test harness (#25569)

juj · web-flow · commit 0b127fffef51 · 2025-10-17T01:00:07.000+03:00
Add --max-failures &lt;n&gt; option in test harness to generalize on the
--failfast option (which is effectively a '--max-failures 0' mode).

This allows running the harness while allowing a couple of failures. I
find that when using `--failfast`, aborting on the first error is
sometimes a bit too eager, i.e. there might be a flaky test that
triggers the abort, or there might be an error that causes more than one
test to fail.

Examining test harness results, that interpreting a small handful of
failures (1,2,3...5) is still doable one-by-one, but when the number of
failures grows to, say, &gt; 5, the typically I want to deal with the first
couple of errors only, and re-run the harness after that, assuming the
rest of the errors would be from the same root cause.

So this enables running with, say, `--max-failures 5`, and get info on
the first handful of errors, and ignore if any more errors occur.

This helps retain the failfast-like behavior while producing a bit more
info, and being a bit more tolerant to instabilities.
diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
@@ -44,12 +44,17 @@ def cap_max_workers_in_pool(max_workers, is_browser):
   return max_workers
 
 
-def run_test(test, failfast_event, lock, progress_counter, num_tests):
-  # If failfast mode is in effect and any of the tests have failed,
-  # and then we should abort executing further tests immediately.
-  if failfast_event and failfast_event.is_set():
+def run_test(test, allowed_failures_counter, lock, progress_counter, num_tests):
+  # If we have exceeded the number of allowed failures during the test run,
+  # abort executing further tests immediately.
+  if allowed_failures_counter and allowed_failures_counter.value < 0:
     return None
 
+  def test_failed():
+    if allowed_failures_counter is not None:
+      with lock:
+        allowed_failures_counter.value -= 1
+
   olddir = os.getcwd()
   result = BufferedParallelTestResult(lock, progress_counter, num_tests)
   temp_dir = tempfile.mkdtemp(prefix='emtest_')
@@ -61,14 +66,13 @@ def run_test(test, failfast_event, lock, progress_counter, num_tests):
     test(result)
 
     # Alert all other multiprocess pool runners that they need to stop executing further tests.
-    if failfast_event is not None and result.test_result not in ['success', 'skipped']:
-      failfast_event.set()
+    if result.test_result not in ['success', 'skipped']:
+      test_failed()
   except unittest.SkipTest as e:
     result.addSkip(test, e)
   except Exception as e:
     result.addError(test, e)
-    if failfast_event is not None:
-      failfast_event.set()
+    test_failed()
   # Before attempting to delete the tmp dir make sure the current
   # working directory is not within it.
   os.chdir(olddir)
@@ -97,7 +101,7 @@ class ParallelTestSuite(unittest.BaseTestSuite):
   def __init__(self, max_cores, options):
     super().__init__()
     self.max_cores = max_cores
-    self.failfast = options.failfast
+    self.max_failures = options.max_failures
     self.failing_and_slow_first = options.failing_and_slow_first
 
   def addTest(self, test):
@@ -126,17 +130,17 @@ def run(self, result):
         initargs=(worker_id_counter, worker_id_lock),
       ) as pool:
         if python_multiprocessing_structures_are_buggy():
-          # When multuprocessing shared structures are buggy we don't support failfast
+          # When multiprocessing shared structures are buggy we don't support failfast
           # or the progress bar.
-          failfast_event = progress_counter = lock = None
-          if self.failfast:
-            errlog('The version of python being used is not compatible with --failfast')
+          allowed_failures_counter = progress_counter = lock = None
+          if self.max_failures < 2**31 - 1:
+            errlog('The version of python being used is not compatible with --failfast and --max-failures options. See https://github.com/python/cpython/issues/71936')
             sys.exit(1)
         else:
-          failfast_event = manager.Event() if self.failfast else None
+          allowed_failures_counter = manager.Value('i', self.max_failures)
           progress_counter = manager.Value('i', 0)
           lock = manager.Lock()
-        results = pool.starmap(run_test, ((t, failfast_event, lock, progress_counter, len(tests)) for t in tests), chunksize=1)
+        results = pool.starmap(run_test, ((t, allowed_failures_counter, lock, progress_counter, len(tests)) for t in tests), chunksize=1)
         # Send a task to each worker to tear down the browser and server. This
         # relies on the implementation detail in the worker pool that all workers
         # are cycled through once.
@@ -145,9 +149,8 @@ def run(self, result):
         if num_tear_downs != use_cores:
           errlog(f'Expected {use_cores} teardowns, got {num_tear_downs}')
 
-    # Filter out the None results which can occur in failfast mode.
-    if self.failfast:
-      results = [r for r in results if r is not None]
+    # Filter out the None results which can occur if # of allowed errors was exceeded and the harness aborted.
+    results = [r for r in results if r is not None]
 
     if self.failing_and_slow_first:
       previous_test_run_results = common.load_previous_test_run_results()
diff --git a/test/runner.py b/test/runner.py
@@ -275,7 +275,7 @@ def error_on_legacy_suite_names(args):
 # order to run the tests in. Generally this is slowest-first to maximize
 # parallelization, but if running with fail-fast, then the tests with recent
 # known failure frequency are run first, followed by slowest first.
-def create_test_run_sorter(failfast):
+def create_test_run_sorter(sort_failing_tests_at_front):
   previous_test_run_results = common.load_previous_test_run_results()
 
   def read_approx_fail_freq(test_name):
@@ -297,8 +297,8 @@ def sort_tests_failing_and_slowest_first_comparator(x, y):
     y = str(y)
 
     # Look at the number of times this test has failed, and order by failures count first
-    # Only do this in --failfast, if we are looking to fail early. (otherwise sorting by last test run duration is more productive)
-    if failfast:
+    # Only do this if we are looking to fail early. (otherwise sorting by last test run duration is more productive)
+    if sort_failing_tests_at_front:
       x_fail_freq = read_approx_fail_freq(x)
       y_fail_freq = read_approx_fail_freq(y)
       if x_fail_freq != y_fail_freq:
@@ -370,7 +370,7 @@ def load_test_suites(args, modules, options):
       tests = flattened_tests(loaded_tests)
       suite = suite_for_module(m, tests, options)
       if options.failing_and_slow_first:
-        tests = sorted(tests, key=cmp_to_key(create_test_run_sorter(options.failfast)))
+        tests = sorted(tests, key=cmp_to_key(create_test_run_sorter(options.max_failures < len(tests) / 2)))
       for test in tests:
         if not found_start:
           # Skip over tests until we find the start
@@ -480,7 +480,8 @@ def parse_args():
   parser.add_argument('--browser-auto-config', type=bool, default=True,
                       help='Use the default CI browser configuration.')
   parser.add_argument('tests', nargs='*')
-  parser.add_argument('--failfast', action='store_true')
+  parser.add_argument('--failfast', action='store_true', help='If true, test run will abort on first failed test.')
+  parser.add_argument('--max-failures', type=int, default=2**31 - 1, help='If specified, test run will abort after N failed tests.')
   parser.add_argument('--failing-and-slow-first', action='store_true', help='Run failing tests first, then sorted by slowest first. Combine with --failfast for fast fail-early CI runs.')
   parser.add_argument('--start-at', metavar='NAME', help='Skip all tests up until <NAME>')
   parser.add_argument('--continue', dest='_continue', action='store_true',
@@ -492,7 +493,15 @@ def parse_args():
   parser.add_argument('--repeat', type=int, default=1,
                       help='Repeat each test N times (default: 1).')
   parser.add_argument('--bell', action='store_true', help='Play a sound after the test suite finishes.')
-  return parser.parse_args()
+
+  options = parser.parse_args()
+
+  if options.failfast:
+    if options.max_failures != 0:
+      utils.exit_with_error('--failfast and --max-failures are mutually exclusive!')
+    options.max_failures = 0
+
+  return options
 
 
 def configure():