Commit bd27dbf

SkafteNicki, Borda, and deependujha authored
Add new args to BatchSizeFinder (#21163)
* new args to batch size scaler
* add to tuner
* add to callback
* add testing
* update
* fix tests
* Apply suggestion from @deependujha
* Apply suggestion from @deependujha
* Apply suggestion from @deependujha
* Apply suggestion from @deependujha
* update
* safe default
* update
* add assertion
* fix doc issue

Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Co-authored-by: Deependu <deependujha21@gmail.com>

1 parent a7cb33a · commit bd27dbf

File tree: 4 files changed (+137 −10 lines)

src/lightning/pytorch/callbacks/batch_size_finder.py

Lines changed: 14 additions & 0 deletions
@@ -63,6 +63,12 @@ class BatchSizeFinder(Callback):
             - ``model.hparams``
             - ``trainer.datamodule`` (the datamodule passed to the tune method)
 
+        margin: Margin to reduce the found batch size by to provide a safety buffer. Only applied when using
+            'binsearch' mode. Should be a float between 0 and 1. Defaults to 0.05 (5% reduction).
+        max_val: Maximum batch size limit, defaults to 8192.
+            Helps prevent testing unrealistically large or inefficient batch sizes (e.g., 2**25)
+            when running on CPU or when automatic OOM detection is not available.
+
     Example::
 
         # 1. Customize the BatchSizeFinder callback to run at different epochs. This feature is
@@ -118,17 +124,23 @@ def __init__(
         init_val: int = 2,
         max_trials: int = 25,
         batch_arg_name: str = "batch_size",
+        margin: float = 0.05,
+        max_val: int = 8192,
     ) -> None:
         mode = mode.lower()
         if mode not in self.SUPPORTED_MODES:
             raise ValueError(f"`mode` should be either of {self.SUPPORTED_MODES}")
 
+        assert 0.0 <= margin < 1.0, f"`margin` should be between 0 and 1. Found {margin=}"
+
         self.optimal_batch_size: Optional[int] = init_val
         self._mode = mode
         self._steps_per_trial = steps_per_trial
         self._init_val = init_val
         self._max_trials = max_trials
         self._batch_arg_name = batch_arg_name
+        self._margin = margin
+        self._max_val = max_val
         self._early_exit = False
 
     @override
@@ -180,6 +192,8 @@ def scale_batch_size(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule
             self._init_val,
             self._max_trials,
             self._batch_arg_name,
+            self._margin,
+            self._max_val,
         )
 
         self.optimal_batch_size = new_size
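
Taken together, the callback now forwards both new arguments into the scaling routine. A minimal usage sketch (the commented `trainer.fit(model)` call and the `max_val=4096` choice are illustrative, not part of this commit):

    from lightning.pytorch import Trainer
    from lightning.pytorch.callbacks import BatchSizeFinder

    # The two new arguments added by this commit; everything else is a default.
    finder = BatchSizeFinder(
        mode="binsearch",
        margin=0.05,   # shrink the found size by 5% as a safety buffer (binsearch only)
        max_val=4096,  # never probe batch sizes above this cap
    )
    trainer = Trainer(callbacks=[finder])
    # trainer.fit(model)  # `model` must expose `batch_size` (or `hparams.batch_size`)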

src/lightning/pytorch/tuner/batch_size_scaling.py

Lines changed: 57 additions & 6 deletions
@@ -32,6 +32,8 @@ def _scale_batch_size(
     init_val: int = 2,
     max_trials: int = 25,
     batch_arg_name: str = "batch_size",
+    margin: float = 0.05,
+    max_val: int = 8192,
 ) -> Optional[int]:
     """Iteratively try to find the largest batch size for a given model that does not give an out of memory (OOM)
     error.
@@ -58,7 +60,15 @@ def _scale_batch_size(
             - ``model.hparams``
             - ``trainer.datamodule`` (the datamodule passed to the tune method)
 
+        margin: Margin to reduce the found batch size by to provide a safety buffer. Only applied when using
+            'binsearch' mode. Should be a float between 0 and 1. Defaults to 0.05 (5% reduction).
+        max_val: Maximum batch size limit, defaults to 8192.
+            Helps prevent testing unrealistically large or inefficient batch sizes (e.g., 2**25)
+            when running on CPU or when automatic OOM detection is not available.
+
     """
+    assert 0.0 <= margin < 1.0, f"`margin` should be between 0 and 1. Found {margin=}"
+
     if trainer.fast_dev_run:
         rank_zero_warn("Skipping batch size scaler since `fast_dev_run` is enabled.")
         return None
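
The same assertion now guards the callback, the tuner, and this function. A standalone sketch of what a caller sees for an out-of-range value (the `margin = 1.0` input is illustrative):

    # Reproducing the validation in isolation; `{margin=}` is an f-string debug spec.
    margin = 1.0
    try:
        assert 0.0 <= margin < 1.0, f"`margin` should be between 0 and 1. Found {margin=}"
    except AssertionError as err:
        print(err)  # `margin` should be between 0 and 1. Found margin=1.0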
@@ -80,9 +90,9 @@ def _scale_batch_size(
     new_size, _ = _adjust_batch_size(trainer, batch_arg_name, value=init_val)
 
     if mode == "power":
-        new_size = _run_power_scaling(trainer, new_size, batch_arg_name, max_trials, params)
+        new_size = _run_power_scaling(trainer, new_size, batch_arg_name, max_trials, params, max_val)
     elif mode == "binsearch":
-        new_size = _run_binary_scaling(trainer, new_size, batch_arg_name, max_trials, params)
+        new_size = _run_binsearch_scaling(trainer, new_size, batch_arg_name, max_trials, params, margin, max_val)
 
     garbage_collection_cuda()
 
@@ -173,6 +183,7 @@ def _run_power_scaling(
     batch_arg_name: str,
     max_trials: int,
     params: dict[str, Any],
+    max_val: int = 8192,
 ) -> int:
     """Batch scaling mode where the size is doubled at each iteration until an OOM error is encountered."""
     # this flag is used to determine whether the previously scaled batch size, right before OOM, was a success or not
@@ -185,6 +196,10 @@ def _run_power_scaling(
         # reset after each try
         _reset_progress(trainer)
 
+        if new_size >= max_val:
+            rank_zero_info(f"Reached the maximum batch size limit of {max_val}. Stopping search.")
+            break
+
         try:
             _try_loop_run(trainer, params)
             last_successful_size = new_size  # Store the current size before doubling
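
In isolation, the new guard simply bounds the doubling search. A simplified sketch, assuming no OOM handling or trainer state, with a hypothetical `fits` predicate standing in for a successful trial:

    # Not the library's actual loop; only shows how max_val bounds the search.
    def doubling_search(init_val: int, max_trials: int, max_val: int, fits) -> int:
        size = init_val
        for _ in range(max_trials):
            if size >= max_val:
                break  # mirrors "Reached the maximum batch size limit"
            if not fits(size * 2):
                break
            size *= 2
        return min(size, max_val)

    # With every size fitting, the search stops exactly at the cap:
    print(doubling_search(2, max_trials=25, max_val=8192, fits=lambda s: True))  # 8192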
@@ -217,18 +232,22 @@ def _run_power_scaling(
     return new_size
 
 
-def _run_binary_scaling(
+def _run_binsearch_scaling(
     trainer: "pl.Trainer",
     new_size: int,
     batch_arg_name: str,
     max_trials: int,
     params: dict[str, Any],
+    margin: float,
+    max_val: int = 8192,
 ) -> int:
     """Batch scaling mode where the size is initially is doubled at each iteration until an OOM error is encountered.
 
     Hereafter, the batch size is further refined using a binary search
 
     """
+    assert 0.0 <= margin < 1.0, f"`margin` should be between 0 and 1. Found {margin=}"
+
     low = 1
     high = None
     count = 0
@@ -239,6 +258,10 @@ def _run_binary_scaling(
         # reset after each try
         _reset_progress(trainer)
 
+        if new_size >= max_val:
+            rank_zero_info(f"Reached the maximum batch size limit of {max_val}. Stopping search.")
+            break
+
         try:
             # run loop
             _try_loop_run(trainer, params)
@@ -256,9 +279,13 @@ def _run_binary_scaling(
                 if high - low <= 1:
                     break
                 midval = (high + low) // 2
-                new_size, changed = _adjust_batch_size(trainer, batch_arg_name, value=midval, desc="succeeded")
+                new_size, changed = _adjust_batch_size(
+                    trainer, batch_arg_name, value=midval, desc="succeeded", max_val=max_val
+                )
             else:
-                new_size, changed = _adjust_batch_size(trainer, batch_arg_name, factor=2.0, desc="succeeded")
+                new_size, changed = _adjust_batch_size(
+                    trainer, batch_arg_name, factor=2.0, desc="succeeded", max_val=max_val
+                )
 
             if not changed:
                 break
@@ -284,6 +311,17 @@ def _run_binary_scaling(
             else:
                 raise  # some other error not memory related
 
+    # Apply margin reduction for binsearch mode
+    if margin > 0:
+        margin_reduced_size = max(1, int(new_size * (1 - margin)))
+        if margin_reduced_size != new_size:
+            rank_zero_info(
+                f"Applying margin of {margin:.1%}, reducing batch size from {new_size} to {margin_reduced_size}"
+            )
+            new_size = margin_reduced_size
+            # propagate the reduced batch size to the model/datamodule attribute
+            lightning_setattr(trainer.lightning_module, batch_arg_name, new_size)
+
     return new_size
 
 
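The reduction itself is one line of arithmetic. A standalone sketch with illustrative values (the helper name `apply_margin` is not from the library):

    def apply_margin(found_size: int, margin: float) -> int:
        assert 0.0 <= margin < 1.0
        return max(1, int(found_size * (1 - margin)))

    print(apply_margin(256, 0.05))  # 243 (256 * 0.95 = 243.2, truncated)
    print(apply_margin(8, 0.05))    # 7, matching the updated test expectations below
    print(apply_margin(1, 0.9))     # 1, the floor keeps the batch size positive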

@@ -293,6 +331,7 @@ def _adjust_batch_size(
     factor: float = 1.0,
     value: Optional[int] = None,
     desc: Optional[str] = None,
+    max_val: int = 8192,
 ) -> tuple[int, bool]:
     """Helper function for adjusting the batch size.
 
@@ -303,6 +342,9 @@ def _adjust_batch_size(
         value: if a value is given, will override the batch size with this value.
             Note that the value of `factor` will not have an effect in this case
         desc: either ``"succeeded"`` or ``"failed"``. Used purely for logging
+        max_val: Maximum batch size limit, defaults to 8192.
+            Helps prevent testing unrealistically large or inefficient batch sizes (e.g., 2**25)
+            when running on CPU or when automatic OOM detection is not available.
 
     Returns:
         The new batch size for the next trial and a bool that signals whether the
@@ -321,13 +363,22 @@ def _adjust_batch_size(
     try:
         combined_dataset_length = combined_loader._dataset_length()
         if batch_size >= combined_dataset_length:
-            rank_zero_info(f"The batch size {batch_size} is greater or equal than the length of your dataset.")
+            rank_zero_info(
+                f"The batch size {batch_size} is greater or equal than"
+                f" the length of your dataset: {combined_dataset_length}."
+            )
             return batch_size, False
     except NotImplementedError:
         # all datasets are iterable style
         pass
 
     new_size = value if value is not None else int(batch_size * factor)
+
+    # Apply max_val limit if provided
+    if new_size > max_val:
+        if desc:
+            rank_zero_info(f"Batch size {new_size} exceeds max_val limit {max_val}, capping at {max_val}")
+        new_size = max_val
     if desc:
         rank_zero_info(f"Batch size {batch_size} {desc}, trying batch size {new_size}")
     changed = new_size != batch_size

src/lightning/pytorch/tuner/tuning.py

Lines changed: 11 additions & 0 deletions
@@ -41,6 +41,8 @@ def scale_batch_size(
         init_val: int = 2,
         max_trials: int = 25,
         batch_arg_name: str = "batch_size",
+        margin: float = 0.05,
+        max_val: int = 8192,
     ) -> Optional[int]:
         """Iteratively try to find the largest batch size for a given model that does not give an out of memory (OOM)
         error.
@@ -75,9 +77,16 @@ def scale_batch_size(
                 - ``model.hparams``
                 - ``trainer.datamodule`` (the datamodule passed to the tune method)
 
+            margin: Margin to reduce the found batch size by to provide a safety buffer. Only applied when using
+                'binsearch' mode. Should be a float between 0 and 1. Defaults to 0.05 (5% reduction).
+            max_val: Maximum batch size limit, defaults to 8192.
+                Helps prevent testing unrealistically large or inefficient batch sizes (e.g., 2**25)
+                when running on CPU or when automatic OOM detection is not available.
+
         """
         _check_tuner_configuration(train_dataloaders, val_dataloaders, dataloaders, method)
         _check_scale_batch_size_configuration(self._trainer)
+        assert 0.0 <= margin < 1.0, f"`margin` should be between 0 and 1. Found {margin=}"
 
         # local import to avoid circular import
         from lightning.pytorch.callbacks.batch_size_finder import BatchSizeFinder
@@ -88,6 +97,8 @@ def scale_batch_size(
             init_val=init_val,
             max_trials=max_trials,
             batch_arg_name=batch_arg_name,
+            margin=margin,
+            max_val=max_val,
         )
         # do not continue with the loop in case Tuner is used
         batch_size_finder._early_exit = True
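
With this plumbing, both arguments are reachable from the public `Tuner` API. A minimal sketch (the commented call assumes a `model` exposing `batch_size` or `hparams.batch_size`; the values shown are the defaults):

    from lightning.pytorch import Trainer
    from lightning.pytorch.tuner import Tuner

    trainer = Trainer()
    tuner = Tuner(trainer)
    # new_size = tuner.scale_batch_size(
    #     model,
    #     mode="binsearch",
    #     margin=0.05,   # 5% safety buffer, binsearch mode only
    #     max_val=8192,  # upper bound on probed batch sizes
    # )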

tests/tests_pytorch/tuner/test_scale_batch_size.py

Lines changed: 55 additions & 4 deletions
@@ -13,6 +13,7 @@
 # limitations under the License.
 import glob
 import logging
+import math
 import os
 from copy import deepcopy
 from unittest.mock import patch
@@ -69,7 +70,7 @@ def test_scale_batch_size_method_with_model_or_datamodule(tmp_path, model_bs, dm
 
     tuner = Tuner(trainer)
     new_batch_size = tuner.scale_batch_size(model, mode="binsearch", init_val=4, max_trials=2, datamodule=datamodule)
-    assert new_batch_size == 8
+    assert new_batch_size == 7  # applied margin of 5% on 8 -> int(8 * 0.95) = 7
 
     if model_bs is not None:
         assert model.batch_size == new_batch_size
@@ -317,7 +318,12 @@ def test_dataloader_reset_with_scale_batch_size(tmp_path, caplog, scale_method,
     # With our fix, when max_trials is reached, we don't try the doubled batch size, so we get max_trials - 1 messages
     expected_tries = max_trials - 1 if init_batch_size < dataset_len and max_trials > 0 else 0
     assert caplog.text.count("trying batch size") == expected_tries
-    assert caplog.text.count("greater or equal than the length") == int(new_batch_size == dataset_len)
+
+    # Determine the largest batch size that was actually tested.
+    # For "power" this is the final found size; for "binsearch" we applied a 5% margin
+    # when storing the final value, so the largest tested value is the one before applying margin.
+    largest_tested_batch_size = new_batch_size if scale_method == "power" else int(math.ceil(new_batch_size * 100 / 95))
+    assert caplog.text.count("greater or equal than the length") == int(largest_tested_batch_size >= dataset_len)
 
     assert trainer.train_dataloader.batch_size == new_batch_size
     assert trainer.val_dataloaders.batch_size == new_batch_size
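
The `* 100 / 95` factor in the test inverts the default 5% margin, recovering the largest size that was actually probed. For example:

    import math

    # A post-margin size of 7 maps back to a largest-tested size of 8,
    # since int(8 * 0.95) == 7 and int(math.ceil(7 * 100 / 95)) == 8.
    assert int(8 * 0.95) == 7
    assert int(math.ceil(7 * 100 / 95)) == 8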
@@ -453,7 +459,7 @@ def val_dataloader(self):
         tuner.scale_batch_size(model, method="validate")
 
 
-@pytest.mark.parametrize(("scale_method", "expected_batch_size"), [("power", 62), ("binsearch", 100)])
+@pytest.mark.parametrize(("scale_method", "expected_batch_size"), [("power", 62), ("binsearch", 95)])
 @patch("lightning.pytorch.tuner.batch_size_scaling.is_oom_error", return_value=True)
 def test_dataloader_batch_size_updated_on_failure(_, tmp_path, scale_method, expected_batch_size):
     class CustomBatchSizeModel(BatchSizeModel):
@@ -493,6 +499,51 @@ def test_batch_size_finder_callback_val_batches(tmp_path):
     assert trainer.num_val_batches[0] != steps_per_trial
 
 
+@pytest.mark.parametrize("margin", [0.0, 0.1, 0.2])
+def test_scale_batch_size_margin_and_max_val(tmp_path, margin):
+    """Test margin feature for batch size scaling by comparing results with and without margin."""
+    # First, find the batch size without margin
+    model1 = BatchSizeModel(batch_size=2)
+    trainer1 = Trainer(default_root_dir=tmp_path, max_epochs=1, logger=False, enable_checkpointing=False)
+    tuner1 = Tuner(trainer1)
+
+    result_without_margin = tuner1.scale_batch_size(
+        model1, mode="binsearch", max_trials=2, steps_per_trial=1, margin=0.0
+    )
+
+    model2 = BatchSizeModel(batch_size=2)
+    trainer2 = Trainer(default_root_dir=tmp_path, max_epochs=1, logger=False, enable_checkpointing=False)
+    tuner2 = Tuner(trainer2)
+
+    result_with_margin = tuner2.scale_batch_size(
+        model2, mode="binsearch", max_trials=2, steps_per_trial=1, margin=margin
+    )
+
+    assert result_without_margin is not None
+    assert result_with_margin is not None
+
+    if margin == 0.0:
+        assert result_with_margin == result_without_margin
+    else:
+        expected_with_margin = max(1, int(result_without_margin * (1 - margin)))
+        assert result_with_margin == expected_with_margin
+        assert result_with_margin <= result_without_margin
+
+
+@pytest.mark.parametrize("mode", ["power", "binsearch"])
+def test_scale_batch_size_max_val_limit(tmp_path, mode):
+    """Test that max_val limits the batch size for both power and binsearch modes."""
+    model = BatchSizeModel(batch_size=2)
+    trainer = Trainer(default_root_dir=tmp_path, max_epochs=1)
+    tuner = Tuner(trainer)
+
+    max_val = 8  # Set a low max value
+    result = tuner.scale_batch_size(model, mode=mode, max_trials=5, steps_per_trial=1, max_val=max_val)
+
+    assert result is not None
+    assert result <= max_val
+
+
 def test_scale_batch_size_checkpoint_cleanup_on_error(tmp_path):
     """Test that temporary checkpoint files are cleaned up even when an error occurs during batch size scaling."""

@@ -566,7 +617,7 @@ def training_step(self, batch, batch_idx):
         ("max_trials", "mode", "init_val", "expected"),
         [
             (3, "power", 2, 8),
-            (3, "binsearch", 2, 8),
+            (3, "binsearch", 2, 7),  # applied margin of 5% on 8 -> int(8 * 0.95) = 7
             (1, "power", 4, 4),
             (0, "power", 2, 2),
         ],
