From c833a0edf477fdff9a28b6f180a437fb26c6ef56 Mon Sep 17 00:00:00 2001
From: Luc Georges <luc.sydney.georges@gmail.com>
Date: Fri, 7 Nov 2025 18:35:31 +0100
Subject: [PATCH 1/7] debug(ci): run `pwd` to check what we're working with

---
 .github/workflows/benchmark.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index bdbf668e1e30..525cfd343b5b 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -36,6 +36,9 @@ jobs:
         with:
           fetch-depth: 1
 
+      - name: debug
+        run: pwd
+
       - name: Install benchmark script dependencies
         run: python3 -m pip install -r benchmark_v2/requirements.txt kernels
 

From 983204312202213cd83692d7245b38d74585bada Mon Sep 17 00:00:00 2001
From: Luc Georges <luc.sydney.georges@gmail.com>
Date: Fri, 7 Nov 2025 18:45:09 +0100
Subject: [PATCH 2/7] fix(ci): `ls -lR`

---
 .github/workflows/benchmark.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 525cfd343b5b..48980d7f840f 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -37,7 +37,7 @@ jobs:
           fetch-depth: 1
 
       - name: debug
-        run: pwd
+        run: ls -lR
 
       - name: Install benchmark script dependencies
         run: python3 -m pip install -r benchmark_v2/requirements.txt kernels

From 217266e675e3c494a9d306142759e22c58bb8211 Mon Sep 17 00:00:00 2001
From: Luc Georges <luc.sydney.georges@gmail.com>
Date: Fri, 7 Nov 2025 18:47:39 +0100
Subject: [PATCH 3/7] fix(ci): remove working directory which should not be
 there?

---
 .github/workflows/benchmark.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 48980d7f840f..9a4e4fc3504e 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -36,14 +36,10 @@ jobs:
         with:
           fetch-depth: 1
 
-      - name: debug
-        run: ls -lR
-
       - name: Install benchmark script dependencies
         run: python3 -m pip install -r benchmark_v2/requirements.txt kernels
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"
 
       - name: Run benchmark

From 0fc330b126c0f8eadcad2d8b90252bd2a598bc0d Mon Sep 17 00:00:00 2001
From: Luc Georges <luc.sydney.georges@gmail.com>
Date: Mon, 10 Nov 2025 17:36:02 +0100
Subject: [PATCH 4/7] fix(cb): make sure memory is freed when calling `stop`

---
 .../generation/continuous_batching/continuous_api.py        | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py
index e6bbfd9ad771..78f27d72146e 100644
--- a/src/transformers/generation/continuous_batching/continuous_api.py
+++ b/src/transformers/generation/continuous_batching/continuous_api.py
@@ -826,6 +826,12 @@ def stop(self, block: bool = True, timeout: Optional[float] = None) -> None:
         if block:
             self.join(stop_trigger_time, timeout)
 
+        torch.cuda.synchronize()
+        import gc
+
+        gc.collect()
+        torch.cuda.empty_cache()
+
     def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None:
         """Wait for the background thread to finish.
 

From 566955c1f58bc323fa45671ed37045484400f43e Mon Sep 17 00:00:00 2001
From: Luc Georges <luc@huggingface.co>
Date: Mon, 10 Nov 2025 20:27:46 +0000
Subject: [PATCH 5/7] fix(ci): effectively clear cache

---
 .../generation/continuous_batching/cache.py   |  8 ++++
 .../continuous_batching/continuous_api.py     | 37 ++++++++++++++++---
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/src/transformers/generation/continuous_batching/cache.py b/src/transformers/generation/continuous_batching/cache.py
index 45841ee4e197..236a466f6d62 100644
--- a/src/transformers/generation/continuous_batching/cache.py
+++ b/src/transformers/generation/continuous_batching/cache.py
@@ -335,6 +335,14 @@ def update(
         # Return the new KV values
         return key_states_with_cache, value_states_with_cache
 
+    @traced
+    def close(self):
+        self.key_cache.clear()
+        self.value_cache.clear()
+
+        torch._dynamo.reset()
+        torch._dynamo.reset_code_caches()
+
 
 # TODO: rework computation with the groups and their sizes
 class PagedAttentionMemoryHandler:
diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py
index 78f27d72146e..0b11d5d24f4b 100644
--- a/src/transformers/generation/continuous_batching/continuous_api.py
+++ b/src/transformers/generation/continuous_batching/continuous_api.py
@@ -708,6 +708,35 @@ def _sample(self, probs: torch.Tensor, do_sample: bool) -> None:
         tokens = next_tokens.size(1)  # Get seq_len dimension
         self.output_ids[:, :tokens].copy_(next_tokens)
 
+    def close(self):
+        self.cache.close()
+        self.requests_in_batch.clear()
+
+        if self._graphs is not None:
+            self._graphs.clear()
+
+        del self.input_ids
+        del self.position_ids
+        del self.cumulative_seqlens_q
+        del self.logits_indices
+        del self.output_ids
+
+        self.cumulative_seqlens_k.clear()
+
+        if self.attention_mask is not None:
+            self.attention_mask.clear()
+            self.attention_mask = None
+
+        self.write_index_storage.clear()
+        self.read_index_storage.clear()
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+            import gc
+
+            gc.collect()
+            torch.cuda.empty_cache()
+
 
 # Manager Class (User Interface)
 @attach_tracer()
@@ -826,11 +855,9 @@ def stop(self, block: bool = True, timeout: Optional[float] = None) -> None:
         if block:
             self.join(stop_trigger_time, timeout)
 
-        torch.cuda.synchronize()
-        import gc
-
-        gc.collect()
-        torch.cuda.empty_cache()
+        if self.batch_processor is not None:
+            self.batch_processor.close()
+            self.batch_processor = None  # NOTE: this is enough to clear memory after stop, still calling `close()` because it calls torch cache intrinsics
 
     def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None:
         """Wait for the background thread to finish.

From 7923cf66e0f67099168a689fa16f43219dfab349 Mon Sep 17 00:00:00 2001
From: Luc Georges <luc@huggingface.co>
Date: Wed, 12 Nov 2025 14:50:51 +0000
Subject: [PATCH 6/7] fix(ci): lower default `max_memory` fraction from 0.9 to 0.8

---
 benchmark_v2/framework/benchmark_runner.py    |  2 --
 .../generation/continuous_batching/cache.py   | 10 +-----
 .../continuous_batching/continuous_api.py     | 33 +------------------
 3 files changed, 2 insertions(+), 43 deletions(-)

diff --git a/benchmark_v2/framework/benchmark_runner.py b/benchmark_v2/framework/benchmark_runner.py
index 47a60b4e0a88..69fa2b51b576 100644
--- a/benchmark_v2/framework/benchmark_runner.py
+++ b/benchmark_v2/framework/benchmark_runner.py
@@ -117,8 +117,6 @@ def flush_memory():
     # Clear CUDA cache
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
         torch.cuda.synchronize()
     gc.collect()
 
diff --git a/src/transformers/generation/continuous_batching/cache.py b/src/transformers/generation/continuous_batching/cache.py
index 236a466f6d62..eafc7e4f4032 100644
--- a/src/transformers/generation/continuous_batching/cache.py
+++ b/src/transformers/generation/continuous_batching/cache.py
@@ -189,7 +189,7 @@ def __init__(
         num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens(
             num_blocks=getattr(generation_config, "num_blocks", None),
             max_batch_tokens=getattr(generation_config, "max_batch_tokens", None),
-            max_memory_percent=getattr(generation_config, "max_memory", 0.9),
+            max_memory_percent=getattr(generation_config, "max_memory", 0.8),
             cache_dtype=self.dtype,
         )
 
@@ -335,14 +335,6 @@ def update(
         # Return the new KV values
         return key_states_with_cache, value_states_with_cache
 
-    @traced
-    def close(self):
-        self.key_cache.clear()
-        self.value_cache.clear()
-
-        torch._dynamo.reset()
-        torch._dynamo.reset_code_caches()
-
 
 # TODO: rework computation with the groups and their sizes
 class PagedAttentionMemoryHandler:
diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py
index 0b11d5d24f4b..407a66f775d7 100644
--- a/src/transformers/generation/continuous_batching/continuous_api.py
+++ b/src/transformers/generation/continuous_batching/continuous_api.py
@@ -708,35 +708,6 @@ def _sample(self, probs: torch.Tensor, do_sample: bool) -> None:
         tokens = next_tokens.size(1)  # Get seq_len dimension
         self.output_ids[:, :tokens].copy_(next_tokens)
 
-    def close(self):
-        self.cache.close()
-        self.requests_in_batch.clear()
-
-        if self._graphs is not None:
-            self._graphs.clear()
-
-        del self.input_ids
-        del self.position_ids
-        del self.cumulative_seqlens_q
-        del self.logits_indices
-        del self.output_ids
-
-        self.cumulative_seqlens_k.clear()
-
-        if self.attention_mask is not None:
-            self.attention_mask.clear()
-            self.attention_mask = None
-
-        self.write_index_storage.clear()
-        self.read_index_storage.clear()
-
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-            import gc
-
-            gc.collect()
-            torch.cuda.empty_cache()
-
 
 # Manager Class (User Interface)
 @attach_tracer()
@@ -855,9 +826,7 @@ def stop(self, block: bool = True, timeout: Optional[float] = None) -> None:
         if block:
             self.join(stop_trigger_time, timeout)
 
-        if self.batch_processor is not None:
-            self.batch_processor.close()
-            self.batch_processor = None  # NOTE: this is enough to clear memory after stop, still calling `close()` because it calls torch cache intrinsics
+        self.batch_processor = None
 
     def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None:
         """Wait for the background thread to finish.

From afeb1733bc4c47388d0fffee648d161dc4b62c8d Mon Sep 17 00:00:00 2001
From: Luc Georges <luc@huggingface.co>
Date: Wed, 12 Nov 2025 15:36:57 +0000
Subject: [PATCH 7/7] refactor(cb): add fixme note on default safety margin
 value

---
 .../generation/continuous_batching/cache.py          | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/transformers/generation/continuous_batching/cache.py b/src/transformers/generation/continuous_batching/cache.py
index eafc7e4f4032..780da4ce9b15 100644
--- a/src/transformers/generation/continuous_batching/cache.py
+++ b/src/transformers/generation/continuous_batching/cache.py
@@ -189,7 +189,9 @@ def __init__(
         num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens(
             num_blocks=getattr(generation_config, "num_blocks", None),
             max_batch_tokens=getattr(generation_config, "max_batch_tokens", None),
-            max_memory_percent=getattr(generation_config, "max_memory", 0.8),
+            max_memory_percent=getattr(
+                generation_config, "max_memory", 0.8
+            ),  # FIXME: we seem to overcommit memory; the default was lowered from 0.9, which caused OOMs in our benchmarking CI
             cache_dtype=self.dtype,
         )
 
@@ -414,7 +416,7 @@ def infer_num_blocks_and_max_batch_tokens(
         self,
         num_blocks: Optional[int] = None,
         max_batch_tokens: Optional[int] = None,
-        max_memory_percent: float = 0.9,
+        max_memory_percent: float = 0.8,  # FIXME: we seem to overcommit memory; the default was lowered from 0.9, which caused OOMs in our benchmarking CI
         cache_dtype: torch.dtype = torch.float16,
     ) -> tuple[int, int]:
         """Determine optimal number of blocks and maximum number of tokens per batch based on available memory and
@@ -454,7 +456,7 @@ def infer_num_blocks_and_max_batch_tokens(
 
     def compute_num_blocks_and_max_batch_tokens(
         self,
-        max_memory_percent: float = 0.9,
+        max_memory_percent: float,
         cache_dtype: torch.dtype = torch.float16,
         m: float = 0.01,
     ) -> tuple[int, int]:
@@ -503,7 +505,7 @@ def compute_num_blocks_and_max_batch_tokens(
     def compute_max_batch_tokens(
         self,
         num_blocks: int,
-        max_memory_percent: float = 0.9,
+        max_memory_percent: float,
         cache_dtype: torch.dtype = torch.float16,
     ) -> int:
         """Calculate maximum batch tokens M given a fixed number of cache blocks. The formula for M is given by:
@@ -531,7 +533,7 @@ def compute_max_batch_tokens(
     def compute_num_blocks(
         self,
         max_batch_tokens: int,
-        max_memory_percent: float = 0.9,
+        max_memory_percent: float,
         cache_dtype: torch.dtype = torch.float16,
     ) -> int:
         """Calculate number of cache blocks N given a fixed maximum token per token M. The formula for N is given by: