From c833a0edf477fdff9a28b6f180a437fb26c6ef56 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 7 Nov 2025 18:35:31 +0100 Subject: [PATCH 1/7] debug(ci): run `pwd` to check what we're working with --- .github/workflows/benchmark.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index bdbf668e1e30..525cfd343b5b 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,6 +36,9 @@ jobs: with: fetch-depth: 1 + - name: debug + run: pwd + - name: Install benchmark script dependencies run: python3 -m pip install -r benchmark_v2/requirements.txt kernels From 983204312202213cd83692d7245b38d74585bada Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 7 Nov 2025 18:45:09 +0100 Subject: [PATCH 2/7] fix(ci): `ls -lR` --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 525cfd343b5b..48980d7f840f 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -37,7 +37,7 @@ jobs: fetch-depth: 1 - name: debug - run: pwd + run: ls -lR - name: Install benchmark script dependencies run: python3 -m pip install -r benchmark_v2/requirements.txt kernels From 217266e675e3c494a9d306142759e22c58bb8211 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Fri, 7 Nov 2025 18:47:39 +0100 Subject: [PATCH 3/7] fix(ci): remove working directory which should not be there? --- .github/workflows/benchmark.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 48980d7f840f..9a4e4fc3504e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -36,14 +36,10 @@ jobs: with: fetch-depth: 1 - - name: debug - run: ls -lR - - name: Install benchmark script dependencies run: python3 -m pip install -r benchmark_v2/requirements.txt kernels - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" - name: Run benchmark From 0fc330b126c0f8eadcad2d8b90252bd2a598bc0d Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 10 Nov 2025 17:36:02 +0100 Subject: [PATCH 4/7] fix(cb): make sure memory is freed when calling `stop` --- .../generation/continuous_batching/continuous_api.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py index e6bbfd9ad771..78f27d72146e 100644 --- a/src/transformers/generation/continuous_batching/continuous_api.py +++ b/src/transformers/generation/continuous_batching/continuous_api.py @@ -826,6 +826,12 @@ def stop(self, block: bool = True, timeout: Optional[float] = None) -> None: if block: self.join(stop_trigger_time, timeout) + torch.cuda.synchronize() + import gc + + gc.collect() + torch.cuda.empty_cache() + def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None: """Wait for the background thread to finish. From 566955c1f58bc323fa45671ed37045484400f43e Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Mon, 10 Nov 2025 20:27:46 +0000 Subject: [PATCH 5/7] fix(ci): effectively clear cache --- .../generation/continuous_batching/cache.py | 8 ++++ .../continuous_batching/continuous_api.py | 37 ++++++++++++++++--- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/continuous_batching/cache.py b/src/transformers/generation/continuous_batching/cache.py index 45841ee4e197..236a466f6d62 100644 --- a/src/transformers/generation/continuous_batching/cache.py +++ b/src/transformers/generation/continuous_batching/cache.py @@ -335,6 +335,14 @@ def update( # Return the new KV values return key_states_with_cache, value_states_with_cache + @traced + def close(self): + self.key_cache.clear() + self.value_cache.clear() + + torch._dynamo.reset() + torch._dynamo.reset_code_caches() + # TODO: rework computation with the groups and their sizes class PagedAttentionMemoryHandler: diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py index 78f27d72146e..0b11d5d24f4b 100644 --- a/src/transformers/generation/continuous_batching/continuous_api.py +++ b/src/transformers/generation/continuous_batching/continuous_api.py @@ -708,6 +708,35 @@ def _sample(self, probs: torch.Tensor, do_sample: bool) -> None: tokens = next_tokens.size(1) # Get seq_len dimension self.output_ids[:, :tokens].copy_(next_tokens) + def close(self): + self.cache.close() + self.requests_in_batch.clear() + + if self._graphs is not None: + self._graphs.clear() + + del self.input_ids + del self.position_ids + del self.cumulative_seqlens_q + del self.logits_indices + del self.output_ids + + self.cumulative_seqlens_k.clear() + + if self.attention_mask is not None: + self.attention_mask.clear() + self.attention_mask = None + + self.write_index_storage.clear() + self.read_index_storage.clear() + + if torch.cuda.is_available(): + torch.cuda.synchronize() + import gc + + gc.collect() + torch.cuda.empty_cache() + # Manager Class (User Interface) @attach_tracer() @@ -826,11 +855,9 @@ def stop(self, block: bool = True, timeout: Optional[float] = None) -> None: if block: self.join(stop_trigger_time, timeout) - torch.cuda.synchronize() - import gc - - gc.collect() - torch.cuda.empty_cache() + if self.batch_processor is not None: + self.batch_processor.close() + self.batch_processor = None # NOTE: this is enough to clear memory after stop, still calling `close()` because it calls torch cache intrinsics def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None: """Wait for the background thread to finish. From 7923cf66e0f67099168a689fa16f43219dfab349 Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Wed, 12 Nov 2025 14:50:51 +0000 Subject: [PATCH 6/7] fix(ci): reduce memory safety margin --- benchmark_v2/framework/benchmark_runner.py | 2 -- .../generation/continuous_batching/cache.py | 10 +----- .../continuous_batching/continuous_api.py | 33 +------------------ 3 files changed, 2 insertions(+), 43 deletions(-) diff --git a/benchmark_v2/framework/benchmark_runner.py b/benchmark_v2/framework/benchmark_runner.py index 47a60b4e0a88..69fa2b51b576 100644 --- a/benchmark_v2/framework/benchmark_runner.py +++ b/benchmark_v2/framework/benchmark_runner.py @@ -117,8 +117,6 @@ def flush_memory(): # Clear CUDA cache if torch.cuda.is_available(): torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() torch.cuda.synchronize() gc.collect() diff --git a/src/transformers/generation/continuous_batching/cache.py b/src/transformers/generation/continuous_batching/cache.py index 236a466f6d62..eafc7e4f4032 100644 --- a/src/transformers/generation/continuous_batching/cache.py +++ b/src/transformers/generation/continuous_batching/cache.py @@ -189,7 +189,7 @@ def __init__( num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens( num_blocks=getattr(generation_config, "num_blocks", None), max_batch_tokens=getattr(generation_config, "max_batch_tokens", None), - max_memory_percent=getattr(generation_config, "max_memory", 0.9), + max_memory_percent=getattr(generation_config, "max_memory", 0.8), cache_dtype=self.dtype, ) @@ -335,14 +335,6 @@ def update( # Return the new KV values return key_states_with_cache, value_states_with_cache - @traced - def close(self): - self.key_cache.clear() - self.value_cache.clear() - - torch._dynamo.reset() - torch._dynamo.reset_code_caches() - # TODO: rework computation with the groups and their sizes class PagedAttentionMemoryHandler: diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py index 0b11d5d24f4b..407a66f775d7 100644 --- a/src/transformers/generation/continuous_batching/continuous_api.py +++ b/src/transformers/generation/continuous_batching/continuous_api.py @@ -708,35 +708,6 @@ def _sample(self, probs: torch.Tensor, do_sample: bool) -> None: tokens = next_tokens.size(1) # Get seq_len dimension self.output_ids[:, :tokens].copy_(next_tokens) - def close(self): - self.cache.close() - self.requests_in_batch.clear() - - if self._graphs is not None: - self._graphs.clear() - - del self.input_ids - del self.position_ids - del self.cumulative_seqlens_q - del self.logits_indices - del self.output_ids - - self.cumulative_seqlens_k.clear() - - if self.attention_mask is not None: - self.attention_mask.clear() - self.attention_mask = None - - self.write_index_storage.clear() - self.read_index_storage.clear() - - if torch.cuda.is_available(): - torch.cuda.synchronize() - import gc - - gc.collect() - torch.cuda.empty_cache() - # Manager Class (User Interface) @attach_tracer() @@ -855,9 +826,7 @@ def stop(self, block: bool = True, timeout: Optional[float] = None) -> None: if block: self.join(stop_trigger_time, timeout) - if self.batch_processor is not None: - self.batch_processor.close() - self.batch_processor = None # NOTE: this is enough to clear memory after stop, still calling `close()` because it calls torch cache intrinsics + self.batch_processor = None def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None: """Wait for the background thread to finish. From afeb1733bc4c47388d0fffee648d161dc4b62c8d Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Wed, 12 Nov 2025 15:36:57 +0000 Subject: [PATCH 7/7] refactor(cb): add fixme note on default safety margin value --- .../generation/continuous_batching/cache.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/transformers/generation/continuous_batching/cache.py b/src/transformers/generation/continuous_batching/cache.py index eafc7e4f4032..780da4ce9b15 100644 --- a/src/transformers/generation/continuous_batching/cache.py +++ b/src/transformers/generation/continuous_batching/cache.py @@ -189,7 +189,9 @@ def __init__( num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens( num_blocks=getattr(generation_config, "num_blocks", None), max_batch_tokens=getattr(generation_config, "max_batch_tokens", None), - max_memory_percent=getattr(generation_config, "max_memory", 0.8), + max_memory_percent=getattr( + generation_config, "max_memory", 0.8 + ), # FIXME: it seems we overcommit memory, was changed from 0.9 which caused OOMs in our benchmarking CI cache_dtype=self.dtype, ) @@ -414,7 +416,7 @@ def infer_num_blocks_and_max_batch_tokens( self, num_blocks: Optional[int] = None, max_batch_tokens: Optional[int] = None, - max_memory_percent: float = 0.9, + max_memory_percent: float = 0.8, # FIXME: it seems we overcommit memory, was changed from 0.9 which caused OOMs in our benchmarking CI cache_dtype: torch.dtype = torch.float16, ) -> tuple[int, int]: """Determine optimal number of blocks and maximum number of tokens per batch based on available memory and @@ -454,7 +456,7 @@ def infer_num_blocks_and_max_batch_tokens( def compute_num_blocks_and_max_batch_tokens( self, - max_memory_percent: float = 0.9, + max_memory_percent: float, cache_dtype: torch.dtype = torch.float16, m: float = 0.01, ) -> tuple[int, int]: @@ -503,7 +505,7 @@ def compute_num_blocks_and_max_batch_tokens( def compute_max_batch_tokens( self, num_blocks: int, - max_memory_percent: float = 0.9, + max_memory_percent: float, cache_dtype: torch.dtype = torch.float16, ) -> int: """Calculate maximum batch tokens M given a fixed number of cache blocks. The formula for M is given by: @@ -531,7 +533,7 @@ def compute_max_batch_tokens( def compute_num_blocks( self, max_batch_tokens: int, - max_memory_percent: float = 0.9, + max_memory_percent: float, cache_dtype: torch.dtype = torch.float16, ) -> int: """Calculate number of cache blocks N given a fixed maximum token per token M. The formula for N is given by: