Commit 1f2620b

fix(ci): reduce memory safety margin

1 parent 24f3b7e · commit 1f2620b

3 files changed: +2 −43 lines

benchmark_v2/framework/benchmark_runner.py

Lines changed: 0 additions & 2 deletions
@@ -117,8 +117,6 @@ def flush_memory():
     # Clear CUDA cache
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
         torch.cuda.synchronize()
     gc.collect()
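Editor's note: `torch.cuda.reset_max_memory_allocated()` is a deprecated alias for `torch.cuda.reset_peak_memory_stats()`, so the removed pair was redundant as well as unnecessary for freeing cache. One consequence of dropping the reset is that `torch.cuda.max_memory_allocated()` now reports a high-water mark spanning every run since process start. A minimal sketch of how a benchmark could still capture a per-run peak by resetting explicitly; `run_once` is a hypothetical stand-in for one benchmark iteration, not part of this commit:

```python
import torch


def measure_peak_bytes(run_once) -> int:
    # `run_once` is a hypothetical zero-argument callable that executes a
    # single benchmark iteration; it is not part of the diff above.
    torch.cuda.reset_peak_memory_stats()      # start the peak counter from "now"
    run_once()
    torch.cuda.synchronize()                  # wait for all queued kernels to finish
    return torch.cuda.max_memory_allocated()  # bytes at the high-water mark
```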

src/transformers/generation/continuous_batching/cache.py

Lines changed: 1 addition & 9 deletions
@@ -189,7 +189,7 @@ def __init__(
         num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens(
             num_blocks=getattr(generation_config, "num_blocks", None),
             max_batch_tokens=getattr(generation_config, "max_batch_tokens", None),
-            max_memory_percent=getattr(generation_config, "max_memory", 0.9),
+            max_memory_percent=getattr(generation_config, "max_memory", 0.8),
             cache_dtype=self.dtype,
         )
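The changed default lowers the share of device memory the paged KV cache may claim from 90% to 80%, leaving more headroom for activations, CUDA graphs, and allocator fragmentation. A simplified sketch of how such a fraction typically translates into a block count; the real computation lives in `PagedAttentionMemoryHandler.infer_num_blocks_and_max_batch_tokens`, and `bytes_per_block` here is a hypothetical input:

```python
import torch


def infer_num_blocks(max_memory_percent: float, bytes_per_block: int) -> int:
    # Simplified sketch, not the library's actual formula: take the allowed
    # fraction of currently free device memory and divide it into
    # fixed-size KV-cache blocks.
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    budget = int(free_bytes * max_memory_percent)
    return budget // bytes_per_block
```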

@@ -335,14 +335,6 @@ def update(
         # Return the new KV values
         return key_states_with_cache, value_states_with_cache

-    @traced
-    def close(self):
-        self.key_cache.clear()
-        self.value_cache.clear()
-
-        torch._dynamo.reset()
-        torch._dynamo.reset_code_caches()
-

 # TODO: rework computation with the groups and their sizes
 class PagedAttentionMemoryHandler:
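The deleted `close()` emptied the per-layer key/value lists and reset TorchDynamo's compilation caches. Dropping the cache object achieves the first part on its own: once the last Python reference to a CUDA tensor goes away, its storage returns to PyTorch's caching allocator with no explicit cleanup. A minimal sketch (requires a CUDA device); the `torch._dynamo.reset()` calls simply have no counterpart after this commit:

```python
import torch

cache = [torch.empty(1024, 1024, device="cuda") for _ in range(8)]
before = torch.cuda.memory_allocated()

cache = None  # drop the only reference; CPython frees the tensors immediately

after = torch.cuda.memory_allocated()
assert after < before  # the blocks are back in the allocator's free pool
```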

src/transformers/generation/continuous_batching/continuous_api.py

Lines changed: 1 addition & 32 deletions
@@ -708,35 +708,6 @@ def _sample(self, probs: torch.Tensor, do_sample: bool) -> None:
         tokens = next_tokens.size(1)  # Get seq_len dimension
         self.output_ids[:, :tokens].copy_(next_tokens)

-    def close(self):
-        self.cache.close()
-        self.requests_in_batch.clear()
-
-        if self._graphs is not None:
-            self._graphs.clear()
-
-        del self.input_ids
-        del self.position_ids
-        del self.cumulative_seqlens_q
-        del self.logits_indices
-        del self.output_ids
-
-        self.cumulative_seqlens_k.clear()
-
-        if self.attention_mask is not None:
-            self.attention_mask.clear()
-            self.attention_mask = None
-
-        self.write_index_storage.clear()
-        self.read_index_storage.clear()
-
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-            import gc
-
-            gc.collect()
-            torch.cuda.empty_cache()
-

 # Manager Class (User Interface)
 @attach_tracer()
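Most of this deleted method released memory that reference counting reclaims anyway once `stop()` drops the processor (see the next hunk). Note also that `del self.attr` only removes one reference; it frees nothing while another reference is alive. A small sketch with a hypothetical `Holder` class (requires a CUDA device):

```python
import torch


class Holder:
    def __init__(self):
        self.buf = torch.empty(1 << 20, device="cuda")  # ~4 MB of float32


h = Holder()
alias = h.buf  # a second reference keeps the storage alive
del h.buf      # removes the attribute, frees nothing yet
assert torch.cuda.memory_allocated() > 0

alias = None   # last reference gone; storage returns to the allocator
assert torch.cuda.memory_allocated() == 0
```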
@@ -855,9 +826,7 @@ def stop(self, block: bool = True, timeout: Optional[float] = None) -> None:
         if block:
             self.join(stop_trigger_time, timeout)

-        if self.batch_processor is not None:
-            self.batch_processor.close()
-            self.batch_processor = None  # NOTE: this is enough to clear memory after stop, still calling `close()` because it calls torch cache intrinsics
+        self.batch_processor = None

     def join(self, stop_trigger_time: float, timeout: Optional[float] = None) -> None:
         """Wait for the background thread to finish.
