Commit 33c60a5

[T5Gemma] Fix cross attention cache (#41890)
* fix
* add test
* style
* added comment
1 parent fa22b56 commit 33c60a5

File tree

3 files changed: +22 -2 lines changed

src/transformers/models/t5gemma/modeling_t5gemma.py

Lines changed: 3 additions & 1 deletion

@@ -797,7 +797,9 @@ def forward(
             inputs_embeds = self.embed_tokens(input_ids)

         if not self.training and use_cache and past_key_values is None:
-            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+            # We do not pass the config to the cross attn cache to avoid initializing SWA
+            # --> we use full attention between our cross attentions
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache())
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
             cache_position = torch.arange(
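
Why the second DynamicCache no longer receives the config: a config-aware DynamicCache sets up sliding-window attention (SWA) handling for the layers the config marks as sliding, which is what the decoder's self-attention cache needs. The cross-attention cache instead holds the projected encoder states, which every decoder layer attends to in full, so it must stay an unbounded full-attention cache. Below is a minimal sketch of the construction, assuming a default T5GemmaConfig whose decoder sub-config carries the sliding-window settings; the names are illustrative and not taken from the diff.

from transformers import DynamicCache, EncoderDecoderCache, T5GemmaConfig

# Illustrative stand-in for `self.config` inside the decoder module.
decoder_config = T5GemmaConfig().decoder

# Self-attention cache: config-aware, so layers flagged as sliding-window get SWA handling.
self_attention_cache = DynamicCache(config=decoder_config)

# Cross-attention cache: intentionally config-free, since cross attention always attends
# to the full encoder output and its keys/values must never be windowed.
cross_attention_cache = DynamicCache()

past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache)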

src/transformers/models/t5gemma/modular_t5gemma.py

Lines changed: 3 additions & 1 deletion

@@ -835,7 +835,9 @@ def forward(
             inputs_embeds = self.embed_tokens(input_ids)

         if not self.training and use_cache and past_key_values is None:
-            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+            # We do not pass the config to the cross attn cache to avoid initializing SWA
+            # --> we use full attention between our cross attentions
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache())
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
             cache_position = torch.arange(

tests/models/t5gemma/test_modeling_t5gemma.py

Lines changed: 16 additions & 0 deletions

@@ -19,11 +19,14 @@

 import pytest
 from parameterized import parameterized
+from pytest import mark

 from transformers import T5GemmaConfig, T5GemmaModuleConfig, is_torch_available
 from transformers.testing_utils import (
+    require_flash_attn,
     require_torch,
     require_torch_accelerator,
+    require_torch_gpu,
     torch_device,
 )

@@ -1267,6 +1270,19 @@ def test_flex_attention_with_grads(self):
         # If this does not raise an error, the test passes (see https://github.com/huggingface/transformers/pull/35605)
         _ = model(**dummy_inputs)

+    @require_flash_attn
+    @require_torch_gpu
+    @mark.flash_attn_test
+    def test_generate_beyond_sliding_window_with_flash_attn(self):
+        config, input_ids, _, attention_mask, _, _ = self.model_tester.prepare_config_and_inputs()
+        config.decoder.sliding_window = 2  # arbitrary but less than seq_len
+
+        model = self.model_tester.causal_lm_class(config=config).to(dtype=torch.float16, device=torch_device).eval()
+        model.set_attn_implementation("flash_attention_2")
+
+        # Only generate beyond prefill, we don't care about the output as it only checks for crashes
+        _ = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=2, use_cache=True)
+

 class T5GemmaEncoderOnlyModelTester:
     config_class = T5GemmaConfig
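
What the new test exercises: with sliding_window forced to 2, generating even two tokens beyond the prefill pushes the decoder's self-attention cache past the window, and before this fix the config-initialized cross-attention cache was also given sliding-window treatment, which made this generation path fail under flash attention. A hedged sketch for running only this test from the root of a transformers checkout (it needs a CUDA GPU and flash-attn installed, matching the require_* decorators); the -k expression simply matches the new test's name.

import pytest

# Run just the new regression test; assumes the current working directory is the
# root of a transformers checkout with flash-attn and a CUDA GPU available.
pytest.main(
    [
        "tests/models/t5gemma/test_modeling_t5gemma.py",
        "-k", "test_generate_beyond_sliding_window_with_flash_attn",
    ]
)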
