@@ -77,6 +77,9 @@ def __init__(
         self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
 
+        # Force this boolean to be on CPU
+        self.is_cache_initialized = torch.tensor(False, device="cpu")
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -138,19 +141,17 @@ def recompute_kv(
         cached_keys = past_key_values.layers[self.layer_idx].keys
         cached_values = past_key_values.layers[self.layer_idx].values
 
-        # Tensor predicate: True if any element is non-zero
-        # Result is a 0-dim bool tensor suitable for torch.cond
-        cache_is_initialized = (cached_keys != 0).any()
-
         # Use torch.cond to select branch in a traceable way.
         # All operands must be (nested) tensors or simple Python values.
         key_states, value_states = torch.cond(
-            cache_is_initialized,
+            self.is_cache_initialized,
             use_cached_kv,
             recompute_kv,
             operands=(cached_keys, cached_values, key_value_states),
         )
 
+        self.is_cache_initialized = torch.tensor(True, device="cpu")
+
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
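For readers unfamiliar with `torch.cond`, here is a minimal standalone sketch of the pattern this patch adopts: a 0-dim CPU boolean tensor selects the branch, and both branch functions take identical operands and return tensors of matching shape and dtype, as `torch.cond` requires. The names `use_cached_kv`, `recompute_kv`, and `is_cache_initialized` mirror the diff, but the bodies and shapes below are illustrative stand-ins, not the PR's actual implementation:

```python
import torch

def use_cached_kv(cached_keys, cached_values, fresh_kv):
    # True branch: reuse the cached tensors. clone() avoids the
    # input/output aliasing that torch.cond branches disallow.
    return cached_keys.clone(), cached_values.clone()

def recompute_kv(cached_keys, cached_values, fresh_kv):
    # False branch: fall back to the freshly projected tensor
    # (a stand-in for the real key/value projections).
    return fresh_kv.clone(), fresh_kv.clone()

# A 0-dim boolean tensor kept on CPU serves as the predicate, so the
# branch selection remains a tensor operation that torch.export can trace.
is_cache_initialized = torch.tensor(False, device="cpu")

cached_k = torch.zeros(1, 4, 8)
cached_v = torch.zeros(1, 4, 8)
fresh = torch.randn(1, 4, 8)

keys, values = torch.cond(
    is_cache_initialized,
    use_cached_kv,
    recompute_kv,
    operands=(cached_k, cached_v, fresh),
)
# The first call takes the recompute branch; flipping the flag afterwards
# mirrors what the patch does at the end of forward().
is_cache_initialized = torch.tensor(True, device="cpu")
```

Keeping the flag on CPU rather than on the model's device likely avoids a device-to-host synchronization each time the predicate is read in eager mode; that rationale is an inference from the diff's "Force this boolean to be on CPU" comment, not something the patch states explicitly.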