@@ -508,17 +508,15 @@ def modify(
         # https://github.com/huggingface/transformers/blob/v4.56-release/src/transformers/trainer.py#L566
         self.is_quantized = False

-        self.num_ttt_steps = 3  # NOTE: (hg) hardcoded for now. Might add to config later.
-        self._cached_attn_blk_masks = []
+        self.num_ttt_steps = 4  # NOTE: (hg) hardcoded for now. Might add to config later.
+        self._cached_attn_blk_masks = {}

     def _get_ttt_attention_mask(self, seq_length, ttt_step):
         # Compile and cache flex attention masks on the first call.
-        if ttt_step >= len(self._cached_attn_blk_masks):
-            self._cached_attn_blk_masks.append(
-                self._compute_ttt_attention_mask(seq_length, ttt_step)
+        if ttt_step not in self._cached_attn_blk_masks:
+            self._cached_attn_blk_masks.update(
+                {ttt_step: self._compute_ttt_attention_mask(seq_length, ttt_step)}
             )
-
-        # return cached flex attention mask
         return self._cached_attn_blk_masks[ttt_step]

     def _prepare_decoder_attention_mask(
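
This hunk switches the mask cache from an ordered list to a dict keyed by `ttt_step`, so masks can be built lazily in any step order (a list-based cache mis-indexes if a later step is requested first). A minimal standalone sketch of that memoization pattern, with a hypothetical `MaskCache` name standing in for the module:

```python
class MaskCache:
    """Lazily compute and memoize one attention mask per TTT step."""

    def __init__(self, compute_fn):
        self._compute_fn = compute_fn  # e.g. _compute_ttt_attention_mask
        self._cache = {}

    def get(self, seq_length, ttt_step):
        # Build the mask on the first request for this step; reuse it afterwards.
        # Keyed only by ttt_step (mirroring the diff), so seq_length is assumed fixed.
        if ttt_step not in self._cache:
            self._cache[ttt_step] = self._compute_fn(seq_length, ttt_step)
        return self._cache[ttt_step]
```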
@@ -600,44 +598,26 @@ def _get_eagle_module_inputs(

     def _compute_ttt_attention_mask(self, seq_length, ttt_step) -> BlockMask | torch.Tensor:
         """Return a TTT attention mask of type BlockMask or Tensor, depending on the eagle attn impl."""
-        if ttt_step == 0:
-
-            def msk(b, h, q_idx, kv_idx):
-                # symbolic attention mask of shape [seq_len, 2 * seq_len] for TTT step 0
-                return (kv_idx <= (q_idx - 1)) | (kv_idx == q_idx + seq_length)

-        elif ttt_step == 1:
-
-            def msk(b, h, q_idx, kv_idx):
-                # attention mask of shape [seq_len, 3 * seq_len] for TTT step 1
-                return (
-                    (kv_idx <= (q_idx - 2))
-                    | ((kv_idx == q_idx + seq_length - 1) & (kv_idx >= seq_length))
-                    | ((kv_idx == q_idx + 2 * seq_length) & (kv_idx >= seq_length * 2))
-                )
-        elif ttt_step == 2:
-
-            def msk(b, h, q_idx, kv_idx):
-                # attention mask of shape [seq_len, 4 * seq_len] for TTT step 2
-                return (
-                    (kv_idx <= (q_idx - 3))
-                    | ((kv_idx == q_idx + seq_length - 2) & (kv_idx >= seq_length))
-                    | ((kv_idx == q_idx + 2 * seq_length - 1) & (kv_idx >= seq_length * 2))
-                    | ((kv_idx == q_idx + 3 * seq_length) & (kv_idx >= seq_length * 3))
+        def msk_func(b, h, q_idx, kv_idx):
+            mask = kv_idx <= (q_idx - ttt_step)
+            for i in range(1, ttt_step + 1):
+                mask_block_i = (kv_idx == q_idx + i * seq_length - (ttt_step - i)) & (
+                    kv_idx >= seq_length * i
                 )
-        else:
-            raise ValueError(f"EAGLE TTT step {ttt_step} is not supported")
+                mask = mask | mask_block_i
+            return mask

         dtypemin = torch.finfo(self._base_llm_config.dtype).min
         q_len = seq_length
-        kv_len = seq_length * (2 + ttt_step)
+        kv_len = seq_length * (1 + ttt_step)
         if self.eagle_module.config._attn_implementation == "flex_attention":
             # Return block mask for flex attention
-            block_mask = create_block_mask(msk, B=None, H=None, Q_LEN=q_len, KV_LEN=kv_len)
+            block_mask = create_block_mask(msk_func, B=None, H=None, Q_LEN=q_len, KV_LEN=kv_len)
             return block_mask
         else:
             # Return tensor mask for non-flex attention
-            tensor_mask = msk(
+            tensor_mask = msk_func(
                 None,
                 None,
                 torch.arange(q_len).view(1, 1, q_len, 1),
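
For reference, a small self-contained check of the generalized mask (illustrative only, not part of the commit): `msk_func` builds a causal block shifted by `ttt_step` plus one diagonal block per earlier step, over a KV length of `(1 + ttt_step) * seq_length`, so step `t` reproduces the mask that was previously hardcoded for step `t - 1`. The same callable is what gets handed to `create_block_mask` when flex attention is enabled.

```python
import torch

def msk_func(seq_length, ttt_step, q_idx, kv_idx):
    # Causal block shifted back by ttt_step over the original tokens...
    mask = kv_idx <= (q_idx - ttt_step)
    # ...plus one diagonal block per earlier TTT step's drafted tokens.
    for i in range(1, ttt_step + 1):
        mask = mask | (
            (kv_idx == q_idx + i * seq_length - (ttt_step - i)) & (kv_idx >= seq_length * i)
        )
    return mask

seq_length, ttt_step = 4, 1
q = torch.arange(seq_length).view(-1, 1)
kv = torch.arange(seq_length * (1 + ttt_step)).view(1, -1)
print(msk_func(seq_length, ttt_step, q, kv).int())
# tensor([[0, 0, 0, 0, 1, 0, 0, 0],
#         [1, 0, 0, 0, 0, 1, 0, 0],
#         [1, 1, 0, 0, 0, 0, 1, 0],
#         [1, 1, 1, 0, 0, 0, 0, 1]], dtype=torch.int32)
```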
@@ -847,69 +827,54 @@ def forward(
         inputs_embeds = self._llm_or_vlm_embedding(eagle_input_ids, kwargs)

         position_embeddings = self.eagle_rotary_emb(eagle_input_hidden_states, position_ids)
-
-        # Then, we run eagle forward
-        _, eagle_prenorm_h, eagle_logits, eagle_cache = self._eagle_forward(
-            eagle_input_hidden_states,
-            inputs_embeds,
-            attention_mask_0,
-            position_ids,
-            position_embeddings,
-            eagle_cache,
-        )
-
         past_key_values.eagle_cache = eagle_cache

-        # Compute loss on the eagle modules
-        classification_loss, acc = self._eagle_loss(
-            base_model_logits[:, 1:],
-            eagle_logits[:, :-1],
-            loss_mask[:, 1:],
-        )
-        eagle_loss = classification_loss
-        train_accs.append(acc)
-
         # ====Perform training-time testing (TTT) with num_ttt_steps eagle forward passes====
-        if self.training:
-            for ttt_step in range(self.num_ttt_steps):
-                eagle_input_hidden_states = torch.cat(
+        for ttt_step in range(self.num_ttt_steps):
+            attention_mask = (
+                attention_mask_0
+                if ttt_step == 0
+                else self._get_ttt_attention_mask(seq_length, ttt_step)
+            )
+            _, eagle_input_hidden_states, eagle_logits, eagle_cache = self._eagle_forward(
+                eagle_input_hidden_states,
+                inputs_embeds,
+                attention_mask,
+                position_ids,
+                position_embeddings,
+                eagle_cache,
+            )
+            eagle_input_hidden_states = torch.cat(
+                (
+                    torch.zeros(
+                        (b, 1, h),
+                        dtype=eagle_input_hidden_states.dtype,
+                        device=eagle_input_hidden_states.device,
+                    ),
+                    eagle_input_hidden_states[:, :-1, :],
+                ),
+                dim=1,
+            )
+            classification_loss, acc = self._eagle_loss(
+                # the base model predicts +1 tok, while eagle predicts +2,
+                # so we shift the base model outputs relative to the eagle outputs
+                base_model_logits[:, 1:],
+                eagle_logits[:, :-1],
+                # additionally, we mask the first n tok of eagle outputs at the nth TTT step
+                torch.cat(
                     (
-                        torch.zeros(
-                            (b, 1, h),
-                            dtype=eagle_input_hidden_states.dtype,
-                            device=eagle_input_hidden_states.device,
-                        ),
-                        eagle_prenorm_h[:, :-1, :],
+                        torch.zeros(b, ttt_step, dtype=loss_mask.dtype, device=loss_mask.device),
+                        loss_mask[:, 1 + ttt_step :],
                     ),
                     dim=1,
-                )
-                attention_mask = self._get_ttt_attention_mask(seq_length, ttt_step)
-                _, eagle_prenorm_h, eagle_logits, eagle_cache = self._eagle_forward(
-                    eagle_input_hidden_states,
-                    inputs_embeds,
-                    attention_mask,
-                    position_ids,
-                    position_embeddings,
-                    eagle_cache,
-                )
-                classification_loss, acc = self._eagle_loss(
-                    # base model predict +1 tok, while eagle predict +2
-                    # so we shift base model outputs compared to eagle outputs
-                    base_model_logits[:, 1:],
-                    eagle_logits[:, :-1],
-                    # additionally, we mask the first n tok of eagle outputs at nth TTT step
-                    torch.cat(
-                        (
-                            torch.zeros(
-                                b, 1 + ttt_step, dtype=loss_mask.dtype, device=loss_mask.device
-                            ),
-                            loss_mask[:, 2 + ttt_step :],
-                        ),
-                        dim=1,
-                    ),
-                )
-                eagle_loss += classification_loss
-                train_accs.append(acc)
+                ),
+            )
+            eagle_loss = (
+                classification_loss if eagle_loss is None else eagle_loss + classification_loss
+            )
+            train_accs.append(acc)
+            if not self.training:
+                break
         # Finally, we merge base model loss and eagle loss; raise an error if both are None
         if base_model_loss is not None and eagle_loss is not None:
             loss = base_model_loss + eagle_loss
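
To make the per-step loss masking concrete, here is a small illustrative snippet (not part of the commit) showing the mask produced by the inner `torch.cat` for each TTT step: every mask has length `seq_length - 1`, matching the shifted `base_model_logits[:, 1:]` and `eagle_logits[:, :-1]`, and step n additionally zeroes out its first n positions, per the comment about masking the first n tokens at the nth step.

```python
import torch

b, seq_length = 1, 6
loss_mask = torch.ones(b, seq_length, dtype=torch.long)

for ttt_step in range(4):  # num_ttt_steps is hardcoded to 4 in this commit
    step_mask = torch.cat(
        (
            torch.zeros(b, ttt_step, dtype=loss_mask.dtype),
            loss_mask[:, 1 + ttt_step :],
        ),
        dim=1,
    )
    print(ttt_step, step_mask.tolist())
# 0 [[1, 1, 1, 1, 1]]
# 1 [[0, 1, 1, 1, 1]]
# 2 [[0, 0, 1, 1, 1]]
# 3 [[0, 0, 0, 1, 1]]
```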