2 files changed (+3, -3 lines) under tensorrt_llm/_torch/attention_backend.

In the first file, the DSA (sparse MLA) metadata subclass gains the field and starts recording it in update_spec_dec_param:

```diff
@@ -290,6 +290,8 @@ class DSAtrtllmAttentionMetadata(TrtllmAttentionMetadata):
     indexer_max_chunk_size: int
     # Topk for sparse MLA
     sparse_mla_topk: int
+    # max number of draft tokens
+    max_draft_tokens: int = 0
 
     def __init__(self, *args, **kwargs):
         self.num_sms = tensorrt_llm.deep_gemm.get_num_sms()
@@ -485,6 +487,7 @@ def update_spec_dec_param(
             is_spec_dec_tree,
             is_spec_dec_dynamic_tree,
             max_draft_tokens, spec_decoding_tensor)
+        self.max_draft_tokens = max_draft_tokens
         init_shape = self.kv_lens_expanded_host.shape[0]
         if self.max_num_sequences * (1 + self.max_draft_tokens) != init_shape:
             capture_graph = torch.cuda.is_current_stream_capturing()
```
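The added assignment matters because the very next check sizes kv_lens_expanded_host as max_num_sequences * (1 + max_draft_tokens): one slot per sequence for its target token plus one per speculative draft token, so the value must be recorded before the comparison. Below is a minimal, self-contained sketch of that resize logic; the _SpecDecMetadataSketch class and its plain CPU tensor are illustrative assumptions, not the actual TensorRT-LLM implementation.

```python
import torch


class _SpecDecMetadataSketch:
    """Illustrative only: shows why kv_lens_expanded_host is sized
    max_num_sequences * (1 + max_draft_tokens), i.e. one slot per sequence
    for the target token plus one slot per speculative draft token."""

    def __init__(self, max_num_sequences: int):
        self.max_num_sequences = max_num_sequences
        self.max_draft_tokens = 0
        # Start sized for plain decoding (no draft tokens yet).
        self.kv_lens_expanded_host = torch.zeros(max_num_sequences,
                                                 dtype=torch.int32)

    def update_spec_dec_param(self, max_draft_tokens: int) -> None:
        # Mirrors the added line in the hunk: record the new budget first,
        # so the size check below compares against the current value.
        self.max_draft_tokens = max_draft_tokens
        init_shape = self.kv_lens_expanded_host.shape[0]
        if self.max_num_sequences * (1 + self.max_draft_tokens) != init_shape:
            # Reallocate the expanded buffer for the new token budget.
            self.kv_lens_expanded_host = torch.zeros(
                self.max_num_sequences * (1 + self.max_draft_tokens),
                dtype=torch.int32)


md = _SpecDecMetadataSketch(max_num_sequences=4)
md.update_spec_dec_param(max_draft_tokens=3)
assert md.kv_lens_expanded_host.shape[0] == 4 * (1 + 3)
```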
In the second file, the base class TrtllmAttentionMetadata gives up both the field and the assignment:

```diff
@@ -597,8 +597,6 @@ class TrtllmAttentionMetadata(AttentionMetadata):
     is_spec_decoding_enabled: bool = False
     # use_spec_decoding determines if the attention layer should be run in spec-dec mode at the specific step / layer.
     use_spec_decoding: bool = False
-    # max number of draft tokens
-    max_draft_tokens: int = 0
 
     # if spec-dec tree is a tree or a chain (linear tree)
    is_spec_dec_tree: bool = False
@@ -1069,7 +1067,6 @@ def update_spec_dec_param(
         max_draft_tokens,
         spec_decoding_tensor: Optional['SpecDecodingTensor'] = None,
     ):
-        self.max_draft_tokens = max_draft_tokens
         if spec_decoding_tensor is not None:
             spec_decoding_position_offsets = spec_decoding_tensor.position_offsets
             spec_decoding_packed_mask = spec_decoding_tensor.packed_mask
```