[TRTLLM-7723][feat] sampling using FlashInfer.sampling (#8581)

ixlmar · web-flow · commit 979b3ae9ce27 · 2025-11-11T03:21:19.000-08:00
Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -823,7 +823,8 @@ def create_py_executor_instance(
 def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
                               max_batch_size: int,
                               speculative_config: SpeculativeConfig,
-                              max_beam_width: int):
+                              max_beam_width: int,
+                              disable_flash_infer_sampling: bool):
     max_num_sequences = max_batch_size * mapping.pp_size
     max_draft_len = (0 if speculative_config is None else
                      speculative_config.max_draft_len)
@@ -836,20 +837,32 @@ def create_torch_sampler_args(mapping: Mapping, *, max_seq_len: int,
         max_total_draft_tokens=max_total_draft_tokens,
         max_num_sequences=max_num_sequences,
         max_beam_width=max_beam_width,
+        disable_flash_infer_sampling=disable_flash_infer_sampling,
     )
 
 
 def instantiate_sampler(
-        engine: PyTorchModelEngine, llm_args: TorchLlmArgs, mapping: Mapping,
-        max_batch_size: int, max_beam_width: int, max_seq_len: int,
-        mm_encoder_only: bool, speculative_config: SpeculativeConfig,
-        decoding_config: trtllm.DecodingConfig, kv_cache_config: KvCacheConfig):
+    engine: PyTorchModelEngine,
+    llm_args: TorchLlmArgs,
+    mapping: Mapping,
+    *,
+    max_batch_size: int,
+    max_beam_width: int,
+    max_seq_len: int,
+    mm_encoder_only: bool,
+    speculative_config: SpeculativeConfig,
+    decoding_config: trtllm.DecodingConfig,
+    kv_cache_config: KvCacheConfig,
+    disable_flash_infer_sampling: bool,
+):
     sampler_args = create_torch_sampler_args(
         mapping,
         max_seq_len=engine.max_seq_len,
         max_batch_size=max_batch_size,
         speculative_config=speculative_config,
-        max_beam_width=max_beam_width)
+        max_beam_width=max_beam_width,
+        disable_flash_infer_sampling=disable_flash_infer_sampling,
+    )
     decoding_mode = get_decoding_mode(decoding_config=decoding_config,
                                       max_beam_width=max_beam_width)
     if mapping.cp_config.get('cp_type') == CpType.STAR:
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -493,16 +493,19 @@ def drafting_loop_wrapper(model):
                     )
 
     with allocation_scope(ExecutorMemoryType.SAMPLER, RestoreMode.PINNED):
-        sampler = instantiate_sampler(model_engine,
-                                      llm_args,
-                                      mapping,
-                                      max_batch_size=max_batch_size,
-                                      max_beam_width=max_beam_width,
-                                      max_seq_len=max_seq_len,
-                                      mm_encoder_only=mm_encoder_only,
-                                      speculative_config=spec_config,
-                                      decoding_config=decoding_config,
-                                      kv_cache_config=kv_cache_config)
+        sampler = instantiate_sampler(
+            model_engine,
+            llm_args,
+            mapping,
+            max_batch_size=max_batch_size,
+            max_beam_width=max_beam_width,
+            max_seq_len=max_seq_len,
+            mm_encoder_only=mm_encoder_only,
+            speculative_config=spec_config,
+            decoding_config=decoding_config,
+            kv_cache_config=kv_cache_config,
+            disable_flash_infer_sampling=llm_args._disable_flash_infer_sampling,
+        )
         logger.info(f"Using Sampler: {type(sampler).__name__}")
 
     if kv_connector_config is not None:
diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -20,7 +20,7 @@
 from dataclasses import dataclass
 from functools import cached_property
 from itertools import repeat
-from typing import Any, Callable, List, Optional, TypeVar, cast
+from typing import Any, Callable, List, Optional, Type, TypeVar, cast
 
 import numpy as np
 import torch
@@ -55,13 +55,15 @@
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.sampling_params import SamplingParams
 
+from ..flashinfer_utils import IS_FLASHINFER_AVAILABLE
 from ..speculative.spec_tree_manager import SpecTreeManager
 from .finish_reason import FinishedState
 from .llm_request import LlmRequest, LlmRequestState, get_draft_token_length
 from .resource_manager import ResourceManager, ResourceManagerType
 from .sampling_utils import (
     GREEDY,
     GenericStrategyKeyType,
+    GroupedStrategySampler,
     SimpleGroupedStrategySampler,
     Strategy,
     UtilsSamplingParams,
@@ -268,7 +270,7 @@ def _request_strategy(request: LlmRequest, *, vocab_size: int) -> Strategy:
 def _group_requests_by_strategy_key(
     requests: Iterable[LlmRequest],
     *,
-    strategy_to_key: Callable[[Strategy], GenericStrategyKeyType],
+    strategy_to_key: Callable[[Strategy, bool], GenericStrategyKeyType],
     pin_memory: bool = False,
     vocab_size: int,
 ) -> dict[tuple[GenericStrategyKeyType, bool], tuple[torch.Tensor, List[Strategy]]]:
@@ -278,8 +280,8 @@ def _group_requests_by_strategy_key(
     )
     for req_index, req in enumerate(requests):
         strategy = _request_strategy(req, vocab_size=vocab_size)
-        strategy_key = strategy_to_key(strategy)
         speculation_needs_probs = req.py_draft_logits is not None and strategy is not GREEDY
+        strategy_key = strategy_to_key(strategy, speculation_needs_probs)
         group_dict_entry = group_dict[(strategy_key, speculation_needs_probs)]
         group_dict_entry[0].append(req_index)
         group_dict_entry[1].append(strategy)
@@ -608,6 +610,7 @@ class Args:
         max_num_sequences: int
         max_beam_width: int
         max_total_draft_tokens: int
+        disable_flash_infer_sampling: bool = False
 
     def __init__(self, args: Args):
         self.max_seq_len = args.max_seq_len
@@ -642,6 +645,14 @@ def __init__(self, args: Args):
                 ]  # `in FinishReason` clashes with PyBind11: `TypeError: 'pybind11_type' object is not iterable`
             }
 
+        self._grouped_sampler_cls: Type[GroupedStrategySampler]
+        if IS_FLASHINFER_AVAILABLE and not args.disable_flash_infer_sampling:
+            from .sampling_utils_flashinfer import FlashInferGroupedStrategySampler
+
+            self._grouped_sampler_cls = FlashInferGroupedStrategySampler
+        else:
+            self._grouped_sampler_cls = SimpleGroupedStrategySampler
+
         # Initialize seed for multi-GPU consistency
         self._global_seed = 42
         self._generator = None
@@ -1251,7 +1262,7 @@ def _sample_batched_by_strategy(
             requests,
             pin_memory=True,
             vocab_size=logits_cuda.size(1),
-            strategy_to_key=SimpleGroupedStrategySampler.strategy_grouping_key,
+            strategy_to_key=self._grouped_sampler_cls.strategy_grouping_key,
         )
         generator_cuda = self.get_generator(cuda_device)
 
@@ -1308,7 +1319,7 @@ def _sample_batched_by_strategy(
                 for _ in range(steps)
             ]
             group_next_tokens_cuda, group_softmax_cuda = (
-                SimpleGroupedStrategySampler.sample_grouped_strategies(
+                self._grouped_sampler_cls.sample_grouped_strategies(
                     strategy_key,
                     group_strategies_per_step,
                     group_logits_cuda,
diff --git a/tensorrt_llm/_torch/pyexecutor/sampling_utils.py b/tensorrt_llm/_torch/pyexecutor/sampling_utils.py
@@ -33,13 +33,13 @@
     from typing_extensions import override
 
 
-TemperatureOnly = tuple[Literal["temperature"], float]
-TopK = tuple[Literal["top_k"], int, float]
-TopP = tuple[Literal["top_p"], float, float]
-TopKTopP = tuple[Literal["top_k_top_p"], int, float, float]
-Greedy = tuple[Literal["greedy"], None]
+TemperatureOnly: TypeAlias = tuple[Literal["temperature"], float]
+TopK: TypeAlias = tuple[Literal["top_k"], int, float]
+TopP: TypeAlias = tuple[Literal["top_p"], float, float]
+TopKTopP: TypeAlias = tuple[Literal["top_k_top_p"], int, float, float]
+Greedy: TypeAlias = tuple[Literal["greedy"], None]
 GREEDY: Greedy = ("greedy", None)
-Strategy = TopK | TopP | Greedy | TopKTopP | TemperatureOnly
+Strategy: TypeAlias = TopK | TopP | Greedy | TopKTopP | TemperatureOnly
 
 
 @dataclass(frozen=True, kw_only=True)
@@ -258,7 +258,10 @@ def sample(
     match strategy:
         case ("top_k", top_k, temperature):
             tokens, softmax = top_k_sampling_batch(
-                logits, top_k=top_k, temperature=temperature, generator=generator
+                logits,
+                top_k=top_k,
+                temperature=temperature,
+                generator=generator,
             )
         case ("top_p", top_p, temperature):
             tokens, softmax = top_p_sampling_batch(
@@ -292,7 +295,7 @@ def sample(
 class GroupedStrategySampler(Generic[GenericStrategyKeyType], abc.ABC):
     @staticmethod
     @abc.abstractmethod
-    def strategy_grouping_key(strategy: Strategy) -> GenericStrategyKeyType:
+    def strategy_grouping_key(strategy: Strategy, return_probs: bool) -> GenericStrategyKeyType:
         raise NotImplementedError
 
     @staticmethod
@@ -314,7 +317,7 @@ class SimpleGroupedStrategySampler(GroupedStrategySampler[Strategy]):
 
     @override
     @staticmethod
-    def strategy_grouping_key(strategy: Strategy) -> STRATEGY_KEY_TYPE:
+    def strategy_grouping_key(strategy: Strategy, return_probs: bool) -> STRATEGY_KEY_TYPE:
         return strategy
 
     @override
diff --git a/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py b/tensorrt_llm/_torch/pyexecutor/sampling_utils_flashinfer.py
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py