
Commit bf04088

feat(trtllm): support guided decoding
TGI already accepts a grammar for guided decoding through its HTTP API; however, this feature has been disabled for the trtllm backend. To enable it:

- Replace the hard-coded disabling of grammar support with the `disable_grammar_support` arg already present in the v3 backend.
- Pass tokenizer information when constructing the trtllm Executor and enable guided decoding by default.
- Pass the validated grammar type and value from requests to the Executor.
1 parent 34307a4 commit bf04088
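
Taken together, the change wires guided decoding through two layers of the TensorRT-LLM executor API: tokenizer material is handed to the executor once at startup, and each request optionally carries a guide. Below is a minimal, self-contained sketch of that flow, not code from this commit; the engine path, vocabulary, tokenizer JSON, token ids and schema are placeholders.

#include <cstdint>
#include <string>
#include <vector>

#include <tensorrt_llm/executor/executor.h>

namespace tle = tensorrt_llm::executor;

// Startup: build an executor whose guided-decoding (xgrammar) backend knows the
// tokenizer. `vocab` is the encoded vocabulary ordered by token id, `tokenizer_json`
// the serialized tokenizer, `eos_ids` the end-of-sequence token ids.
tle::Executor make_guided_executor(const std::string &engine_dir,
                                   const std::vector<std::string> &vocab,
                                   const std::string &tokenizer_json,
                                   const std::vector<int32_t> &eos_ids) {
    tle::ExecutorConfig config{};
    config.setGuidedDecodingConfig(tle::GuidedDecodingConfig(
            tle::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR,
            vocab, tokenizer_json, eos_ids));
    return tle::Executor(engine_dir, tle::ModelType::kDECODER_ONLY, config);
}

// Per request: attach the validated grammar (here a JSON schema) so sampling is
// constrained to outputs matching it; requests without a grammar skip this step.
uint64_t submit_guided(tle::Executor &executor,
                       const std::vector<int32_t> &prompt_ids,
                       const std::string &json_schema) {
    tle::Request request(prompt_ids, /*maxNewTokens=*/128);
    request.setGuidedDecodingParams(tle::GuidedDecodingParams(
            tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA, json_schema));
    return executor.enqueueRequest(request);
}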

File tree

6 files changed (+122 lines, -30 lines)

backends/trtllm/csrc/backend.cpp
backends/trtllm/csrc/backend.hpp
backends/trtllm/csrc/ffi.hpp
backends/trtllm/src/lib.rs
backends/trtllm/src/looper.rs
backends/trtllm/src/main.rs

backends/trtllm/csrc/backend.cpp

Lines changed: 20 additions & 5 deletions
@@ -26,7 +26,7 @@ namespace huggingface::tgi::backends::trtllm {
     }


-    tle::ExecutorConfig backend_workspace_t::executor_config() const {
+    tle::ExecutorConfig backend_workspace_t::executor_config(const std::vector<std::string>& encoded_vocab, std::string_view tokenizer_str) const {
         // Retrieve the compute capabilities to enable some options at runtime
         const auto compute_capabilities = hardware::cuda::compute_capabilities_t();

@@ -40,17 +40,24 @@ namespace huggingface::tgi::backends::trtllm {
         executor_config.setKvCacheConfig(tle::KvCacheConfig(true));
         executor_config.setEnableChunkedContext(compute_capabilities.is_at_least_ampere());
         executor_config.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION));
+        executor_config.setGuidedDecodingConfig(tle::GuidedDecodingConfig(
+                tle::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR,
+                encoded_vocab,
+                std::string(tokenizer_str),
+                generation_config().eos_token_ids
+        ));
         return executor_config;
     }

-    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
-            : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}
+    backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str)
+            : workspace(engines_folder, executor_worker_path),
+              executor_(executor_factory_initializer(workspace, encoded_vocab, tokenizer_str)) {}

     std::expected<request_id_t, backend_error_t>
     backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t g_params,
                       const sampling_params_t s_params) noexcept {
         SPDLOG_DEBUG("Submit {:d} tokens for scheduling ({}, {})", token_ids.size(), g_params, s_params);
-        return executor_.enqueueRequest(tle::Request{
+        tle::Request req {
             {token_ids.begin(), token_ids.end()}, // Making actual copy of the tokens
             static_cast<tle::SizeType32>(g_params.max_new_tokens),
             true,
@@ -68,7 +75,15 @@ namespace huggingface::tgi::backends::trtllm {
             std::nullopt,
             std::nullopt,
             workspace.generation_config().stop_words
-        });
+        };
+
+        if (g_params.guide_type.has_value()) {
+            req.setGuidedDecodingParams(tle::GuidedDecodingParams(
+                    g_params.guide_type.value(),
+                    g_params.guide
+            ));
+        }
+        return executor_.enqueueRequest(req);
     }

     std::vector<tle::Response> backend_t::pull_tokens() noexcept {

backends/trtllm/csrc/backend.hpp

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ namespace huggingface::tgi::backends::trtllm {
2525
*/
2626
struct generation_params_t {
2727
uint32_t max_new_tokens;
28+
std::optional<tle::GuidedDecodingParams::GuideType> guide_type;
29+
std::string guide;
2830
};
2931

3032
/**
@@ -66,23 +68,28 @@ namespace huggingface::tgi::backends::trtllm {
6668
float_t top_p;
6769
float_t temperature;
6870
std::list<std::vector<int32_t>> stop_words;
71+
std::vector<int32_t> eos_token_ids;
6972

7073
constexpr explicit generation_config_t(const json &config) :
71-
top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
74+
top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0), eos_token_ids{} {
7275
if (!config.contains("/eos_token_id"_json_pointer)) {
7376
return;
7477
}
7578
if (config["/eos_token_id"_json_pointer].is_array()) {
7679
SPDLOG_DEBUG("generation config eos_token_id is array");
7780
const auto &eos_token_id = config["/eos_token_id"_json_pointer];
7881
std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
79-
stop_words.emplace_back(1, token_id.template get<int32_t>());
82+
const auto token = token_id.template get<int32_t>();
83+
stop_words.emplace_back(1, token);
84+
eos_token_ids.emplace_back(token);
8085
});
8186
}
8287

8388
if (config["/eos_token_id"_json_pointer].is_number()) {
8489
SPDLOG_DEBUG("generation config eos_token_id is number");
85-
stop_words.emplace_back(1, config["/eos_token_id"_json_pointer].get<int32_t>());
90+
const auto token = config["/eos_token_id"_json_pointer].get<int32_t>();
91+
stop_words.emplace_back(1, token);
92+
eos_token_ids.emplace_back(token);
8693
}
8794

8895
SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
@@ -143,7 +150,7 @@ namespace huggingface::tgi::backends::trtllm {
143150
* to initialize `tensorrt_llm::executor::Executor`
144151
* @return `tensorrt_llm::executor::ExecutorConfig` instance
145152
*/
146-
[[nodiscard]] tle::ExecutorConfig executor_config() const;
153+
[[nodiscard]] tle::ExecutorConfig executor_config(const std::vector<std::string>& encoded_vocab, std::string_view tokenizer_str) const;
147154
};
148155

149156
/**
@@ -167,10 +174,10 @@ namespace huggingface::tgi::backends::trtllm {
167174
tle::Executor executor_;
168175

169176
public:
170-
backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);
177+
backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str);
171178

172-
backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
173-
: backend_t(engines_folder, executor_worker_path) {};
179+
backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str)
180+
: backend_t(engines_folder, executor_worker_path, encoded_vocab, tokenizer_str) {};
174181

175182
/**
176183
* Submit a new request to the executor
@@ -201,9 +208,9 @@ namespace huggingface::tgi::backends::trtllm {
201208
/**
202209
* Create a TensorRT-LLM executor from a workspace
203210
*/
204-
const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
211+
const auto executor_factory_initializer = [](const backend_workspace_t &workspace, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str) -> tle::Executor {
205212
return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY,
206-
workspace.executor_config()};
213+
workspace.executor_config(encoded_vocab, tokenizer_str)};
207214
};
208215
}
209216
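
For context on the new eos_token_ids field above: generation_config.json may carry eos_token_id either as a single integer or as an array, and the constructor now records the flat id list (handed to the guided-decoding config) alongside the per-id stop_words. A small standalone sketch of the two accepted shapes, using nlohmann::json as the header does; the token ids are made-up examples.

#include <cstdint>
#include <iostream>
#include <vector>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Sketch only: mirror the eos_token_id handling of generation_config_t above.
std::vector<int32_t> collect_eos_ids(const json &config) {
    std::vector<int32_t> eos_token_ids;
    if (!config.contains("/eos_token_id"_json_pointer)) return eos_token_ids;

    const auto &eos = config["/eos_token_id"_json_pointer];
    if (eos.is_array()) {
        // Several end tokens, e.g. chat models with both an eos and an end-of-turn token
        for (const auto &token_id : eos) eos_token_ids.push_back(token_id.get<int32_t>());
    } else if (eos.is_number()) {
        // Single end token
        eos_token_ids.push_back(eos.get<int32_t>());
    }
    return eos_token_ids;
}

int main() {
    const auto single   = json::parse(R"({"eos_token_id": 2})");
    const auto multiple = json::parse(R"({"eos_token_id": [128001, 128009]})");

    std::cout << collect_eos_ids(single).size() << "\n";    // 1
    std::cout << collect_eos_ids(multiple).size() << "\n";  // 2
    return 0;
}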

backends/trtllm/csrc/ffi.hpp

Lines changed: 32 additions & 6 deletions
@@ -4,6 +4,7 @@
 #include <chrono>
 #include <exception>
 #include <memory>
+#include <optional>
 #include <thread>

 #include <nvml.h>
@@ -115,8 +116,8 @@ namespace huggingface::tgi::backends::trtllm {


     public:
-        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path, const std::chrono::time_point<std::chrono::steady_clock>& created_time)
-            : inner_(engine_folder, executor_worker_path),
+        tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path, const std::chrono::time_point<std::chrono::steady_clock>& created_time, const std::vector<std::string>& encoded_vocab, std::string_view tokenizer_str)
+            : inner_(engine_folder, executor_worker_path, encoded_vocab, tokenizer_str),
              m_created_time {created_time}
         {}

@@ -128,16 +129,31 @@ namespace huggingface::tgi::backends::trtllm {
             float_t temperature,
             float_t repetition_penalty,
             float_t frequency_penalty,
-            uint64_t seed
+            uint64_t seed,
+            grammar_type_t grammar_type,
+            rust::Str grammar_value
         ) const {
             // This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
             SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));

             // Submit the request to the executor and get back a potential request_id used to track request status
             const auto signed_tokens = std::vector<int32_t>(tokens.begin(), tokens.end());
+
+            std::optional<tle::GuidedDecodingParams::GuideType> guide_type = std::nullopt;
+            switch (grammar_type) {
+                case grammar_type_t::kJSON:
+                    guide_type = tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA;
+                    break;
+                case grammar_type_t::kREGEX:
+                    guide_type = tle::GuidedDecodingParams::GuideType::kREGEX;
+                    break;
+                default:
+                    break;
+            }
+
             const auto maybe_request_id = inner_.submit(
                 signed_tokens,
-                {max_new_tokens},
+                {max_new_tokens, guide_type, std::string(grammar_value)},
                 {top_k, top_p, repetition_penalty, frequency_penalty, temperature, seed}
             );

@@ -211,15 +227,25 @@ namespace huggingface::tgi::backends::trtllm {
     }

     std::unique_ptr<tensorrt_llm_backend_t>
-    create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
+    create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path, const rust::Str tokenizer_str, const rust::Vec<rust::String> encoded_vocab) {
         const auto created_time = std::chrono::steady_clock::now();
         std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
+
+        std::vector<std::string> encoded_vocab_std{};
+        encoded_vocab_std.reserve(encoded_vocab.size());
+
+        for (const auto& v : encoded_vocab) {
+            encoded_vocab_std.push_back(std::string(v));
+        }
+
         return std::make_unique<tensorrt_llm_backend_t>(
             std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()),
                                   std::filesystem::path::format::auto_format),
             std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()),
                                   std::filesystem::path::format::auto_format),
-            created_time
+            created_time,
+            encoded_vocab_std,
+            std::string_view(tokenizer_str)
         );
     }
 }

backends/trtllm/src/lib.rs

Lines changed: 17 additions & 0 deletions
@@ -78,6 +78,8 @@ mod ffi {
         fn create_backend_from_engine_folder(
             engine_folder: &str,
             executor_worker: &str,
+            tokenizer_str: &str,
+            encoded_vocab: Vec<String>,
         ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;

         fn submit(
@@ -90,6 +92,8 @@ mod ffi {
             repetition_penalty: f32,
             frequency_penalty: f32,
             seed: u64,
+            grammar_type: GrammarType,
+            grammar_value: &str,
         ) -> Result<u64>;

         fn pull_tokens(
@@ -98,6 +102,19 @@ mod ffi {

         fn cancel(self: &TensorRtLlmBackendImpl, request_id: u64);
     }
+
+    #[cxx_name = "grammar_type_t"]
+    #[derive(Debug, Clone, Copy)]
+    pub enum GrammarType {
+        #[cxx_name = "kNONE"]
+        None = 0u8,
+
+        #[cxx_name = "kJSON"]
+        Json = 1u8,
+
+        #[cxx_name = "kREGEX"]
+        Regex = 2u8,
+    }
 }

 use ffi::FinishReason;

backends/trtllm/src/looper.rs

Lines changed: 33 additions & 9 deletions
@@ -18,12 +18,13 @@ use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStr
 use text_generation_router::validation::ValidationError::{
     EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality,
 };
-use text_generation_router::validation::{Chunk, ValidGenerateRequest};
+use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidGrammar};
 use text_generation_router::Token;

 use crate::errors::TensorRtLlmBackendError;
 use crate::ffi::{
-    create_backend_from_engine_folder, FinishReason, GenerationStep, TensorRtLlmBackendImpl,
+    create_backend_from_engine_folder, FinishReason, GenerationStep, GrammarType,
+    TensorRtLlmBackendImpl,
 };
 use crate::utils::first_line;

@@ -105,6 +106,16 @@ fn request_looper(
                 1
             };

+            let (grammar_type, grammar_value): (GrammarType, &str) =
+                if let Some(grammar) = &generation_params.grammar {
+                    match grammar {
+                        ValidGrammar::Json(v) => (GrammarType::Json, v),
+                        ValidGrammar::Regex(v) => (GrammarType::Regex, v),
+                    }
+                } else {
+                    (GrammarType::None, "")
+                };
+
             // Submit to the TensorRT-LLM executor for scheduling
             match backend.submit(
                 &input_ids.unwrap(), // This is checked beforehand in validate()
@@ -115,6 +126,8 @@ fn request_looper(
                 generation_params.repetition_penalty,
                 generation_params.frequency_penalty,
                 generation_params.seed,
+                grammar_type,
+                grammar_value,
             ) {
                 Ok(request_id) => {
                     // Insert the context linked to the generated request id in the tracker
@@ -392,9 +405,25 @@ impl TensorRtLlmBackendV2 {
         // to rust Instant.
         let created_time = Instant::now();

+        let encoded_vocab = {
+            let vocab = tokenizer.get_vocab(true);
+            let mut tokens: Vec<String> = vocab.keys().map(|x| x.clone()).collect();
+            tokens.sort_by(|a, b| vocab.get(a).cmp(&vocab.get(b)));
+            tokens
+        };
+
+        let tokenizer_str = tokenizer
+            .to_string(false)
+            .map_err(|e| TensorRtLlmBackendError::Tokenizer(e.to_string()))?;
+
         // Create the FFI backend
-        let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
-            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
+        let backend = create_backend_from_engine_folder(
+            &engine_folder,
+            &executor_worker_path,
+            &tokenizer_str,
+            encoded_vocab,
+        )
+        .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;

         let backend = Arc::new(backend);
         let backend_response = backend.clone();
@@ -425,11 +454,6 @@ impl TensorRtLlmBackendV2 {
             return Err(ValidationError(TopNTokensDisabled));
         }

-        // TODO: Is it really needed? How can it be validated before?
-        if request.parameters.grammar.is_some() {
-            return Err(ValidationError(Grammar));
-        }
-
         match request.inputs.len() {
             0 => Err(ValidationError(EmptyInput)),
             2.. => Err(GenerationError(

backends/trtllm/src/main.rs

Lines changed: 4 additions & 1 deletion
@@ -67,6 +67,8 @@ struct Args {
     usage_stats: UsageStatsLevel,
     #[clap(default_value = "2000000", long, env)]
     payload_limit: usize,
+    #[clap(long, env, default_value_t = false)]
+    disable_grammar_support: bool,
 }

 async fn get_tokenizer(tokenizer_name: &str, revision: Option<&str>) -> Option<Tokenizer> {
@@ -244,6 +246,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         executor_worker,
         usage_stats,
         payload_limit,
+        disable_grammar_support,
     } = args;

     // Launch Tokio runtime
@@ -321,7 +324,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> {
         false,
         None,
         None,
-        true,
+        disable_grammar_support,
         max_client_batch_size,
         usage_stats,
         payload_limit,
