@@ -25,6 +25,8 @@ namespace huggingface::tgi::backends::trtllm {
      */
     struct generation_params_t {
         uint32_t max_new_tokens;
+        std::optional<tle::GuidedDecodingParams::GuideType> guide_type;
+        std::string guide;
     };
 
     /**
@@ -66,23 +68,28 @@ namespace huggingface::tgi::backends::trtllm {
         float_t top_p;
         float_t temperature;
         std::list<std::vector<int32_t>> stop_words;
+        std::vector<int32_t> eos_token_ids;
 
         constexpr explicit generation_config_t(const json &config) :
-                top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0) {
+                top_p(config.value("top_p", 1.0f)), temperature(config.value("temperature", 1.0f)), stop_words(0), eos_token_ids{} {
             if (!config.contains("/eos_token_id"_json_pointer)) {
                 return;
             }
             if (config["/eos_token_id"_json_pointer].is_array()) {
                 SPDLOG_DEBUG("generation config eos_token_id is array");
                 const auto &eos_token_id = config["/eos_token_id"_json_pointer];
                 std::for_each(eos_token_id.begin(), eos_token_id.end(), [this](const auto token_id) {
-                    stop_words.emplace_back(1, token_id.template get<int32_t>());
+                    const auto token = token_id.template get<int32_t>();
+                    stop_words.emplace_back(1, token);
+                    eos_token_ids.emplace_back(token);
                 });
             }
 
             if (config["/eos_token_id"_json_pointer].is_number()) {
                 SPDLOG_DEBUG("generation config eos_token_id is number");
-                stop_words.emplace_back(1, config["/eos_token_id"_json_pointer].get<int32_t>());
+                const auto token = config["/eos_token_id"_json_pointer].get<int32_t>();
+                stop_words.emplace_back(1, token);
+                eos_token_ids.emplace_back(token);
             }
 
             SPDLOG_DEBUG("Detected {:d} predefined stop_words from generation_config.json", stop_words.size());
@@ -143,7 +150,7 @@ namespace huggingface::tgi::backends::trtllm {
          * to initialize `tensorrt_llm::executor::Executor`
          * @return `tensorrt_llm::executor::ExecutorConfig` instance
          */
-        [[nodiscard]] tle::ExecutorConfig executor_config() const;
+        [[nodiscard]] tle::ExecutorConfig executor_config(const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str) const;
     };
 
     /**
@@ -167,10 +174,10 @@ namespace huggingface::tgi::backends::trtllm {
         tle::Executor executor_;
 
     public:
-        backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path);
+        backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str);
 
-        backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path)
-                : backend_t(engines_folder, executor_worker_path) {};
+        backend_t(std::filesystem::path &&engines_folder, std::filesystem::path &&executor_worker_path, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str)
+                : backend_t(engines_folder, executor_worker_path, encoded_vocab, tokenizer_str) {};
 
         /**
          * Submit a new request to the executor
@@ -201,9 +208,9 @@ namespace huggingface::tgi::backends::trtllm {
     /**
      * Create a TensorRT-LLM executor from a workspace
      */
-    const auto executor_factory_initializer = [](const backend_workspace_t &workspace) -> tle::Executor {
+    const auto executor_factory_initializer = [](const backend_workspace_t &workspace, const std::vector<std::string> &encoded_vocab, std::string_view tokenizer_str) -> tle::Executor {
         return {workspace.engines_folder(), tensorrt_llm::executor::ModelType::kDECODER_ONLY,
-                workspace.executor_config()};
+                workspace.executor_config(encoded_vocab, tokenizer_str)};
     };
 }
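
For illustration, here is a minimal caller-side sketch of how the widened backend_t constructor and the new guide_type/guide fields might be used. The paths, the way the vocabulary and serialized tokenizer are obtained, and the kJSON_SCHEMA enumerator (from the TensorRT-LLM executor API) are assumptions for the sake of the example, not part of this diff.

// Sketch only: paths, vocabulary/tokenizer loading and the GuideType enumerator
// below are assumptions made for this example.
#include <filesystem>
#include <string>
#include <vector>

#include "backend.hpp"  // assumed header name for the declarations shown above

using namespace huggingface::tgi::backends::trtllm;

void example() {
    std::filesystem::path engines_folder{"/data/engines"};                       // assumed location
    std::filesystem::path executor_worker_path{"/usr/local/tgi/executorWorker"}; // assumed location

    // In practice these would come from the tokenizer on the caller's side:
    // the id-ordered vocabulary and the serialized tokenizer.json content.
    std::vector<std::string> encoded_vocab{};
    std::string tokenizer_str{};

    backend_t backend{engines_folder, executor_worker_path, encoded_vocab, tokenizer_str};

    // Per-request guided decoding: constrain the output to a JSON schema.
    generation_params_t generation_params{
        /* max_new_tokens = */ 128,
        /* guide_type     = */ tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA,
        /* guide          = */ R"({"type": "object"})",
    };
    // ... the request is then submitted through the backend's existing submit path ...
}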
209216