Skip to content

Commit f94e026

Browse files
committed
refactor: add interior mutability to tensorrt_llm_backend_t
Make `tensorrt_llm_backend_t` interior-mutable by marking the `inner_` struct as a `mutable` field, so we can make the methods `const`. This makes the pointer accessible from multiple threads on the Rust side without wrapping it in a Mutex. The underlying tensorrt_llm::executor::Executor already contains a mutex.
1 parent fab395b commit f94e026

File tree

3 files changed

+11
-12
lines changed

3 files changed

+11
-12
lines changed

backends/trtllm/csrc/ffi.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ namespace huggingface::tgi::backends::trtllm {
107107

108108
class tensorrt_llm_backend_t {
109109
private:
110-
backend_t inner_;
110+
mutable backend_t inner_;
111111

112112
// m_created_time is a reference point to convert time from c++ time_point
113113
// to rust Instant.
@@ -131,7 +131,7 @@ namespace huggingface::tgi::backends::trtllm {
131131
float_t repetition_penalty,
132132
float_t frequency_penalty,
133133
uint64_t seed
134-
) {
134+
) const {
135135
// This is enabled only if using add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
136136
SPDLOG_TRACE(FMT_STRING("[FFI] Submitting {:d} prompt tokens to the executor"));
137137

@@ -152,7 +152,7 @@ namespace huggingface::tgi::backends::trtllm {
152152
}
153153
}
154154

155-
std::unique_ptr<std::vector<generation_step_t>> pull_tokens() noexcept {
155+
std::unique_ptr<std::vector<generation_step_t>> pull_tokens() const noexcept {
156156
if (num_tokens_ready() > 0) [[likely]] {
157157
const auto responses = inner_.pull_tokens();
158158

@@ -176,7 +176,7 @@ namespace huggingface::tgi::backends::trtllm {
176176
}
177177
}
178178

179-
void cancel(request_id_t request_id) noexcept {
179+
void cancel(request_id_t request_id) const noexcept {
180180
SPDLOG_DEBUG("[FFI] cancelling request {:d}", request_id);
181181
inner_.cancel(request_id);
182182
}

backends/trtllm/src/lib.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ mod ffi {
8383
fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;
8484

8585
fn submit(
86-
self: Pin<&mut TensorRtLlmBackendImpl>,
86+
self: &TensorRtLlmBackendImpl,
8787
tokens: &[u32],
8888
max_new_tokens: u32,
8989
top_k: u32,
@@ -95,10 +95,10 @@ mod ffi {
9595
) -> Result<u64>;
9696

9797
fn pull_tokens(
98-
self: Pin<&mut TensorRtLlmBackendImpl>,
98+
self: &TensorRtLlmBackendImpl,
9999
) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
100100

101-
fn cancel(self: Pin<&mut TensorRtLlmBackendImpl>, request_id: u64);
101+
fn cancel(self: &TensorRtLlmBackendImpl, request_id: u64);
102102
}
103103
}
104104

backends/trtllm/src/looper.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
8080
fn executor_status_looper(
8181
max_inflight_requests: usize,
8282
tokenizer: Tokenizer,
83-
mut backend: UniquePtr<TensorRtLlmBackendImpl>,
83+
backend: UniquePtr<TensorRtLlmBackendImpl>,
8484
mut backlog: UnboundedReceiver<GenerationContext>,
8585
created_time: Instant,
8686
) {
@@ -111,7 +111,7 @@ fn executor_status_looper(
111111
};
112112

113113
// Submit to the TensorRT-LLM executor for scheduling
114-
match backend.pin_mut().submit(
114+
match backend.submit(
115115
&input_ids.unwrap(), // This is checked beforehand in validate()
116116
stopping_params.max_new_tokens,
117117
top_k,
@@ -143,8 +143,7 @@ fn executor_status_looper(
143143
}
144144

145145
if backend.num_tokens_ready() > 0 {
146-
let mut backend = backend.pin_mut();
147-
match backend.as_mut().pull_tokens() {
146+
match backend.pull_tokens() {
148147
Ok(responses) => {
149148
// Iterate through all the decoded token
150149
for step in responses.deref() {
@@ -183,7 +182,7 @@ fn executor_status_looper(
183182
"Client dropped - removing request {} from tracked requests",
184183
step.request_id
185184
);
186-
backend.as_mut().cancel(step.request_id);
185+
backend.cancel(step.request_id);
187186
let _ = in_flights.remove(&step.request_id);
188187
}
189188
} else {

0 commit comments

Comments
 (0)