
Commit e2b0063

feat(trtllm): separate request and response loop
The executor_status_looper spends CPU time polling the number of ready tokens. Because the underlying call is protected by a mutex, this polling also interferes with the Executor. Now that TensorRtLlmBackendImpl is interior-mutable, it can be marked `Send` and shared across multiple threads. The loop can therefore be split into request and response parts, and we can await tokens instead of constantly polling.
1 parent: 161f62e
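As a rough illustration of the new shape of the looper, here is a minimal, self-contained Rust sketch (not the actual TGI code): `Backend`, `Step`, `submit` and `pull_tokens` are hypothetical stand-ins for the cxx-generated `TensorRtLlmBackendImpl` bindings, the request source is a plain mpsc channel, and the real `pull_tokens` blocks inside the executor until responses are ready instead of returning immediately.

```rust
use std::sync::{mpsc, Arc};
use std::thread;

// Stand-in for the interior-mutable backend wrapper; being interior-mutable
// (and therefore safe to mark Send) is what allows sharing it across threads.
struct Backend;
struct Step; // stand-in for one generation step returned by the executor

impl Backend {
    fn submit(&self, _tokens: &[u32]) -> u64 {
        0 // queue the request with the executor and return its id
    }
    fn pull_tokens(&self) -> Vec<Step> {
        vec![Step] // in the real backend this waits for ready responses
    }
}

fn main() {
    let backend = Arc::new(Backend);
    let (req_tx, req_rx) = mpsc::channel::<Vec<u32>>();

    // Request loop: forwards incoming prompts to the executor.
    let submitter = {
        let backend = Arc::clone(&backend);
        thread::spawn(move || {
            for tokens in req_rx {
                let id = backend.submit(&tokens);
                println!("submitted request {id}");
            }
        })
    };

    // Response loop: waits for generated tokens instead of polling a counter.
    let receiver = {
        let backend = Arc::clone(&backend);
        thread::spawn(move || {
            let steps = backend.pull_tokens();
            println!("received {} generation steps", steps.len());
        })
    };

    req_tx.send(vec![1, 2, 3]).unwrap();
    drop(req_tx);
    submitter.join().unwrap();
    receiver.join().unwrap();
}
```

Splitting the two paths means the submit side never wakes up just to check a counter, and the executor's internal mutex is no longer contended by a status poll.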

5 files changed: +159 −148 lines


backends/trtllm/csrc/backend.cpp

Lines changed: 0 additions & 4 deletions

@@ -46,10 +46,6 @@ namespace huggingface::tgi::backends::trtllm {
     backend_t::backend_t(std::filesystem::path &engines_folder, std::filesystem::path &executor_worker_path)
             : workspace(engines_folder, executor_worker_path), executor_(executor_factory_initializer(workspace)) {}
 
-    size_t backend_t::num_tokens_ready() const noexcept {
-        return executor_.getNumResponsesReady();
-    }
-
     std::expected<request_id_t, backend_error_t>
     backend_t::submit(std::span<const token_id_t> token_ids, const generation_params_t g_params,
                       const sampling_params_t s_params) noexcept {

backends/trtllm/csrc/backend.hpp

Lines changed: 0 additions & 7 deletions

@@ -175,13 +175,6 @@ namespace huggingface::tgi::backends::trtllm {
         submit(std::span<const token_id_t> token_ids, generation_params_t generation_params,
                sampling_params_t sampling_params) noexcept;
 
-        /**
-         * Query the number of tokens available across all in-flight generations
-         * @return
-         */
-        [[nodiscard("Pulling out the number of tokens")]]
-        size_t num_tokens_ready() const noexcept;
-
         /**
          * Pull out newly generated tokens from the executor
         * @return

backends/trtllm/csrc/ffi.hpp

Lines changed: 11 additions & 18 deletions

@@ -120,8 +120,6 @@ namespace huggingface::tgi::backends::trtllm {
           m_created_time {created_time}
         {}
 
-        size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); }
-
         request_id_t submit(
                 rust::Slice<const uint32_t> tokens,
                 uint32_t max_new_tokens,
@@ -153,27 +151,22 @@ namespace huggingface::tgi::backends::trtllm {
         }
 
         std::unique_ptr<std::vector<generation_step_t>> pull_tokens() const noexcept {
-            if (num_tokens_ready() > 0) [[likely]] {
-                const auto responses = inner_.pull_tokens();
+            const auto responses = inner_.pull_tokens();
 
-                SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
+            SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
 
-                auto f = [this](const tle::Response &r){
-                    return as_generation_step(r, m_created_time);
-                };
-                // Transform tle::Response to generation_step_t
+            auto f = [this](const tle::Response &r){
+                return as_generation_step(r, m_created_time);
+            };
+            auto steps = std::make_unique<std::vector<generation_step_t>>();
+            // Transform tle::Response to generation_step_t
 #ifdef __cpp_lib_ranges_to_container
-                auto steps = responses | std::views::transform(f) | std::ranges::to<std::vector>();
+            *steps = responses | std::views::transform(f) | std::ranges::to<std::vector>();
 #else
-                auto steps = std::vector<generation_step_t>();
-                steps.reserve(responses.size());
-                std::transform(responses.begin(), responses.end(), std::back_inserter(steps), f);
+            steps->reserve(responses.size());
+            std::transform(responses.begin(), responses.end(), std::back_inserter(steps), f);
 #endif
-                return std::make_unique<std::vector<generation_step_t>>(steps);
-
-            } else {
-                return std::make_unique<std::vector<generation_step_t>>();
-            }
+            return steps;
         }
 
         void cancel(request_id_t request_id) const noexcept {

backends/trtllm/src/lib.rs

Lines changed: 0 additions & 2 deletions

@@ -80,8 +80,6 @@ mod ffi {
             executor_worker: &str,
         ) -> Result<UniquePtr<TensorRtLlmBackendImpl>>;
 
-        fn num_tokens_ready(self: &TensorRtLlmBackendImpl) -> usize;
-
         fn submit(
             self: &TensorRtLlmBackendImpl,
            tokens: &[u32],

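For completeness, the `Send` marker mentioned in the commit message is the piece that lets the bridge type cross thread boundaries. The snippet below is a hedged, self-contained illustration of that pattern using a mock handle; in TGI the marker would apply to the cxx-generated `ffi::TensorRtLlmBackendImpl` rather than `MockBackend`.

```rust
use std::ffi::c_void;

// Stand-in for an opaque C++ handle; raw pointers are not Send by default.
struct MockBackend(*mut c_void);

// SAFETY: assumed here that all mutation behind the handle is synchronized on
// the C++ side (interior mutability), so moving it to another thread is sound.
unsafe impl Send for MockBackend {}

fn main() {
    let backend = MockBackend(std::ptr::null_mut());
    std::thread::spawn(move || {
        // The handle crossed a thread boundary thanks to the Send impl.
        let _ = backend.0;
    })
    .join()
    .unwrap();
}
```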