Skip to content

Commit f7bd82a

Browse files
committed
feat(trtllm): get more accurate start time
Get a more accurate inference start time from the trtllm response. Because `Instant` does not expose absolute value, create reference points on both sides and return duration relative to the reference point instead.
1 parent 41819d7 commit f7bd82a

File tree

4 files changed

+70
-14
lines changed

4 files changed

+70
-14
lines changed

backends/trtllm/csrc/backend.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,14 @@ namespace huggingface::tgi::backends::trtllm {
5959
static_cast<tle::SizeType32>(g_params.max_new_tokens),
6060
true,
6161
(tle::SamplingConfig) s_params,
62-
tle::OutputConfig{ /* returnLogProbs= */ true},
62+
tle::OutputConfig{
63+
/* returnLogProbs= */ true,
64+
false,
65+
false,
66+
false,
67+
false,
68+
/* returnPerfMetrics=*/ true,
69+
},
6370
std::nullopt,
6471
std::nullopt,
6572
std::nullopt,

backends/trtllm/csrc/ffi.hpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#ifndef TGI_BACKEND_TRTLLM_FFI
22
#define TGI_BACKEND_TRTLLM_FFI
33

4+
#include <chrono>
45
#include <exception>
56
#include <memory>
67
#include <thread>
@@ -52,7 +53,7 @@ namespace huggingface::tgi::backends::trtllm {
5253
}
5354
}
5455

55-
static auto as_generation_step = [](const tle::Response &r) {
56+
static auto as_generation_step = [](const tle::Response &r, const std::chrono::time_point<std::chrono::steady_clock> created) {
5657
const auto reqId = r.getRequestId();
5758
if (!r.hasError()) [[likely]] {
5859
const auto result = r.getResult();
@@ -66,14 +67,23 @@ namespace huggingface::tgi::backends::trtllm {
6667
log_prob = result.logProbs.value()[0].back();
6768
}
6869

70+
std::optional<int64_t> first_scheduled_time_ns = std::nullopt;
71+
if (result.requestPerfMetrics) {
72+
const auto &t = result.requestPerfMetrics->timingMetrics;
73+
const auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(t.firstScheduledTime - created).count();
74+
first_scheduled_time_ns = static_cast<int64_t>(ns);
75+
}
76+
6977
return generation_step_t{
7078
reqId,
7179
token_id.value_or(0),
7280
log_prob.value_or(0.0),
81+
first_scheduled_time_ns.value_or(0),
7382
result.isFinal,
7483
as_finish_reason_t(result.finishReasons[0]),
7584
token_id.has_value(),
7685
log_prob.has_value(),
86+
first_scheduled_time_ns.has_value(),
7787
false,
7888
std::string()
7989
};
@@ -82,10 +92,12 @@ namespace huggingface::tgi::backends::trtllm {
8292
reqId,
8393
0,
8494
0.0,
95+
0,
8596
true,
8697
finish_reason_t::kNOT_FINISHED,
8798
false,
8899
false,
100+
false,
89101
true,
90102
std::move(r.getErrorMsg())
91103
};
@@ -97,9 +109,16 @@ namespace huggingface::tgi::backends::trtllm {
97109
private:
98110
backend_t inner_;
99111

112+
// m_created_time is a reference point to convert time from c++ time_point
113+
// to rust Instant.
114+
std::chrono::time_point<std::chrono::steady_clock> m_created_time;
115+
116+
100117
public:
101-
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path)
102-
: inner_(engine_folder, executor_worker_path) {}
118+
tensorrt_llm_backend_t(std::filesystem::path &&engine_folder, std::filesystem::path &&executor_worker_path, const std::chrono::time_point<std::chrono::steady_clock>& created_time)
119+
: inner_(engine_folder, executor_worker_path),
120+
m_created_time {created_time}
121+
{}
103122

104123
size_t num_tokens_ready() const noexcept { return inner_.num_tokens_ready(); }
105124

@@ -139,13 +158,16 @@ namespace huggingface::tgi::backends::trtllm {
139158

140159
SPDLOG_TRACE("[FFI] Successfully pulled out {:d} responses from executor", responses.size());
141160

161+
auto f = [this](const tle::Response &r){
162+
return as_generation_step(r, m_created_time);
163+
};
142164
// Transform tle::Response to generation_step_t
143165
#ifdef __cpp_lib_ranges_to_container
144-
auto steps = responses | std::views::transform(as_generation_step) | std::ranges::to<std::vector>();
166+
auto steps = responses | std::views::transform(f) | std::ranges::to<std::vector>();
145167
#else
146168
auto steps = std::vector<generation_step_t>();
147169
steps.reserve(responses.size());
148-
std::transform(responses.begin(), responses.end(), std::back_inserter(steps), as_generation_step);
170+
std::transform(responses.begin(), responses.end(), std::back_inserter(steps), f);
149171
#endif
150172
return std::make_unique<std::vector<generation_step_t>>(steps);
151173

@@ -197,12 +219,14 @@ namespace huggingface::tgi::backends::trtllm {
197219

198220
std::unique_ptr<tensorrt_llm_backend_t>
199221
create_backend_from_engine_folder(const rust::Str engines_folder, const rust::Str executor_worker_path) {
222+
const auto created_time = std::chrono::steady_clock::now();
200223
std::call_once(backend_initialized_flag, initialize_tensorrt_llm_backend);
201224
return std::make_unique<tensorrt_llm_backend_t>(
202225
std::filesystem::path(std::string_view(engines_folder.begin(), engines_folder.end()),
203226
std::filesystem::path::format::auto_format),
204227
std::filesystem::path(std::string_view(executor_worker_path.begin(), executor_worker_path.end()),
205-
std::filesystem::path::format::auto_format)
228+
std::filesystem::path::format::auto_format),
229+
created_time
206230
);
207231
}
208232
}

backends/trtllm/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,14 @@ mod ffi {
4242
request_id: u64,
4343
token_id: u32,
4444
log_prob: f32,
45+
46+
/// The time of first schedule since the creation of the backend
47+
first_scheduled_time_ns: i64,
4548
is_final: bool,
4649
finish_reason: FinishReason,
4750
token_id_valid: bool,
4851
log_prob_valid: bool,
52+
first_scheduled_time_ns_valid: bool,
4953
has_error: bool,
5054
error_msg: String,
5155
}

backends/trtllm/src/looper.rs

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use tokenizers::Tokenizer;
88
use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
99
use tokio::sync::TryAcquireError;
1010
use tokio::task::spawn_blocking;
11-
use tokio::time::Instant;
11+
use tokio::time::{Duration, Instant};
1212
use tokio_stream::wrappers::UnboundedReceiverStream;
1313
use tracing::{debug, error, warn};
1414

@@ -82,6 +82,7 @@ fn executor_status_looper(
8282
tokenizer: Tokenizer,
8383
mut backend: UniquePtr<TensorRtLlmBackendImpl>,
8484
mut backlog: UnboundedReceiver<GenerationContext>,
85+
created_time: Instant,
8586
) {
8687
// Track the tuple (request_id, stream) for each request
8788
let mut in_flights =
@@ -144,12 +145,22 @@ fn executor_status_looper(
144145
for step in responses.deref() {
145146
if let Some(ctx) = in_flights.get_mut(&step.request_id) {
146147
// Update the starting timestamp if not set
147-
// This value might not be the actual real starting time of the request
148-
// on the executor side - Need to expose more info from the executor to
149-
// retrieve this value
150-
// TODO : Expose actual real starting time for a request on FFI layer
151148
if ctx.start.is_none() {
152-
ctx.start = Some(Instant::now());
149+
if step.first_scheduled_time_ns_valid {
150+
if step.first_scheduled_time_ns >= 0 {
151+
ctx.start = created_time.checked_add(Duration::from_nanos(
152+
step.first_scheduled_time_ns as u64,
153+
));
154+
} else {
155+
ctx.start = created_time.checked_sub(Duration::from_nanos(
156+
-step.first_scheduled_time_ns as u64,
157+
));
158+
}
159+
}
160+
161+
if ctx.start.is_none() {
162+
ctx.start = Some(Instant::now());
163+
}
153164
}
154165

155166
// Try to map the generation step to a DecodedToken
@@ -348,13 +359,23 @@ impl TensorRtLlmBackendV2 {
348359
// Allocate the IPC layer to communicate with the backend
349360
let (executor_sender, executor_receiver) = unbounded_channel();
350361

362+
// This is a reference point to convert time from c++ time_point
363+
// to rust Instant.
364+
let created_time = Instant::now();
365+
351366
// Create the FFI backend
352367
let backend = create_backend_from_engine_folder(&engine_folder, &executor_worker_path)
353368
.map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
354369

355370
// Executor looper is responsible for scheduling and pulling requests state at regular interval
356371
spawn_blocking(move || {
357-
executor_status_looper(max_inflight_requests, tokenizer, backend, executor_receiver)
372+
executor_status_looper(
373+
max_inflight_requests,
374+
tokenizer,
375+
backend,
376+
executor_receiver,
377+
created_time,
378+
)
358379
});
359380

360381
Ok(TensorRtLlmBackendV2(executor_sender))

0 commit comments

Comments
 (0)