Commit d90dc39

GuanLuo and rmccorm4 authored
feat: add Kserve readiness endpoint support: ServerLive / ServerReady / ModelReady (#4708)
Signed-off-by: Guan Luo <gluo@nvidia.com>
Signed-off-by: GuanLuo <41310872+GuanLuo@users.noreply.github.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
1 parent 66f319a commit d90dc39

File tree

5 files changed, +249 -28 lines changed


lib/llm/src/grpc/protos/kserve.proto

Lines changed: 104 additions & 0 deletions
@@ -16,6 +16,27 @@ import "model_config.proto";
 //@@
 service GRPCInferenceService
 {
+  //@@ .. cpp:var:: rpc ServerLive(ServerLiveRequest) returns
+  //@@     (ServerLiveResponse)
+  //@@
+  //@@     Check liveness of the inference server.
+  //@@
+  rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {}
+
+  //@@ .. cpp:var:: rpc ServerReady(ServerReadyRequest) returns
+  //@@     (ServerReadyResponse)
+  //@@
+  //@@     Check readiness of the inference server.
+  //@@
+  rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {}
+
+  //@@ .. cpp:var:: rpc ModelReady(ModelReadyRequest) returns
+  //@@     (ModelReadyResponse)
+  //@@
+  //@@     Check readiness of a model in the inference server.
+  //@@
+  rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {}
+
   //@@ .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns
   //@@     (ModelMetadataResponse)
   //@@
@@ -45,6 +66,89 @@ service GRPCInferenceService
   rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {}
 }

+//@@
+//@@.. cpp:var:: message ServerLiveRequest
+//@@
+//@@   Request message for ServerLive.
+//@@
+message ServerLiveRequest {}
+
+//@@
+//@@.. cpp:var:: message ServerLiveResponse
+//@@
+//@@   Response message for ServerLive.
+//@@
+message ServerLiveResponse
+{
+  //@@
+  //@@  .. cpp:var:: bool live
+  //@@
+  //@@     True if the inference server is live, false if not live.
+  //@@
+  bool live = 1;
+}
+
+//@@
+//@@.. cpp:var:: message ServerReadyRequest
+//@@
+//@@   Request message for ServerReady.
+//@@
+message ServerReadyRequest {}
+
+//@@
+//@@.. cpp:var:: message ServerReadyResponse
+//@@
+//@@   Response message for ServerReady.
+//@@
+message ServerReadyResponse
+{
+  //@@
+  //@@  .. cpp:var:: bool ready
+  //@@
+  //@@     True if the inference server is ready, false if not ready. The server
+  //@@     is considered ready if it has any registered models, since models
+  //@@     can freely be registered and unregistered at runtime.
+  //@@
+  bool ready = 1;
+}
+
+//@@
+//@@.. cpp:var:: message ModelReadyRequest
+//@@
+//@@   Request message for ModelReady.
+//@@
+message ModelReadyRequest
+{
+  //@@
+  //@@  .. cpp:var:: string name
+  //@@
+  //@@     The name of the model to check for readiness.
+  //@@
+  string name = 1;
+
+  //@@  .. cpp:var:: string version
+  //@@
+  //@@     The version of the model to check for readiness. If not given the
+  //@@     server will choose a version based on the model and internal policy.
+  //@@
+  string version = 2;
+}
+
+//@@
+//@@.. cpp:var:: message ModelReadyResponse
+//@@
+//@@   Response message for ModelReady.
+//@@
+message ModelReadyResponse
+{
+  //@@
+  //@@  .. cpp:var:: bool ready
+  //@@
+  //@@     True if the model is ready, false if not ready.
+  //@@
+  bool ready = 1;
+}
+
 //@@
 //@@.. cpp:var:: message ModelMetadataRequest
 //@@
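The three new RPCs follow the KServe/Triton v2 health protocol, so an existing Triton gRPC client can probe them as-is. A minimal sketch, assuming the frontend serves this gRPC API on localhost:8000 and a model named "echo" is registered (both values are taken from the test client further down, not mandated by the proto):

import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8000")
print("ServerLive:", client.is_server_live())               # ServerLive RPC
print("ServerReady:", client.is_server_ready())             # ServerReady RPC
print("ModelReady(echo):", client.is_model_ready("echo"))   # ModelReady RPC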

lib/llm/src/grpc/service/kserve.rs

Lines changed: 34 additions & 0 deletions
@@ -675,4 +675,38 @@ impl GrpcInferenceService for KserveService {
             request_model_name
         )))
     }
+
+    async fn server_live(
+        &self,
+        _request: Request<inference::ServerLiveRequest>,
+    ) -> Result<Response<inference::ServerLiveResponse>, Status> {
+        // server is live if we can respond
+        Ok(Response::new(inference::ServerLiveResponse { live: true }))
+    }
+
+    async fn server_ready(
+        &self,
+        _request: Request<inference::ServerReadyRequest>,
+    ) -> Result<Response<inference::ServerReadyResponse>, Status> {
+        let has_models = !self.state.manager().get_model_cards().is_empty();
+        Ok(Response::new(inference::ServerReadyResponse {
+            ready: has_models,
+        }))
+    }
+
+    async fn model_ready(
+        &self,
+        request: Request<inference::ModelReadyRequest>,
+    ) -> Result<Response<inference::ModelReadyResponse>, Status> {
+        let request_model_name = &request.into_inner().name;
+        let is_ready = self
+            .state
+            .manager()
+            .get_model_cards()
+            .into_iter()
+            .any(|card| request_model_name == &card.display_name);
+        Ok(Response::new(inference::ModelReadyResponse {
+            ready: is_ready,
+        }))
+    }
 }
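As implemented here, ServerLive reports live whenever the gRPC service can answer, ServerReady reports ready only once at least one model card is registered with the model manager, and ModelReady matches the requested name against the display name of a registered card (the version field is not consulted). A minimal client-side sketch of gating traffic on those semantics; the localhost:8000 endpoint and the "echo" model name are assumptions borrowed from the test client in this commit:

import time

import tritonclient.grpc as grpcclient


def wait_until_ready(url="localhost:8000", model="echo", timeout_s=30.0) -> bool:
    # Poll ServerReady/ModelReady until the frontend has registered the model
    # we care about, or give up after timeout_s seconds.
    client = grpcclient.InferenceServerClient(url=url)
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if client.is_server_ready() and client.is_model_ready(model):
            return True
        time.sleep(0.5)
    return False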

lib/llm/tests/kserve_service.rs

Lines changed: 99 additions & 15 deletions
@@ -5,6 +5,22 @@
 mod ports;

 pub mod kserve_test {
+    // [gluo NOTE] Tests may run in parallel, use this enum to keep track of
+    // port used for different test cases
+    enum TestPort {
+        InferFailure = 8988,
+        InferSuccess = 8989,
+        StreamInferFailure = 8990,
+        StreamInferSuccess = 8991,
+        InferCancellation = 8992,
+        StreamInferCancellation = 8993,
+        ModelInfo = 8994,
+        TensorModel = 8995,
+        TensorModelTypes = 8996,
+        TritonModelConfig = 8997,
+        LiveReady = 8998,
+    }
+
     // For using gRPC client for test
     pub mod inference {
         tonic::include_proto!("inference");
@@ -16,6 +32,7 @@ pub mod kserve_test {
     use inference::grpc_inference_service_client::GrpcInferenceServiceClient;
     use inference::{
         DataType, ModelConfigRequest, ModelInferRequest, ModelInferResponse, ModelMetadataRequest,
+        ModelReadyRequest, ServerLiveRequest, ServerReadyRequest,
     };

     use anyhow::Error;
@@ -354,21 +371,6 @@ pub mod kserve_test {
         }
     }

-    // Tests may run in parallel, use this enum to keep track of port used for different
-    // test cases
-    enum TestPort {
-        InferFailure = 8988,
-        InferSuccess = 8989,
-        StreamInferFailure = 8990,
-        StreamInferSuccess = 8991,
-        InferCancellation = 8992,
-        StreamInferCancellation = 8993,
-        ModelInfo = 8994,
-        TensorModel = 8995,
-        TensorModelTypes = 8996,
-        TritonModelConfig = 8997,
-    }
-
     #[rstest]
     #[tokio::test]
     async fn test_infer_failure(
@@ -1971,4 +1973,86 @@ pub mod kserve_test {
         cancel_token.cancel();
         let _ = tokio::join!(grpc_task, http_task);
     }
+
+    #[rstest]
+    #[tokio::test]
+    async fn test_live_ready() {
+        let grpc_port = TestPort::LiveReady as u16;
+        let service = KserveService::builder().port(grpc_port).build().unwrap();
+
+        // start server
+        let _running = RunningService::spawn(service.clone());
+
+        let mut client = get_ready_client(grpc_port, 5).await;
+
+        // Check server liveness
+        let server_live_request = tonic::Request::new(ServerLiveRequest {});
+        let server_live_response = client.server_live(server_live_request).await.unwrap();
+        let server_live = server_live_response.get_ref().live;
+        assert!(server_live, "Server should be live");
+
+        // Check server readiness
+        let server_ready_request = tonic::Request::new(ServerReadyRequest {});
+        let server_ready_response = client.server_ready(server_ready_request).await.unwrap();
+        let server_ready = server_ready_response.get_ref().ready;
+        assert!(
+            !server_ready,
+            "Server should not be ready without model registered"
+        );
+
+        // Check model readiness for unregistered model
+        let model_ready_request = tonic::Request::new(ModelReadyRequest {
+            name: "tensor".into(),
+            version: "".into(),
+        });
+        let model_ready_response = client.model_ready(model_ready_request).await.unwrap();
+        let model_ready = model_ready_response.get_ref().ready;
+        assert!(!model_ready, "Unregistered model should not be ready");
+
+        // Register a tensor model
+        let mut card = ModelDeploymentCard::with_name_only("tensor");
+        card.model_type = ModelType::TensorBased;
+        card.model_input = ModelInput::Tensor;
+        card.runtime_config = ModelRuntimeConfig {
+            tensor_model_config: Some(tensor::TensorModelConfig {
+                name: "tensor".to_string(),
+                inputs: vec![tensor::TensorMetadata {
+                    name: "input".to_string(),
+                    data_type: tensor::DataType::Int32,
+                    shape: vec![1],
+                    parameters: Default::default(),
+                }],
+                outputs: vec![tensor::TensorMetadata {
+                    name: "output".to_string(),
+                    data_type: tensor::DataType::Bool,
+                    shape: vec![-1],
+                    parameters: Default::default(),
+                }],
+                triton_model_config: None,
+            }),
+            ..Default::default()
+        };
+        let tensor = Arc::new(TensorEngine {});
+        service
+            .model_manager()
+            .add_tensor_model("tensor", card.mdcsum(), tensor.clone())
+            .unwrap();
+        let _ = service.model_manager().save_model_card("key", card);
+
+        // Re-check readiness
+        // Check server readiness
+        let server_ready_request = tonic::Request::new(ServerReadyRequest {});
+        let server_ready_response = client.server_ready(server_ready_request).await.unwrap();
+        let server_ready = server_ready_response.get_ref().ready;
+        assert!(server_ready, "Server should be ready with model registered");
+
+        // Check model readiness for the now-registered model
+        let model_ready_request = tonic::Request::new(ModelReadyRequest {
+            name: "tensor".into(),
+            version: "".into(),
+        });
+        let model_ready_response = client.model_ready(model_ready_request).await.unwrap();
+        let model_ready = model_ready_response.get_ref().ready;
+        assert!(model_ready, "Registered model should be ready");
+    }
 }

tests/frontend/grpc/test_tensor_mocker_engine.py

Lines changed: 1 addition & 0 deletions
@@ -125,5 +125,6 @@ def start_services(request, runtime_services):
 @pytest.mark.integration
 @pytest.mark.model(TEST_MODEL)
 def test_echo() -> None:
+    triton_echo_client.check_health()
     triton_echo_client.run_infer()
     triton_echo_client.get_config()

tests/frontend/grpc/triton_echo_client.py

Lines changed: 11 additions & 13 deletions
@@ -1,19 +1,22 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-import sys

 import numpy as np
 import tritonclient.grpc as grpcclient

+SERVER_URL = "localhost:8000"
+
+
+def check_health():
+    triton_client = grpcclient.InferenceServerClient(url=SERVER_URL)
+    assert triton_client.is_server_live()
+    assert triton_client.is_server_ready()
+    assert triton_client.is_model_ready("echo")
+

 def run_infer():
-    server_url = "localhost:8000"
-    try:
-        triton_client = grpcclient.InferenceServerClient(url=server_url)
-    except Exception as e:
-        print("channel creation failed: " + str(e))
-        sys.exit()
+    triton_client = grpcclient.InferenceServerClient(url=SERVER_URL)

     model_name = "echo"
@@ -46,12 +49,7 @@ def run_infer():


 def get_config():
-    server_url = "localhost:8000"
-    try:
-        triton_client = grpcclient.InferenceServerClient(url=server_url)
-    except Exception as e:
-        print("channel creation failed: " + str(e))
-        sys.exit()
+    triton_client = grpcclient.InferenceServerClient(url=SERVER_URL)

     model_name = "echo"
     response = triton_client.get_model_config(model_name=model_name)