102 changes: 102 additions & 0 deletions lib/llm/src/grpc/protos/kserve.proto
@@ -16,6 +16,27 @@ import "model_config.proto";
//@@
service GRPCInferenceService
{
//@@ .. cpp:var:: rpc ServerLive(ServerLiveRequest) returns
//@@ (ServerLiveResponse)
//@@
//@@ Check liveness of the inference server.
//@@
rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {}

//@@ .. cpp:var:: rpc ServerReady(ServerReadyRequest) returns
//@@ (ServerReadyResponse)
//@@
//@@ Check readiness of the inference server.
//@@
rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {}

//@@ .. cpp:var:: rpc ModelReady(ModelReadyRequest) returns
//@@ (ModelReadyResponse)
//@@
//@@ Check readiness of a model in the inference server.
//@@
rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {}

//@@ .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns
//@@ (ModelMetadataResponse)
//@@
@@ -45,6 +66,87 @@ service GRPCInferenceService
rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {}
}

//@@
//@@.. cpp:var:: message ServerLiveRequest
//@@
//@@ Request message for ServerLive.
//@@
message ServerLiveRequest {}

//@@
//@@.. cpp:var:: message ServerLiveResponse
//@@
//@@ Response message for ServerLive.
//@@
message ServerLiveResponse
{
//@@
//@@ .. cpp:var:: bool live
//@@
//@@     True if the inference server is live, false if not live.
//@@
bool live = 1;
}

//@@
//@@.. cpp:var:: message ServerReadyRequest
//@@
//@@ Request message for ServerReady.
//@@
message ServerReadyRequest {}

//@@
//@@.. cpp:var:: message ServerReadyResponse
//@@
//@@ Response message for ServerReady.
//@@
message ServerReadyResponse
{
//@@
//@@ .. cpp:var:: bool ready
//@@
//@@     True if the inference server is ready, false if not ready.
//@@
bool ready = 1;
}

//@@
//@@.. cpp:var:: message ModelReadyRequest
//@@
//@@ Request message for ModelReady.
//@@
message ModelReadyRequest
{
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The name of the model to check for readiness.
//@@
string name = 1;

//@@ .. cpp:var:: string version
//@@
//@@     The version of the model to check for readiness. If not given, the
//@@ server will choose a version based on the model and internal policy.
//@@
string version = 2;
}

//@@
//@@.. cpp:var:: message ModelReadyResponse
//@@
//@@ Response message for ModelReady.
//@@
message ModelReadyResponse
{
//@@
//@@ .. cpp:var:: bool ready
//@@
//@@     True if the model is ready, false if not ready.
//@@
bool ready = 1;
}

//@@
//@@.. cpp:var:: message ModelMetadataRequest
//@@
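These additions mirror the standard KServe/Triton health RPCs, so an existing Triton gRPC client can exercise them as-is. A minimal client-side sketch follows; the endpoint localhost:8000 and the model name "echo" are taken from the echo client test further below and are assumptions about the deployment, not part of this change:

import tritonclient.grpc as grpcclient

# Health-check sketch: one call per new RPC.
client = grpcclient.InferenceServerClient(url="localhost:8000")
print(client.is_server_live())        # ServerLive: true once the server responds
print(client.is_server_ready())       # ServerReady: true once at least one model is registered
print(client.is_model_ready("echo"))  # ModelReady: true for a registered model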
34 changes: 34 additions & 0 deletions lib/llm/src/grpc/service/kserve.rs
@@ -675,4 +675,38 @@ impl GrpcInferenceService for KserveService {
request_model_name
)))
}

async fn server_live(
&self,
_request: Request<inference::ServerLiveRequest>,
) -> Result<Response<inference::ServerLiveResponse>, Status> {
// server is live if we can respond
Ok(Response::new(inference::ServerLiveResponse { live: true }))
}

async fn server_ready(
&self,
_request: Request<inference::ServerReadyRequest>,
) -> Result<Response<inference::ServerReadyResponse>, Status> {
let has_models = !self.state.manager().get_model_cards().is_empty();
Ok(Response::new(inference::ServerReadyResponse {
ready: has_models,
}))
}

async fn model_ready(
&self,
request: Request<inference::ModelReadyRequest>,
) -> Result<Response<inference::ModelReadyResponse>, Status> {
let request_model_name = &request.into_inner().name;
let is_ready = self
.state
.manager()
.get_model_cards()
.into_iter()
.any(|card| request_model_name == &card.display_name);
Ok(Response::new(inference::ModelReadyResponse {
ready: is_ready,
}))
}
}
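Because server_ready reports true only once at least one model card is registered with the model manager, a client started alongside the frontend may want to poll readiness before issuing inference requests. A possible sketch; the wait_until_ready helper, endpoint, and timeout values are illustrative and not part of this change:

import time

import tritonclient.grpc as grpcclient


def wait_until_ready(url="localhost:8000", timeout_s=30.0, poll_s=0.5) -> bool:
    """Poll ServerReady until a model is registered or the timeout expires."""
    client = grpcclient.InferenceServerClient(url=url)
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            if client.is_server_ready():
                return True
        except Exception:
            # The channel may not be up yet; keep polling until the deadline.
            pass
        time.sleep(poll_s)
    return False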
114 changes: 99 additions & 15 deletions lib/llm/tests/kserve_service.rs
@@ -5,6 +5,22 @@
mod ports;

pub mod kserve_test {
// [gluo NOTE] Tests may run in parallel; use this enum to keep track of the
// port used by each test case
enum TestPort {
InferFailure = 8988,
InferSuccess = 8989,
StreamInferFailure = 8990,
StreamInferSuccess = 8991,
InferCancellation = 8992,
StreamInferCancellation = 8993,
ModelInfo = 8994,
TensorModel = 8995,
TensorModelTypes = 8996,
TritonModelConfig = 8997,
LiveReady = 8998,
}

// For using the gRPC client in tests
pub mod inference {
tonic::include_proto!("inference");
@@ -16,6 +32,7 @@ pub mod kserve_test {
use inference::grpc_inference_service_client::GrpcInferenceServiceClient;
use inference::{
DataType, ModelConfigRequest, ModelInferRequest, ModelInferResponse, ModelMetadataRequest,
ModelReadyRequest, ServerLiveRequest, ServerReadyRequest,
};

use anyhow::Error;
@@ -354,21 +371,6 @@ }
}
}

// Tests may run in parallel, use this enum to keep track of port used for different
// test cases
enum TestPort {
InferFailure = 8988,
InferSuccess = 8989,
StreamInferFailure = 8990,
StreamInferSuccess = 8991,
InferCancellation = 8992,
StreamInferCancellation = 8993,
ModelInfo = 8994,
TensorModel = 8995,
TensorModelTypes = 8996,
TritonModelConfig = 8997,
}

#[rstest]
#[tokio::test]
async fn test_infer_failure(
@@ -1971,4 +1973,86 @@
cancel_token.cancel();
let _ = tokio::join!(grpc_task, http_task);
}

#[rstest]
#[tokio::test]
async fn test_live_ready() {
let grpc_port = TestPort::LiveReady as u16;
let service = KserveService::builder().port(grpc_port).build().unwrap();

// start server
let _running = RunningService::spawn(service.clone());

let mut client = get_ready_client(grpc_port, 5).await;

// Check server liveness
let server_live_request = tonic::Request::new(ServerLiveRequest {});
let server_live_response = client.server_live(server_live_request).await.unwrap();
let server_live = server_live_response.get_ref().live;
assert!(server_live, "Server should be live");

// Check server readiness
let server_ready_request = tonic::Request::new(ServerReadyRequest {});
let server_ready_response = client.server_ready(server_ready_request).await.unwrap();
let server_ready = server_ready_response.get_ref().ready;
assert!(
!server_ready,
"Server should not be ready without model registered"
);

// Check model readiness for an unregistered model
let model_ready_request = tonic::Request::new(ModelReadyRequest {
name: "tensor".into(),
version: "".into(),
});
let model_ready_response = client.model_ready(model_ready_request).await.unwrap();
let model_ready = model_ready_response.get_ref().ready;
assert!(!model_ready, "Unregistered model should not be ready");

// Register a tensor model
let mut card = ModelDeploymentCard::with_name_only("tensor");
card.model_type = ModelType::TensorBased;
card.model_input = ModelInput::Tensor;
card.runtime_config = ModelRuntimeConfig {
tensor_model_config: Some(tensor::TensorModelConfig {
name: "tensor".to_string(),
inputs: vec![tensor::TensorMetadata {
name: "input".to_string(),
data_type: tensor::DataType::Int32,
shape: vec![1],
parameters: Default::default(),
}],
outputs: vec![tensor::TensorMetadata {
name: "output".to_string(),
data_type: tensor::DataType::Bool,
shape: vec![-1],
parameters: Default::default(),
}],
triton_model_config: None,
}),
..Default::default()
};
let tensor = Arc::new(TensorEngine {});
service
.model_manager()
.add_tensor_model("tensor", card.mdcsum(), tensor.clone())
.unwrap();
let _ = service.model_manager().save_model_card("key", card);

// Re-check server readiness now that a model is registered
let server_ready_request = tonic::Request::new(ServerReadyRequest {});
let server_ready_response = client.server_ready(server_ready_request).await.unwrap();
let server_ready = server_ready_response.get_ref().ready;
assert!(server_ready, "Server should be ready with model registered");

// Check model readiness for the registered model
let model_ready_request = tonic::Request::new(ModelReadyRequest {
name: "tensor".into(),
version: "".into(),
});
let model_ready_response = client.model_ready(model_ready_request).await.unwrap();
let model_ready = model_ready_response.get_ref().ready;
assert!(model_ready, "Registered model should be ready");
}
}
1 change: 1 addition & 0 deletions tests/frontend/grpc/test_tensor_mocker_engine.py
@@ -125,5 +125,6 @@ def start_services(request, runtime_services):
@pytest.mark.integration
@pytest.mark.model(TEST_MODEL)
def test_echo() -> None:
triton_echo_client.check_health()
triton_echo_client.run_infer()
triton_echo_client.get_config()
12 changes: 12 additions & 0 deletions tests/frontend/grpc/triton_echo_client.py
@@ -7,6 +7,18 @@
import tritonclient.grpc as grpcclient


def check_health():
server_url = "localhost:8000"
try:
triton_client = grpcclient.InferenceServerClient(url=server_url)
except Exception as e:
print("channel creation failed: " + str(e))
sys.exit()
assert triton_client.is_server_live()
assert triton_client.is_server_ready()
assert triton_client.is_model_ready("echo")


def run_infer():
server_url = "localhost:8000"
try: