104 changes: 104 additions & 0 deletions lib/llm/src/grpc/protos/kserve.proto
@@ -16,6 +16,27 @@ import "model_config.proto";
//@@
service GRPCInferenceService
{
//@@ .. cpp:var:: rpc ServerLive(ServerLiveRequest) returns
//@@ (ServerLiveResponse)
//@@
//@@ Check liveness of the inference server.
//@@
rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {}

//@@ .. cpp:var:: rpc ServerReady(ServerReadyRequest) returns
//@@ (ServerReadyResponse)
//@@
//@@ Check readiness of the inference server.
//@@
rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {}

//@@ .. cpp:var:: rpc ModelReady(ModelReadyRequest) returns
//@@ (ModelReadyResponse)
//@@
//@@ Check readiness of a model in the inference server.
//@@
rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {}

//@@ .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns
//@@ (ModelMetadataResponse)
//@@
@@ -45,6 +66,89 @@ service GRPCInferenceService
rpc ModelConfig(ModelConfigRequest) returns (ModelConfigResponse) {}
}

//@@
//@@.. cpp:var:: message ServerLiveRequest
//@@
//@@ Request message for ServerLive.
//@@
message ServerLiveRequest {}

//@@
//@@.. cpp:var:: message ServerLiveResponse
//@@
//@@ Response message for ServerLive.
//@@
message ServerLiveResponse
{
//@@
//@@ .. cpp:var:: bool live
//@@
//@@ True if the inference server is live, false if not live.
//@@
bool live = 1;
}

//@@
//@@.. cpp:var:: message ServerReadyRequest
//@@
//@@ Request message for ServerReady.
//@@
message ServerReadyRequest {}

//@@
//@@.. cpp:var:: message ServerReadyResponse
//@@
//@@ Response message for ServerReady.
//@@
message ServerReadyResponse
{
//@@
//@@ .. cpp:var:: bool ready
//@@
//@@ True if the inference server is ready, false if not ready. The server
//@@ is considered ready if it has any registered models, since models
//@@ can freely be registered and unregistered at runtime.
//@@
bool ready = 1;
}

//@@
//@@.. cpp:var:: message ModelReadyRequest
//@@
//@@ Request message for ModelReady.
//@@
message ModelReadyRequest
{
//@@
//@@ .. cpp:var:: string name
//@@
//@@ The name of the model to check for readiness.
//@@
string name = 1;

//@@ .. cpp:var:: string version
//@@
//@@ The version of the model to check for readiness. If not given the
//@@ server will choose a version based on the model and internal policy.
//@@
string version = 2;
}

//@@
//@@.. cpp:var:: message ModelReadyResponse
//@@
//@@ Response message for ModelReady.
//@@
message ModelReadyResponse
{
//@@
//@@ .. cpp:var:: bool ready
//@@
//@@ True if the model is ready, false if not ready.
//@@
bool ready = 1;
}

//@@
//@@.. cpp:var:: message ModelMetadataRequest
//@@
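For reference, the three new health RPCs can be exercised from any gRPC client generated off this proto. Below is a minimal Rust sketch (not part of this change) that assumes the proto is compiled into an `inference` module via `tonic::include_proto!("inference")`, that tonic's transport feature is enabled, and that the endpoint and model name are placeholders:

// Illustrative sketch only (not part of this PR). Assumes `inference` is the
// module produced by `tonic::include_proto!("inference")`, tonic's transport
// feature is enabled, and the endpoint/model name are placeholders.
use inference::grpc_inference_service_client::GrpcInferenceServiceClient;
use inference::{ModelReadyRequest, ServerLiveRequest, ServerReadyRequest};

async fn probe_health() -> Result<(), Box<dyn std::error::Error>> {
    let mut client = GrpcInferenceServiceClient::connect("http://127.0.0.1:8000").await?;

    // Liveness: true whenever the server can answer at all.
    let live = client
        .server_live(tonic::Request::new(ServerLiveRequest {}))
        .await?
        .into_inner()
        .live;

    // Server readiness: true once at least one model is registered.
    let ready = client
        .server_ready(tonic::Request::new(ServerReadyRequest {}))
        .await?
        .into_inner()
        .ready;

    // Per-model readiness: matched by model name; version may be left empty.
    let model_ready = client
        .model_ready(tonic::Request::new(ModelReadyRequest {
            name: "echo".into(),
            version: "".into(),
        }))
        .await?
        .into_inner()
        .ready;

    println!("live={live} server_ready={ready} model_ready={model_ready}");
    Ok(())
}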
34 changes: 34 additions & 0 deletions lib/llm/src/grpc/service/kserve.rs
@@ -675,4 +675,38 @@ impl GrpcInferenceService for KserveService {
request_model_name
)))
}

async fn server_live(
&self,
_request: Request<inference::ServerLiveRequest>,
) -> Result<Response<inference::ServerLiveResponse>, Status> {
// server is live if we can respond
Ok(Response::new(inference::ServerLiveResponse { live: true }))
}

async fn server_ready(
&self,
_request: Request<inference::ServerReadyRequest>,
) -> Result<Response<inference::ServerReadyResponse>, Status> {
let has_models = !self.state.manager().get_model_cards().is_empty();
Ok(Response::new(inference::ServerReadyResponse {
ready: has_models,
}))
}

async fn model_ready(
&self,
request: Request<inference::ModelReadyRequest>,
) -> Result<Response<inference::ModelReadyResponse>, Status> {
let request_model_name = &request.into_inner().name;
let is_ready = self
.state
.manager()
.get_model_cards()
.into_iter()
.any(|card| request_model_name == &card.display_name);
Ok(Response::new(inference::ModelReadyResponse {
ready: is_ready,
}))
}
}
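Because ServerReady only flips to true once a model card is registered, callers will typically poll it before routing traffic, much as the tests below do through their get_ready_client helper. A minimal wait-until-ready sketch under the same assumptions as the probe above (tonic-generated `inference` module, tokio runtime); the helper name, retry budget, and sleep interval are illustrative:

// Illustrative sketch only (not part of this PR). Same assumptions as the
// probe above: tonic-generated `inference` module and a tokio runtime; the
// retry budget and sleep interval are arbitrary placeholders.
use std::time::Duration;

use inference::grpc_inference_service_client::GrpcInferenceServiceClient;
use inference::ServerReadyRequest;
use tonic::transport::Channel;

async fn wait_until_ready(
    client: &mut GrpcInferenceServiceClient<Channel>,
    attempts: u32,
) -> bool {
    for _ in 0..attempts {
        // ServerReady flips to true once at least one model card is registered.
        if let Ok(resp) = client
            .server_ready(tonic::Request::new(ServerReadyRequest {}))
            .await
        {
            if resp.get_ref().ready {
                return true;
            }
        }
        tokio::time::sleep(Duration::from_millis(500)).await;
    }
    false
}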
114 changes: 99 additions & 15 deletions lib/llm/tests/kserve_service.rs
@@ -5,6 +5,22 @@
mod ports;

pub mod kserve_test {
// [gluo NOTE] Tests may run in parallel, use this enum to keep track of
// port used for different test cases
enum TestPort {
InferFailure = 8988,
InferSuccess = 8989,
StreamInferFailure = 8990,
StreamInferSuccess = 8991,
InferCancellation = 8992,
StreamInferCancellation = 8993,
ModelInfo = 8994,
TensorModel = 8995,
TensorModelTypes = 8996,
TritonModelConfig = 8997,
LiveReady = 8998,
}

// For using gRPC client for test
pub mod inference {
tonic::include_proto!("inference");
@@ -16,6 +32,7 @@ pub mod kserve_test {
use inference::grpc_inference_service_client::GrpcInferenceServiceClient;
use inference::{
DataType, ModelConfigRequest, ModelInferRequest, ModelInferResponse, ModelMetadataRequest,
ModelReadyRequest, ServerLiveRequest, ServerReadyRequest,
};

use anyhow::Error;
@@ -354,21 +371,6 @@ pub mod kserve_test {
}
}

// Tests may run in parallel, use this enum to keep track of port used for different
// test cases
enum TestPort {
InferFailure = 8988,
InferSuccess = 8989,
StreamInferFailure = 8990,
StreamInferSuccess = 8991,
InferCancellation = 8992,
StreamInferCancellation = 8993,
ModelInfo = 8994,
TensorModel = 8995,
TensorModelTypes = 8996,
TritonModelConfig = 8997,
}

#[rstest]
#[tokio::test]
async fn test_infer_failure(
@@ -1971,4 +1973,86 @@ pub mod kserve_test {
cancel_token.cancel();
let _ = tokio::join!(grpc_task, http_task);
}

#[rstest]
#[tokio::test]
async fn test_live_ready() {
let grpc_port = TestPort::LiveReady as u16;
let service = KserveService::builder().port(grpc_port).build().unwrap();

// start server
let _running = RunningService::spawn(service.clone());

let mut client = get_ready_client(grpc_port, 5).await;

// Check server liveness
let server_live_request = tonic::Request::new(ServerLiveRequest {});
let server_live_response = client.server_live(server_live_request).await.unwrap();
let server_live = server_live_response.get_ref().live;
assert!(server_live, "Server should be live");

// Check server readiness
let server_ready_request = tonic::Request::new(ServerReadyRequest {});
let server_ready_response = client.server_ready(server_ready_request).await.unwrap();
let server_ready = server_ready_response.get_ref().ready;
assert!(
!server_ready,
"Server should not be ready without model registered"
);

// Check model readiness for unregistered model
let model_ready_request = tonic::Request::new(ModelReadyRequest {
name: "tensor".into(),
version: "".into(),
});
let model_ready_response = client.model_ready(model_ready_request).await.unwrap();
let model_ready = model_ready_response.get_ref().ready;
assert!(!model_ready, "Unregistered model should not be ready");

// Register a tensor model
let mut card = ModelDeploymentCard::with_name_only("tensor");
card.model_type = ModelType::TensorBased;
card.model_input = ModelInput::Tensor;
card.runtime_config = ModelRuntimeConfig {
tensor_model_config: Some(tensor::TensorModelConfig {
name: "tensor".to_string(),
inputs: vec![tensor::TensorMetadata {
name: "input".to_string(),
data_type: tensor::DataType::Int32,
shape: vec![1],
parameters: Default::default(),
}],
outputs: vec![tensor::TensorMetadata {
name: "output".to_string(),
data_type: tensor::DataType::Bool,
shape: vec![-1],
parameters: Default::default(),
}],
triton_model_config: None,
}),
..Default::default()
};
let tensor = Arc::new(TensorEngine {});
service
.model_manager()
.add_tensor_model("tensor", card.mdcsum(), tensor.clone())
.unwrap();
let _ = service.model_manager().save_model_card("key", card);

// Re-check server readiness now that a model is registered
let server_ready_request = tonic::Request::new(ServerReadyRequest {});
let server_ready_response = client.server_ready(server_ready_request).await.unwrap();
let server_ready = server_ready_response.get_ref().ready;
assert!(server_ready, "Server should be ready with model registered");

// Check model readiness for the registered model
let model_ready_request = tonic::Request::new(ModelReadyRequest {
name: "tensor".into(),
version: "".into(),
});
let model_ready_response = client.model_ready(model_ready_request).await.unwrap();
let model_ready = model_ready_response.get_ref().ready;
assert!(model_ready, "Registered model should be ready");
}
}
1 change: 1 addition & 0 deletions tests/frontend/grpc/test_tensor_mocker_engine.py
@@ -125,5 +125,6 @@ def start_services(request, runtime_services):
@pytest.mark.integration
@pytest.mark.model(TEST_MODEL)
def test_echo() -> None:
triton_echo_client.check_health()
triton_echo_client.run_infer()
triton_echo_client.get_config()
24 changes: 11 additions & 13 deletions tests/frontend/grpc/triton_echo_client.py
@@ -1,19 +1,22 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import sys

import numpy as np
import tritonclient.grpc as grpcclient

SERVER_URL = "localhost:8000"


def check_health():
triton_client = grpcclient.InferenceServerClient(url=SERVER_URL)
assert triton_client.is_server_live()
assert triton_client.is_server_ready()
assert triton_client.is_model_ready("echo")


def run_infer():
server_url = "localhost:8000"
try:
triton_client = grpcclient.InferenceServerClient(url=server_url)
except Exception as e:
print("channel creation failed: " + str(e))
sys.exit()
triton_client = grpcclient.InferenceServerClient(url=SERVER_URL)

model_name = "echo"

@@ -46,12 +49,7 @@ def run_infer():


def get_config():
server_url = "localhost:8000"
try:
triton_client = grpcclient.InferenceServerClient(url=server_url)
except Exception as e:
print("channel creation failed: " + str(e))
sys.exit()
triton_client = grpcclient.InferenceServerClient(url=SERVER_URL)

model_name = "echo"
response = triton_client.get_model_config(model_name=model_name)