From b1dee6baab32b0676c500d36d34807aebafd4128 Mon Sep 17 00:00:00 2001 From: meher-m Date: Wed, 1 Oct 2025 18:10:12 +0000 Subject: [PATCH 1/3] initial code, cursor --- .../llmengine/data_types/model_endpoints.py | 16 ++ examples/multi_route_client_example.py | 216 ++++++++++++++++++ examples/multi_route_fastapi_server.py | 167 ++++++++++++++ .../common/dtos/model_endpoints.py | 16 ++ .../use_cases/llm_model_endpoint_use_cases.py | 59 ++++- 5 files changed, 469 insertions(+), 5 deletions(-) create mode 100644 examples/multi_route_client_example.py create mode 100644 examples/multi_route_fastapi_server.py diff --git a/clients/python/llmengine/data_types/model_endpoints.py b/clients/python/llmengine/data_types/model_endpoints.py index 2e0877732..3d3756b35 100644 --- a/clients/python/llmengine/data_types/model_endpoints.py +++ b/clients/python/llmengine/data_types/model_endpoints.py @@ -64,6 +64,22 @@ class CreateLLMEndpointRequest(VLLMEndpointAdditionalArgs, BaseModel): default=None, description="A Jinja template to use for this endpoint. If not provided, will use the chat template from the checkpoint", ) + # Route configuration for multiple endpoints support + routes: Optional[List[str]] = Field( + default=None, + description="List of additional routes to forward to the user's service. " + "These routes will be added alongside the default /predict route. " + "Requires passthrough forwarder type." + ) + extra_routes: Optional[List[str]] = Field( + default=None, + description="Legacy field for additional routes. Use 'routes' instead." + ) + forwarder_type: Optional[str] = Field( + default=None, + description="Type of forwarder to use. Set to 'passthrough' to enable " + "multiple route forwarding to your FastAPI service." + ) class CreateLLMEndpointResponse(BaseModel): diff --git a/examples/multi_route_client_example.py b/examples/multi_route_client_example.py new file mode 100644 index 000000000..f5c6d8f90 --- /dev/null +++ b/examples/multi_route_client_example.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +""" +Example demonstrating how to deploy a multi-route FastAPI server using Launch. + +This example shows how to use the new route configuration parameters to deploy +a FastAPI server with multiple endpoints that can be accessed through their +natural paths rather than being restricted to just /predict. +""" + +from llmengine import Model +from llmengine.data_types.model_endpoints import CreateLLMEndpointRequest +from llmengine.data_types.core import ModelEndpointType +import requests +import time + +def create_multi_route_endpoint(): + """ + Create a model endpoint with multiple routes using the new passthrough forwarder. 
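+
+    The hardware, scaling, and model values below are illustrative; the
+    parameters this example actually exercises are the new routes list and
+    forwarder_type="passthrough" arguments introduced in this patch.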
+ """ + + # Define the routes we want to expose from our FastAPI server + custom_routes = [ + "/v1/chat/completions", # OpenAI-compatible chat endpoint + "/v1/completions", # OpenAI-compatible completions endpoint + "/analyze", # Custom analysis endpoint + "/custom/endpoint", # Custom GET endpoint + "/batch/process", # Batch processing endpoint + ] + + print("Creating model endpoint with multiple routes...") + print(f"Routes to be exposed: {custom_routes}") + + # Create the endpoint with multi-route support + response = Model.create( + name="multi-route-fastapi-example", + model="llama-2-7b", # This is just for the bundle creation, our custom server will handle the logic + inference_framework_image_tag="latest", + + # Hardware configuration + cpus=4, + memory="8Gi", + storage="20Gi", + gpus=1, + gpu_type="nvidia-ampere-a10", + + # Scaling configuration + min_workers=1, + max_workers=3, + per_worker=10, + endpoint_type=ModelEndpointType.STREAMING, + + # NEW: Multi-route configuration + routes=custom_routes, # List of routes to forward + forwarder_type="passthrough", # Enable passthrough forwarding + + # Other settings + public_inference=False, + labels={"example": "multi-route", "type": "fastapi"}, + ) + + print(f"Endpoint created! Task ID: {response.endpoint_creation_task_id}") + return response.endpoint_creation_task_id + +def test_multi_route_endpoint(endpoint_name: str, base_url: str): + """ + Test the multi-route endpoint by making requests to different routes. + """ + print(f"\nTesting multi-route endpoint: {endpoint_name}") + print(f"Base URL: {base_url}") + + # Test cases for different routes + test_cases = [ + { + "name": "Traditional Predict", + "method": "POST", + "url": f"{base_url}/predict", + "data": {"text": "Hello world", "model": "custom"} + }, + { + "name": "OpenAI Chat Completions", + "method": "POST", + "url": f"{base_url}/v1/chat/completions", + "data": { + "messages": [{"role": "user", "content": "Hello, how are you?"}], + "model": "gpt-3.5-turbo", + "max_tokens": 50 + } + }, + { + "name": "OpenAI Completions", + "method": "POST", + "url": f"{base_url}/v1/completions", + "data": { + "prompt": "The future of AI is", + "model": "text-davinci-003", + "max_tokens": 50 + } + }, + { + "name": "Custom Analysis", + "method": "POST", + "url": f"{base_url}/analyze", + "data": {"text": "This is a good example of multi-route functionality"} + }, + { + "name": "Custom GET Endpoint", + "method": "GET", + "url": f"{base_url}/custom/endpoint", + "data": None + }, + { + "name": "Batch Processing", + "method": "POST", + "url": f"{base_url}/batch/process", + "data": {"texts": ["First text", "Second text", "Third text"]} + } + ] + + # Execute test cases + for test_case in test_cases: + print(f"\n--- Testing {test_case['name']} ---") + print(f"URL: {test_case['url']}") + + try: + if test_case['method'] == 'GET': + response = requests.get(test_case['url']) + else: + response = requests.post(test_case['url'], json=test_case['data']) + + print(f"Status: {response.status_code}") + if response.status_code == 200: + result = response.json() + print(f"Response: {result}") + else: + print(f"Error: {response.text}") + + except requests.exceptions.RequestException as e: + print(f"Request failed: {e}") + +def main(): + """ + Main example workflow. + """ + + print("=" * 60) + print("Launch Multi-Route FastAPI Server Example") + print("=" * 60) + + print("""\ +This example demonstrates the new multi-route passthrough functionality in Launch. 
+ +Instead of being limited to a single /predict endpoint, you can now: +1. Specify multiple routes to be forwarded to your FastAPI server +2. Use the passthrough forwarder type to enable full HTTP method support +3. Access your endpoints through their natural paths + +Key benefits: +- No more single endpoint limitation +- Full FastAPI server compatibility +- Support for GET, POST, PUT, DELETE, PATCH, HEAD, OPTIONS +- OpenAI-compatible endpoints alongside custom routes +- Easy migration of existing FastAPI applications +""") + + # Step 1: Create the multi-route endpoint + task_id = create_multi_route_endpoint() + + print(f"\nEndpoint creation initiated with task ID: {task_id}") + print("Waiting for endpoint to be ready...") + + # In a real scenario, you would poll the endpoint status + # For this example, we'll simulate waiting + print("⏳ Endpoint is being deployed...") + print("⏳ This may take several minutes...") + + # Step 2: Once ready, test the endpoints + # Note: In practice, you'd get the actual endpoint URL from the Launch API + endpoint_name = "multi-route-fastapi-example" + base_url = f"https://your-launch-domain.com/v1/endpoints/{endpoint_name}" + + print(f"\n✅ Endpoint ready! You can now test it at: {base_url}") + print("\nExample test calls you can make:") + + # Show example curl commands + curl_examples = [ + { + "name": "Traditional predict", + "cmd": f'curl -X POST {base_url}/predict -H "Content-Type: application/json" -d \'{{"text": "Hello world", "model": "custom"}}\'' + }, + { + "name": "OpenAI chat", + "cmd": f'curl -X POST {base_url}/v1/chat/completions -H "Content-Type: application/json" -d \'{{"messages": [{{"role": "user", "content": "Hello!"}}], "model": "gpt-3.5-turbo"}}\'' + }, + { + "name": "Custom analysis", + "cmd": f'curl -X POST {base_url}/analyze -H "Content-Type: application/json" -d \'{{"text": "This is amazing!"}}\'' + }, + { + "name": "Custom GET endpoint", + "cmd": f'curl -X GET {base_url}/custom/endpoint' + } + ] + + for example in curl_examples: + print(f"\n{example['name']}:") + print(f" {example['cmd']}") + + print(f"\n" + "=" * 60) + print("Multi-Route Support Successfully Configured!") + print("=" * 60) + + # Uncomment the following line to run actual tests if you have a deployed endpoint + # test_multi_route_endpoint(endpoint_name, base_url) + +if __name__ == "__main__": + main() diff --git a/examples/multi_route_fastapi_server.py b/examples/multi_route_fastapi_server.py new file mode 100644 index 000000000..40f43f16e --- /dev/null +++ b/examples/multi_route_fastapi_server.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +""" +End-to-end example demonstrating multiple routes passthrough in Launch. + +This example shows how to create a FastAPI server with multiple routes and deploy it +using Launch's model endpoint creation with the passthrough forwarder. + +The server implements several endpoints that would normally require the single /predict +restriction, but now can be accessed through their natural paths. 
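+
+Run this file directly (python multi_route_fastapi_server.py) to serve the
+routes locally; the uvicorn.run call at the bottom binds 0.0.0.0:5005.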
+""" + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +from typing import Dict, List, Optional, Any +import uvicorn + +# FastAPI server with multiple routes +app = FastAPI(title="Multi-Route Example Server", version="1.0.0") + +# Data models +class PredictRequest(BaseModel): + text: str + model: Optional[str] = "default" + +class PredictResponse(BaseModel): + result: str + model: str + route: str + +class HealthResponse(BaseModel): + status: str + routes: List[str] + +class ChatMessage(BaseModel): + role: str + content: str + +class ChatRequest(BaseModel): + messages: List[ChatMessage] + model: Optional[str] = "gpt-3.5-turbo" + max_tokens: Optional[int] = 100 + +class ChatResponse(BaseModel): + choices: List[Dict[str, Any]] + model: str + usage: Dict[str, int] + +class CompletionRequest(BaseModel): + prompt: str + model: Optional[str] = "text-davinci-003" + max_tokens: Optional[int] = 100 + +class CompletionResponse(BaseModel): + choices: List[Dict[str, str]] + model: str + usage: Dict[str, int] + +# Health check endpoint (required by Launch) +@app.get("/health", response_model=HealthResponse) +@app.get("/readyz", response_model=HealthResponse) +def health_check(): + """Health check endpoint required by Launch forwarder.""" + return HealthResponse( + status="healthy", + routes=[ + "/predict", + "/v1/chat/completions", + "/v1/completions", + "/analyze", + "/custom/endpoint" + ] + ) + +# Traditional predict endpoint +@app.post("/predict", response_model=PredictResponse) +def predict(request: PredictRequest): + """Traditional ML prediction endpoint.""" + return PredictResponse( + result=f"Processed text: {request.text}", + model=request.model, + route="/predict" + ) + +# OpenAI-compatible chat completions endpoint +@app.post("/v1/chat/completions", response_model=ChatResponse) +def chat_completions(request: ChatRequest): + """OpenAI-compatible chat completions endpoint.""" + # Simple echo implementation for example + last_message = request.messages[-1] if request.messages else ChatMessage(role="user", content="") + + return ChatResponse( + choices=[{ + "message": { + "role": "assistant", + "content": f"Echo: {last_message.content}" + }, + "finish_reason": "stop", + "index": 0 + }], + model=request.model, + usage={ + "prompt_tokens": len(last_message.content.split()), + "completion_tokens": len(last_message.content.split()) + 1, + "total_tokens": len(last_message.content.split()) * 2 + 1 + } + ) + +# OpenAI-compatible completions endpoint +@app.post("/v1/completions", response_model=CompletionResponse) +def completions(request: CompletionRequest): + """OpenAI-compatible completions endpoint.""" + return CompletionResponse( + choices=[{ + "text": f" -> Completion for: {request.prompt}", + "finish_reason": "stop", + "index": 0 + }], + model=request.model, + usage={ + "prompt_tokens": len(request.prompt.split()), + "completion_tokens": 10, + "total_tokens": len(request.prompt.split()) + 10 + } + ) + +# Custom analysis endpoint +@app.post("/analyze") +def analyze_text(data: Dict[str, Any]): + """Custom text analysis endpoint.""" + text = data.get("text", "") + if not text: + raise HTTPException(status_code=400, detail="Text field is required") + + return { + "analysis": { + "word_count": len(text.split()), + "char_count": len(text), + "sentiment": "positive" if "good" in text.lower() else "neutral" + }, + "text": text, + "route": "/analyze" + } + +# Another custom endpoint +@app.get("/custom/endpoint") +def custom_endpoint(): + """A custom GET endpoint to demonstrate 
method flexibility.""" + return { + "message": "This is a custom endpoint accessible via passthrough routing", + "methods_supported": ["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"], + "route": "/custom/endpoint" + } + +# Batch processing endpoint +@app.post("/batch/process") +def batch_process(data: Dict[str, List[str]]): + """Batch processing endpoint for multiple texts.""" + texts = data.get("texts", []) + return { + "results": [f"Processed: {text}" for text in texts], + "count": len(texts), + "route": "/batch/process" + } + +if __name__ == "__main__": + # Run the server + uvicorn.run(app, host="0.0.0.0", port=5005) diff --git a/model-engine/model_engine_server/common/dtos/model_endpoints.py b/model-engine/model_engine_server/common/dtos/model_endpoints.py index 36a7c7f68..775fc3b40 100644 --- a/model-engine/model_engine_server/common/dtos/model_endpoints.py +++ b/model-engine/model_engine_server/common/dtos/model_endpoints.py @@ -73,6 +73,22 @@ class CreateModelEndpointV1Request(BaseModel): default_callback_url: Optional[HttpUrlStr] = None default_callback_auth: Optional[CallbackAuth] = None public_inference: Optional[bool] = Field(default=False) + # Route configuration for multiple endpoints support + routes: Optional[List[str]] = Field( + default=None, + description="List of additional routes to forward to the user's service. " + "These routes will be added alongside the default /predict route. " + "Requires passthrough forwarder type." + ) + extra_routes: Optional[List[str]] = Field( + default=None, + description="Legacy field for additional routes. Use 'routes' instead." + ) + forwarder_type: Optional[str] = Field( + default=None, + description="Type of forwarder to use. Set to 'passthrough' to enable " + "multiple route forwarding to your FastAPI service." 
+ ) class CreateModelEndpointV1Response(BaseModel): diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 352b7a060..155a027d3 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -392,6 +392,10 @@ async def execute( chat_template_override: Optional[str], nodes_per_worker: int, additional_args: Optional[Dict[str, Any]] = None, + # Route configuration for multiple endpoints support + routes: Optional[List[str]] = None, + extra_routes: Optional[List[str]] = None, + forwarder_type: Optional[str] = None, ) -> ModelBundle: multinode = nodes_per_worker > 1 if source != LLMSource.HUGGING_FACE: @@ -459,6 +463,9 @@ async def execute( checkpoint_path, chat_template_override, additional_args=additional_vllm_args, + routes=routes, + extra_routes=extra_routes, + forwarder_type=forwarder_type, ) else: bundle_id = await self.create_vllm_bundle( @@ -471,6 +478,9 @@ async def execute( checkpoint_path, chat_template_override, additional_args=additional_vllm_args, + routes=routes, + extra_routes=extra_routes, + forwarder_type=forwarder_type, ) case LLMInferenceFramework.SGLANG: # pragma: no cover if not hmi_config.sglang_repository: @@ -991,6 +1001,9 @@ async def create_vllm_bundle( checkpoint_path: Optional[str], chat_template_override: Optional[str], additional_args: Optional[VLLMEndpointAdditionalArgs] = None, + routes: Optional[List[str]] = None, + extra_routes: Optional[List[str]] = None, + forwarder_type: Optional[str] = None, ): command = self._create_vllm_bundle_command( model_name, @@ -1005,6 +1018,20 @@ async def create_vllm_bundle( additional_args=additional_args, ) + # Determine which routes to use - user-provided or defaults + final_routes = [] + final_extra_routes = [] + final_forwarder_type = forwarder_type + + if routes is not None: + final_routes = routes + else: + # Default to OpenAI compatibility routes for VLLM + final_routes = [OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH] + + if extra_routes is not None: + final_extra_routes = extra_routes + create_model_bundle_v2_request = CreateModelBundleV2Request( name=endpoint_unique_name, schema_location="TBA", @@ -1019,10 +1046,9 @@ async def create_vllm_bundle( healthcheck_route="/health", predict_route="/predict", streaming_predict_route="/stream", - routes=[ - OPENAI_CHAT_COMPLETION_PATH, - OPENAI_COMPLETION_PATH, - ], + routes=final_routes, + extra_routes=final_extra_routes, + forwarder_type=final_forwarder_type, env={}, ), metadata={}, @@ -1051,6 +1077,9 @@ async def create_vllm_multinode_bundle( checkpoint_path: Optional[str], chat_template_override: Optional[str], additional_args: Optional[VLLMEndpointAdditionalArgs] = None, + routes: Optional[List[str]] = None, + extra_routes: Optional[List[str]] = None, + forwarder_type: Optional[str] = None, ): leader_command = self._create_vllm_bundle_command( model_name, @@ -1087,6 +1116,20 @@ async def create_vllm_multinode_bundle( "RAY_CLUSTER_SIZE": "$(K8S_LWS_CLUSTER_SIZE)", } + # Determine which routes to use - user-provided or defaults + final_routes = [] + final_extra_routes = [] + final_forwarder_type = forwarder_type + + if routes is not None: + final_routes = routes + else: + # Default to OpenAI compatibility routes for VLLM + final_routes = [OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH] + + if extra_routes is not 
None: + final_extra_routes = extra_routes + create_model_bundle_v2_request = CreateModelBundleV2Request( name=endpoint_unique_name, schema_location="TBA", @@ -1101,7 +1144,9 @@ async def create_vllm_multinode_bundle( healthcheck_route="/health", predict_route="/predict", streaming_predict_route="/stream", - routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH], + routes=final_routes, + extra_routes=final_extra_routes, + forwarder_type=final_forwarder_type, env=common_vllm_envs, worker_command=worker_command, worker_env=common_vllm_envs, @@ -1343,6 +1388,10 @@ async def execute( chat_template_override=request.chat_template_override, nodes_per_worker=request.nodes_per_worker, additional_args=request.model_dump(exclude_none=True), + # Pass route configuration to bundle creation + routes=request.routes, + extra_routes=request.extra_routes, + forwarder_type=request.forwarder_type, ) validate_resource_requests( bundle=bundle, From e9de35ba7348d6ff7d946ba89f1eedf523ef7d80 Mon Sep 17 00:00:00 2001 From: meher-m Date: Tue, 14 Oct 2025 15:41:59 +0000 Subject: [PATCH 2/3] reformat --- .../llmengine/data_types/model_endpoints.py | 7 +- examples/multi_route_client_example.py | 63 +++++++++-------- examples/multi_route_fastapi_server.py | 67 +++++++++++-------- .../common/dtos/model_endpoints.py | 7 +- 4 files changed, 77 insertions(+), 67 deletions(-) diff --git a/clients/python/llmengine/data_types/model_endpoints.py b/clients/python/llmengine/data_types/model_endpoints.py index 3d3756b35..d6f23aa19 100644 --- a/clients/python/llmengine/data_types/model_endpoints.py +++ b/clients/python/llmengine/data_types/model_endpoints.py @@ -69,16 +69,15 @@ class CreateLLMEndpointRequest(VLLMEndpointAdditionalArgs, BaseModel): default=None, description="List of additional routes to forward to the user's service. " "These routes will be added alongside the default /predict route. " - "Requires passthrough forwarder type." + "Requires passthrough forwarder type.", ) extra_routes: Optional[List[str]] = Field( - default=None, - description="Legacy field for additional routes. Use 'routes' instead." + default=None, description="Legacy field for additional routes. Use 'routes' instead." ) forwarder_type: Optional[str] = Field( default=None, description="Type of forwarder to use. Set to 'passthrough' to enable " - "multiple route forwarding to your FastAPI service." + "multiple route forwarding to your FastAPI service.", ) diff --git a/examples/multi_route_client_example.py b/examples/multi_route_client_example.py index f5c6d8f90..2ca433f5e 100644 --- a/examples/multi_route_client_example.py +++ b/examples/multi_route_client_example.py @@ -13,6 +13,7 @@ import requests import time + def create_multi_route_endpoint(): """ Create a model endpoint with multiple routes using the new passthrough forwarder. 
@@ -20,11 +21,11 @@ def create_multi_route_endpoint(): # Define the routes we want to expose from our FastAPI server custom_routes = [ - "/v1/chat/completions", # OpenAI-compatible chat endpoint - "/v1/completions", # OpenAI-compatible completions endpoint - "/analyze", # Custom analysis endpoint - "/custom/endpoint", # Custom GET endpoint - "/batch/process", # Batch processing endpoint + "/v1/chat/completions", # OpenAI-compatible chat endpoint + "/v1/completions", # OpenAI-compatible completions endpoint + "/analyze", # Custom analysis endpoint + "/custom/endpoint", # Custom GET endpoint + "/batch/process", # Batch processing endpoint ] print("Creating model endpoint with multiple routes...") @@ -35,24 +36,20 @@ def create_multi_route_endpoint(): name="multi-route-fastapi-example", model="llama-2-7b", # This is just for the bundle creation, our custom server will handle the logic inference_framework_image_tag="latest", - # Hardware configuration cpus=4, memory="8Gi", storage="20Gi", gpus=1, gpu_type="nvidia-ampere-a10", - # Scaling configuration min_workers=1, max_workers=3, per_worker=10, endpoint_type=ModelEndpointType.STREAMING, - # NEW: Multi-route configuration - routes=custom_routes, # List of routes to forward - forwarder_type="passthrough", # Enable passthrough forwarding - + routes=custom_routes, # List of routes to forward + forwarder_type="passthrough", # Enable passthrough forwarding # Other settings public_inference=False, labels={"example": "multi-route", "type": "fastapi"}, @@ -61,6 +58,7 @@ def create_multi_route_endpoint(): print(f"Endpoint created! Task ID: {response.endpoint_creation_task_id}") return response.endpoint_creation_task_id + def test_multi_route_endpoint(endpoint_name: str, base_url: str): """ Test the multi-route endpoint by making requests to different routes. 
@@ -74,7 +72,7 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): "name": "Traditional Predict", "method": "POST", "url": f"{base_url}/predict", - "data": {"text": "Hello world", "model": "custom"} + "data": {"text": "Hello world", "model": "custom"}, }, { "name": "OpenAI Chat Completions", @@ -83,8 +81,8 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): "data": { "messages": [{"role": "user", "content": "Hello, how are you?"}], "model": "gpt-3.5-turbo", - "max_tokens": 50 - } + "max_tokens": 50, + }, }, { "name": "OpenAI Completions", @@ -93,27 +91,27 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): "data": { "prompt": "The future of AI is", "model": "text-davinci-003", - "max_tokens": 50 - } + "max_tokens": 50, + }, }, { "name": "Custom Analysis", "method": "POST", "url": f"{base_url}/analyze", - "data": {"text": "This is a good example of multi-route functionality"} + "data": {"text": "This is a good example of multi-route functionality"}, }, { "name": "Custom GET Endpoint", "method": "GET", "url": f"{base_url}/custom/endpoint", - "data": None + "data": None, }, { "name": "Batch Processing", "method": "POST", "url": f"{base_url}/batch/process", - "data": {"texts": ["First text", "Second text", "Third text"]} - } + "data": {"texts": ["First text", "Second text", "Third text"]}, + }, ] # Execute test cases @@ -122,10 +120,10 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): print(f"URL: {test_case['url']}") try: - if test_case['method'] == 'GET': - response = requests.get(test_case['url']) + if test_case["method"] == "GET": + response = requests.get(test_case["url"]) else: - response = requests.post(test_case['url'], json=test_case['data']) + response = requests.post(test_case["url"], json=test_case["data"]) print(f"Status: {response.status_code}") if response.status_code == 200: @@ -137,6 +135,7 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): except requests.exceptions.RequestException as e: print(f"Request failed: {e}") + def main(): """ Main example workflow. @@ -146,7 +145,8 @@ def main(): print("Launch Multi-Route FastAPI Server Example") print("=" * 60) - print("""\ + print( + """\ This example demonstrates the new multi-route passthrough functionality in Launch. 
Instead of being limited to a single /predict endpoint, you can now: @@ -160,7 +160,8 @@ def main(): - Support for GET, POST, PUT, DELETE, PATCH, HEAD, OPTIONS - OpenAI-compatible endpoints alongside custom routes - Easy migration of existing FastAPI applications -""") +""" + ) # Step 1: Create the multi-route endpoint task_id = create_multi_route_endpoint() @@ -185,20 +186,17 @@ def main(): curl_examples = [ { "name": "Traditional predict", - "cmd": f'curl -X POST {base_url}/predict -H "Content-Type: application/json" -d \'{{"text": "Hello world", "model": "custom"}}\'' + "cmd": f'curl -X POST {base_url}/predict -H "Content-Type: application/json" -d \'{{"text": "Hello world", "model": "custom"}}\'', }, { "name": "OpenAI chat", - "cmd": f'curl -X POST {base_url}/v1/chat/completions -H "Content-Type: application/json" -d \'{{"messages": [{{"role": "user", "content": "Hello!"}}], "model": "gpt-3.5-turbo"}}\'' + "cmd": f'curl -X POST {base_url}/v1/chat/completions -H "Content-Type: application/json" -d \'{{"messages": [{{"role": "user", "content": "Hello!"}}], "model": "gpt-3.5-turbo"}}\'', }, { "name": "Custom analysis", - "cmd": f'curl -X POST {base_url}/analyze -H "Content-Type: application/json" -d \'{{"text": "This is amazing!"}}\'' + "cmd": f'curl -X POST {base_url}/analyze -H "Content-Type: application/json" -d \'{{"text": "This is amazing!"}}\'', }, - { - "name": "Custom GET endpoint", - "cmd": f'curl -X GET {base_url}/custom/endpoint' - } + {"name": "Custom GET endpoint", "cmd": f"curl -X GET {base_url}/custom/endpoint"}, ] for example in curl_examples: @@ -212,5 +210,6 @@ def main(): # Uncomment the following line to run actual tests if you have a deployed endpoint # test_multi_route_endpoint(endpoint_name, base_url) + if __name__ == "__main__": main() diff --git a/examples/multi_route_fastapi_server.py b/examples/multi_route_fastapi_server.py index 40f43f16e..e0c06a11a 100644 --- a/examples/multi_route_fastapi_server.py +++ b/examples/multi_route_fastapi_server.py @@ -17,44 +17,53 @@ # FastAPI server with multiple routes app = FastAPI(title="Multi-Route Example Server", version="1.0.0") + # Data models class PredictRequest(BaseModel): text: str model: Optional[str] = "default" + class PredictResponse(BaseModel): result: str model: str route: str + class HealthResponse(BaseModel): status: str routes: List[str] + class ChatMessage(BaseModel): role: str content: str + class ChatRequest(BaseModel): messages: List[ChatMessage] model: Optional[str] = "gpt-3.5-turbo" max_tokens: Optional[int] = 100 + class ChatResponse(BaseModel): choices: List[Dict[str, Any]] model: str usage: Dict[str, int] + class CompletionRequest(BaseModel): prompt: str model: Optional[str] = "text-davinci-003" max_tokens: Optional[int] = 100 + class CompletionResponse(BaseModel): choices: List[Dict[str, str]] model: str usage: Dict[str, int] + # Health check endpoint (required by Launch) @app.get("/health", response_model=HealthResponse) @app.get("/readyz", response_model=HealthResponse) @@ -67,62 +76,63 @@ def health_check(): "/v1/chat/completions", "/v1/completions", "/analyze", - "/custom/endpoint" - ] + "/custom/endpoint", + ], ) + # Traditional predict endpoint @app.post("/predict", response_model=PredictResponse) def predict(request: PredictRequest): """Traditional ML prediction endpoint.""" return PredictResponse( - result=f"Processed text: {request.text}", - model=request.model, - route="/predict" + result=f"Processed text: {request.text}", model=request.model, route="/predict" ) + # OpenAI-compatible chat 
completions endpoint @app.post("/v1/chat/completions", response_model=ChatResponse) def chat_completions(request: ChatRequest): """OpenAI-compatible chat completions endpoint.""" # Simple echo implementation for example - last_message = request.messages[-1] if request.messages else ChatMessage(role="user", content="") + last_message = ( + request.messages[-1] if request.messages else ChatMessage(role="user", content="") + ) return ChatResponse( - choices=[{ - "message": { - "role": "assistant", - "content": f"Echo: {last_message.content}" - }, - "finish_reason": "stop", - "index": 0 - }], + choices=[ + { + "message": {"role": "assistant", "content": f"Echo: {last_message.content}"}, + "finish_reason": "stop", + "index": 0, + } + ], model=request.model, usage={ "prompt_tokens": len(last_message.content.split()), "completion_tokens": len(last_message.content.split()) + 1, - "total_tokens": len(last_message.content.split()) * 2 + 1 - } + "total_tokens": len(last_message.content.split()) * 2 + 1, + }, ) + # OpenAI-compatible completions endpoint @app.post("/v1/completions", response_model=CompletionResponse) def completions(request: CompletionRequest): """OpenAI-compatible completions endpoint.""" return CompletionResponse( - choices=[{ - "text": f" -> Completion for: {request.prompt}", - "finish_reason": "stop", - "index": 0 - }], + choices=[ + {"text": f" -> Completion for: {request.prompt}", "finish_reason": "stop", "index": 0} + ], model=request.model, usage={ "prompt_tokens": len(request.prompt.split()), "completion_tokens": 10, - "total_tokens": len(request.prompt.split()) + 10 - } + "total_tokens": len(request.prompt.split()) + 10, + }, ) + # Custom analysis endpoint @app.post("/analyze") def analyze_text(data: Dict[str, Any]): @@ -135,12 +145,13 @@ def analyze_text(data: Dict[str, Any]): "analysis": { "word_count": len(text.split()), "char_count": len(text), - "sentiment": "positive" if "good" in text.lower() else "neutral" + "sentiment": "positive" if "good" in text.lower() else "neutral", }, "text": text, - "route": "/analyze" + "route": "/analyze", } + # Another custom endpoint @app.get("/custom/endpoint") def custom_endpoint(): @@ -148,9 +159,10 @@ def custom_endpoint(): return { "message": "This is a custom endpoint accessible via passthrough routing", "methods_supported": ["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"], - "route": "/custom/endpoint" + "route": "/custom/endpoint", } + # Batch processing endpoint @app.post("/batch/process") def batch_process(data: Dict[str, List[str]]): @@ -159,9 +171,10 @@ def batch_process(data: Dict[str, List[str]]): return { "results": [f"Processed: {text}" for text in texts], "count": len(texts), - "route": "/batch/process" + "route": "/batch/process", } + if __name__ == "__main__": # Run the server uvicorn.run(app, host="0.0.0.0", port=5005) diff --git a/model-engine/model_engine_server/common/dtos/model_endpoints.py b/model-engine/model_engine_server/common/dtos/model_endpoints.py index 775fc3b40..18d0aa66f 100644 --- a/model-engine/model_engine_server/common/dtos/model_endpoints.py +++ b/model-engine/model_engine_server/common/dtos/model_endpoints.py @@ -78,16 +78,15 @@ class CreateModelEndpointV1Request(BaseModel): default=None, description="List of additional routes to forward to the user's service. " "These routes will be added alongside the default /predict route. " - "Requires passthrough forwarder type." 
+ "Requires passthrough forwarder type.", ) extra_routes: Optional[List[str]] = Field( - default=None, - description="Legacy field for additional routes. Use 'routes' instead." + default=None, description="Legacy field for additional routes. Use 'routes' instead." ) forwarder_type: Optional[str] = Field( default=None, description="Type of forwarder to use. Set to 'passthrough' to enable " - "multiple route forwarding to your FastAPI service." + "multiple route forwarding to your FastAPI service.", ) From 91b2006221d64ec609cebafb3ceb7c6961937342 Mon Sep 17 00:00:00 2001 From: meher-m Date: Tue, 14 Oct 2025 15:48:33 +0000 Subject: [PATCH 3/3] reformat isort --- examples/multi_route_client_example.py | 7 ++++--- examples/multi_route_fastapi_server.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/multi_route_client_example.py b/examples/multi_route_client_example.py index 2ca433f5e..56caa5c4b 100644 --- a/examples/multi_route_client_example.py +++ b/examples/multi_route_client_example.py @@ -7,11 +7,12 @@ natural paths rather than being restricted to just /predict. """ +import time + +import requests from llmengine import Model -from llmengine.data_types.model_endpoints import CreateLLMEndpointRequest from llmengine.data_types.core import ModelEndpointType -import requests -import time +from llmengine.data_types.model_endpoints import CreateLLMEndpointRequest def create_multi_route_endpoint(): diff --git a/examples/multi_route_fastapi_server.py b/examples/multi_route_fastapi_server.py index e0c06a11a..0f0ad0e7d 100644 --- a/examples/multi_route_fastapi_server.py +++ b/examples/multi_route_fastapi_server.py @@ -9,10 +9,11 @@ restriction, but now can be accessed through their natural paths. """ +from typing import Any, Dict, List, Optional + +import uvicorn from fastapi import FastAPI, HTTPException from pydantic import BaseModel -from typing import Dict, List, Optional, Any -import uvicorn # FastAPI server with multiple routes app = FastAPI(title="Multi-Route Example Server", version="1.0.0")
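
Note on the route-defaulting logic above: the same "user-provided or defaults"
block is duplicated between create_vllm_bundle and create_vllm_multinode_bundle.
Below is a minimal sketch of a shared helper (the function name and the literal
constant values are assumptions; the patch itself keeps the logic inline, and
the OPENAI_* constants already exist in llm_model_endpoint_use_cases.py):

from typing import List, Optional, Tuple

# Assumed values; in the patch these constants come from the surrounding module.
OPENAI_CHAT_COMPLETION_PATH = "/v1/chat/completions"
OPENAI_COMPLETION_PATH = "/v1/completions"


def resolve_route_config(
    routes: Optional[List[str]],
    extra_routes: Optional[List[str]],
) -> Tuple[List[str], List[str]]:
    """Return (routes, extra_routes), defaulting routes to the OpenAI paths."""
    final_routes = (
        routes
        if routes is not None
        # Default to OpenAI compatibility routes for vLLM, as in the patch.
        else [OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH]
    )
    final_extra_routes = extra_routes if extra_routes is not None else []
    return final_routes, final_extra_routes


# Example: user-provided routes win; extra_routes defaults to an empty list.
# resolve_route_config(["/analyze"], None) == (["/analyze"], [])
# resolve_route_config(None, ["/metrics"])
#   == (["/v1/chat/completions", "/v1/completions"], ["/metrics"])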