From b1dee6baab32b0676c500d36d34807aebafd4128 Mon Sep 17 00:00:00 2001 From: meher-m Date: Wed, 1 Oct 2025 18:10:12 +0000 Subject: [PATCH 1/3] initial code, cursor --- .../llmengine/data_types/model_endpoints.py | 16 ++ examples/multi_route_client_example.py | 216 ++++++++++++++++++ examples/multi_route_fastapi_server.py | 167 ++++++++++++++ .../common/dtos/model_endpoints.py | 16 ++ .../use_cases/llm_model_endpoint_use_cases.py | 59 ++++- 5 files changed, 469 insertions(+), 5 deletions(-) create mode 100644 examples/multi_route_client_example.py create mode 100644 examples/multi_route_fastapi_server.py diff --git a/clients/python/llmengine/data_types/model_endpoints.py b/clients/python/llmengine/data_types/model_endpoints.py index 2e0877732..3d3756b35 100644 --- a/clients/python/llmengine/data_types/model_endpoints.py +++ b/clients/python/llmengine/data_types/model_endpoints.py @@ -64,6 +64,22 @@ class CreateLLMEndpointRequest(VLLMEndpointAdditionalArgs, BaseModel): default=None, description="A Jinja template to use for this endpoint. If not provided, will use the chat template from the checkpoint", ) + # Route configuration for multiple endpoints support + routes: Optional[List[str]] = Field( + default=None, + description="List of additional routes to forward to the user's service. " + "These routes will be added alongside the default /predict route. " + "Requires passthrough forwarder type." + ) + extra_routes: Optional[List[str]] = Field( + default=None, + description="Legacy field for additional routes. Use 'routes' instead." + ) + forwarder_type: Optional[str] = Field( + default=None, + description="Type of forwarder to use. Set to 'passthrough' to enable " + "multiple route forwarding to your FastAPI service." + ) class CreateLLMEndpointResponse(BaseModel): diff --git a/examples/multi_route_client_example.py b/examples/multi_route_client_example.py new file mode 100644 index 000000000..f5c6d8f90 --- /dev/null +++ b/examples/multi_route_client_example.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +""" +Example demonstrating how to deploy a multi-route FastAPI server using Launch. + +This example shows how to use the new route configuration parameters to deploy +a FastAPI server with multiple endpoints that can be accessed through their +natural paths rather than being restricted to just /predict. +""" + +from llmengine import Model +from llmengine.data_types.model_endpoints import CreateLLMEndpointRequest +from llmengine.data_types.core import ModelEndpointType +import requests +import time + +def create_multi_route_endpoint(): + """ + Create a model endpoint with multiple routes using the new passthrough forwarder. 
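+
+    The hardware, scaling, and model values below are illustrative; the
+    parameters this example actually exercises are the new routes list and
+    forwarder_type="passthrough" arguments introduced in this patch.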
+ """ + + # Define the routes we want to expose from our FastAPI server + custom_routes = [ + "/v1/chat/completions", # OpenAI-compatible chat endpoint + "/v1/completions", # OpenAI-compatible completions endpoint + "/analyze", # Custom analysis endpoint + "/custom/endpoint", # Custom GET endpoint + "/batch/process", # Batch processing endpoint + ] + + print("Creating model endpoint with multiple routes...") + print(f"Routes to be exposed: {custom_routes}") + + # Create the endpoint with multi-route support + response = Model.create( + name="multi-route-fastapi-example", + model="llama-2-7b", # This is just for the bundle creation, our custom server will handle the logic + inference_framework_image_tag="latest", + + # Hardware configuration + cpus=4, + memory="8Gi", + storage="20Gi", + gpus=1, + gpu_type="nvidia-ampere-a10", + + # Scaling configuration + min_workers=1, + max_workers=3, + per_worker=10, + endpoint_type=ModelEndpointType.STREAMING, + + # NEW: Multi-route configuration + routes=custom_routes, # List of routes to forward + forwarder_type="passthrough", # Enable passthrough forwarding + + # Other settings + public_inference=False, + labels={"example": "multi-route", "type": "fastapi"}, + ) + + print(f"Endpoint created! Task ID: {response.endpoint_creation_task_id}") + return response.endpoint_creation_task_id + +def test_multi_route_endpoint(endpoint_name: str, base_url: str): + """ + Test the multi-route endpoint by making requests to different routes. + """ + print(f"\nTesting multi-route endpoint: {endpoint_name}") + print(f"Base URL: {base_url}") + + # Test cases for different routes + test_cases = [ + { + "name": "Traditional Predict", + "method": "POST", + "url": f"{base_url}/predict", + "data": {"text": "Hello world", "model": "custom"} + }, + { + "name": "OpenAI Chat Completions", + "method": "POST", + "url": f"{base_url}/v1/chat/completions", + "data": { + "messages": [{"role": "user", "content": "Hello, how are you?"}], + "model": "gpt-3.5-turbo", + "max_tokens": 50 + } + }, + { + "name": "OpenAI Completions", + "method": "POST", + "url": f"{base_url}/v1/completions", + "data": { + "prompt": "The future of AI is", + "model": "text-davinci-003", + "max_tokens": 50 + } + }, + { + "name": "Custom Analysis", + "method": "POST", + "url": f"{base_url}/analyze", + "data": {"text": "This is a good example of multi-route functionality"} + }, + { + "name": "Custom GET Endpoint", + "method": "GET", + "url": f"{base_url}/custom/endpoint", + "data": None + }, + { + "name": "Batch Processing", + "method": "POST", + "url": f"{base_url}/batch/process", + "data": {"texts": ["First text", "Second text", "Third text"]} + } + ] + + # Execute test cases + for test_case in test_cases: + print(f"\n--- Testing {test_case['name']} ---") + print(f"URL: {test_case['url']}") + + try: + if test_case['method'] == 'GET': + response = requests.get(test_case['url']) + else: + response = requests.post(test_case['url'], json=test_case['data']) + + print(f"Status: {response.status_code}") + if response.status_code == 200: + result = response.json() + print(f"Response: {result}") + else: + print(f"Error: {response.text}") + + except requests.exceptions.RequestException as e: + print(f"Request failed: {e}") + +def main(): + """ + Main example workflow. + """ + + print("=" * 60) + print("Launch Multi-Route FastAPI Server Example") + print("=" * 60) + + print("""\ +This example demonstrates the new multi-route passthrough functionality in Launch. 
+ +Instead of being limited to a single /predict endpoint, you can now: +1. Specify multiple routes to be forwarded to your FastAPI server +2. Use the passthrough forwarder type to enable full HTTP method support +3. Access your endpoints through their natural paths + +Key benefits: +- No more single endpoint limitation +- Full FastAPI server compatibility +- Support for GET, POST, PUT, DELETE, PATCH, HEAD, OPTIONS +- OpenAI-compatible endpoints alongside custom routes +- Easy migration of existing FastAPI applications +""") + + # Step 1: Create the multi-route endpoint + task_id = create_multi_route_endpoint() + + print(f"\nEndpoint creation initiated with task ID: {task_id}") + print("Waiting for endpoint to be ready...") + + # In a real scenario, you would poll the endpoint status + # For this example, we'll simulate waiting + print("⏳ Endpoint is being deployed...") + print("⏳ This may take several minutes...") + + # Step 2: Once ready, test the endpoints + # Note: In practice, you'd get the actual endpoint URL from the Launch API + endpoint_name = "multi-route-fastapi-example" + base_url = f"https://your-launch-domain.com/v1/endpoints/{endpoint_name}" + + print(f"\n✅ Endpoint ready! You can now test it at: {base_url}") + print("\nExample test calls you can make:") + + # Show example curl commands + curl_examples = [ + { + "name": "Traditional predict", + "cmd": f'curl -X POST {base_url}/predict -H "Content-Type: application/json" -d \'{{"text": "Hello world", "model": "custom"}}\'' + }, + { + "name": "OpenAI chat", + "cmd": f'curl -X POST {base_url}/v1/chat/completions -H "Content-Type: application/json" -d \'{{"messages": [{{"role": "user", "content": "Hello!"}}], "model": "gpt-3.5-turbo"}}\'' + }, + { + "name": "Custom analysis", + "cmd": f'curl -X POST {base_url}/analyze -H "Content-Type: application/json" -d \'{{"text": "This is amazing!"}}\'' + }, + { + "name": "Custom GET endpoint", + "cmd": f'curl -X GET {base_url}/custom/endpoint' + } + ] + + for example in curl_examples: + print(f"\n{example['name']}:") + print(f" {example['cmd']}") + + print(f"\n" + "=" * 60) + print("Multi-Route Support Successfully Configured!") + print("=" * 60) + + # Uncomment the following line to run actual tests if you have a deployed endpoint + # test_multi_route_endpoint(endpoint_name, base_url) + +if __name__ == "__main__": + main() diff --git a/examples/multi_route_fastapi_server.py b/examples/multi_route_fastapi_server.py new file mode 100644 index 000000000..40f43f16e --- /dev/null +++ b/examples/multi_route_fastapi_server.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +""" +End-to-end example demonstrating multiple routes passthrough in Launch. + +This example shows how to create a FastAPI server with multiple routes and deploy it +using Launch's model endpoint creation with the passthrough forwarder. + +The server implements several endpoints that would normally require the single /predict +restriction, but now can be accessed through their natural paths. 
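+
+Run this file directly (python multi_route_fastapi_server.py) to serve the
+routes locally; the uvicorn.run call at the bottom binds 0.0.0.0:5005.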
+""" + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +from typing import Dict, List, Optional, Any +import uvicorn + +# FastAPI server with multiple routes +app = FastAPI(title="Multi-Route Example Server", version="1.0.0") + +# Data models +class PredictRequest(BaseModel): + text: str + model: Optional[str] = "default" + +class PredictResponse(BaseModel): + result: str + model: str + route: str + +class HealthResponse(BaseModel): + status: str + routes: List[str] + +class ChatMessage(BaseModel): + role: str + content: str + +class ChatRequest(BaseModel): + messages: List[ChatMessage] + model: Optional[str] = "gpt-3.5-turbo" + max_tokens: Optional[int] = 100 + +class ChatResponse(BaseModel): + choices: List[Dict[str, Any]] + model: str + usage: Dict[str, int] + +class CompletionRequest(BaseModel): + prompt: str + model: Optional[str] = "text-davinci-003" + max_tokens: Optional[int] = 100 + +class CompletionResponse(BaseModel): + choices: List[Dict[str, str]] + model: str + usage: Dict[str, int] + +# Health check endpoint (required by Launch) +@app.get("/health", response_model=HealthResponse) +@app.get("/readyz", response_model=HealthResponse) +def health_check(): + """Health check endpoint required by Launch forwarder.""" + return HealthResponse( + status="healthy", + routes=[ + "/predict", + "/v1/chat/completions", + "/v1/completions", + "/analyze", + "/custom/endpoint" + ] + ) + +# Traditional predict endpoint +@app.post("/predict", response_model=PredictResponse) +def predict(request: PredictRequest): + """Traditional ML prediction endpoint.""" + return PredictResponse( + result=f"Processed text: {request.text}", + model=request.model, + route="/predict" + ) + +# OpenAI-compatible chat completions endpoint +@app.post("/v1/chat/completions", response_model=ChatResponse) +def chat_completions(request: ChatRequest): + """OpenAI-compatible chat completions endpoint.""" + # Simple echo implementation for example + last_message = request.messages[-1] if request.messages else ChatMessage(role="user", content="") + + return ChatResponse( + choices=[{ + "message": { + "role": "assistant", + "content": f"Echo: {last_message.content}" + }, + "finish_reason": "stop", + "index": 0 + }], + model=request.model, + usage={ + "prompt_tokens": len(last_message.content.split()), + "completion_tokens": len(last_message.content.split()) + 1, + "total_tokens": len(last_message.content.split()) * 2 + 1 + } + ) + +# OpenAI-compatible completions endpoint +@app.post("/v1/completions", response_model=CompletionResponse) +def completions(request: CompletionRequest): + """OpenAI-compatible completions endpoint.""" + return CompletionResponse( + choices=[{ + "text": f" -> Completion for: {request.prompt}", + "finish_reason": "stop", + "index": 0 + }], + model=request.model, + usage={ + "prompt_tokens": len(request.prompt.split()), + "completion_tokens": 10, + "total_tokens": len(request.prompt.split()) + 10 + } + ) + +# Custom analysis endpoint +@app.post("/analyze") +def analyze_text(data: Dict[str, Any]): + """Custom text analysis endpoint.""" + text = data.get("text", "") + if not text: + raise HTTPException(status_code=400, detail="Text field is required") + + return { + "analysis": { + "word_count": len(text.split()), + "char_count": len(text), + "sentiment": "positive" if "good" in text.lower() else "neutral" + }, + "text": text, + "route": "/analyze" + } + +# Another custom endpoint +@app.get("/custom/endpoint") +def custom_endpoint(): + """A custom GET endpoint to demonstrate 
method flexibility.""" + return { + "message": "This is a custom endpoint accessible via passthrough routing", + "methods_supported": ["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"], + "route": "/custom/endpoint" + } + +# Batch processing endpoint +@app.post("/batch/process") +def batch_process(data: Dict[str, List[str]]): + """Batch processing endpoint for multiple texts.""" + texts = data.get("texts", []) + return { + "results": [f"Processed: {text}" for text in texts], + "count": len(texts), + "route": "/batch/process" + } + +if __name__ == "__main__": + # Run the server + uvicorn.run(app, host="0.0.0.0", port=5005) diff --git a/model-engine/model_engine_server/common/dtos/model_endpoints.py b/model-engine/model_engine_server/common/dtos/model_endpoints.py index 36a7c7f68..775fc3b40 100644 --- a/model-engine/model_engine_server/common/dtos/model_endpoints.py +++ b/model-engine/model_engine_server/common/dtos/model_endpoints.py @@ -73,6 +73,22 @@ class CreateModelEndpointV1Request(BaseModel): default_callback_url: Optional[HttpUrlStr] = None default_callback_auth: Optional[CallbackAuth] = None public_inference: Optional[bool] = Field(default=False) + # Route configuration for multiple endpoints support + routes: Optional[List[str]] = Field( + default=None, + description="List of additional routes to forward to the user's service. " + "These routes will be added alongside the default /predict route. " + "Requires passthrough forwarder type." + ) + extra_routes: Optional[List[str]] = Field( + default=None, + description="Legacy field for additional routes. Use 'routes' instead." + ) + forwarder_type: Optional[str] = Field( + default=None, + description="Type of forwarder to use. Set to 'passthrough' to enable " + "multiple route forwarding to your FastAPI service." 
+ ) class CreateModelEndpointV1Response(BaseModel): diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 352b7a060..155a027d3 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -392,6 +392,10 @@ async def execute( chat_template_override: Optional[str], nodes_per_worker: int, additional_args: Optional[Dict[str, Any]] = None, + # Route configuration for multiple endpoints support + routes: Optional[List[str]] = None, + extra_routes: Optional[List[str]] = None, + forwarder_type: Optional[str] = None, ) -> ModelBundle: multinode = nodes_per_worker > 1 if source != LLMSource.HUGGING_FACE: @@ -459,6 +463,9 @@ async def execute( checkpoint_path, chat_template_override, additional_args=additional_vllm_args, + routes=routes, + extra_routes=extra_routes, + forwarder_type=forwarder_type, ) else: bundle_id = await self.create_vllm_bundle( @@ -471,6 +478,9 @@ async def execute( checkpoint_path, chat_template_override, additional_args=additional_vllm_args, + routes=routes, + extra_routes=extra_routes, + forwarder_type=forwarder_type, ) case LLMInferenceFramework.SGLANG: # pragma: no cover if not hmi_config.sglang_repository: @@ -991,6 +1001,9 @@ async def create_vllm_bundle( checkpoint_path: Optional[str], chat_template_override: Optional[str], additional_args: Optional[VLLMEndpointAdditionalArgs] = None, + routes: Optional[List[str]] = None, + extra_routes: Optional[List[str]] = None, + forwarder_type: Optional[str] = None, ): command = self._create_vllm_bundle_command( model_name, @@ -1005,6 +1018,20 @@ async def create_vllm_bundle( additional_args=additional_args, ) + # Determine which routes to use - user-provided or defaults + final_routes = [] + final_extra_routes = [] + final_forwarder_type = forwarder_type + + if routes is not None: + final_routes = routes + else: + # Default to OpenAI compatibility routes for VLLM + final_routes = [OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH] + + if extra_routes is not None: + final_extra_routes = extra_routes + create_model_bundle_v2_request = CreateModelBundleV2Request( name=endpoint_unique_name, schema_location="TBA", @@ -1019,10 +1046,9 @@ async def create_vllm_bundle( healthcheck_route="/health", predict_route="/predict", streaming_predict_route="/stream", - routes=[ - OPENAI_CHAT_COMPLETION_PATH, - OPENAI_COMPLETION_PATH, - ], + routes=final_routes, + extra_routes=final_extra_routes, + forwarder_type=final_forwarder_type, env={}, ), metadata={}, @@ -1051,6 +1077,9 @@ async def create_vllm_multinode_bundle( checkpoint_path: Optional[str], chat_template_override: Optional[str], additional_args: Optional[VLLMEndpointAdditionalArgs] = None, + routes: Optional[List[str]] = None, + extra_routes: Optional[List[str]] = None, + forwarder_type: Optional[str] = None, ): leader_command = self._create_vllm_bundle_command( model_name, @@ -1087,6 +1116,20 @@ async def create_vllm_multinode_bundle( "RAY_CLUSTER_SIZE": "$(K8S_LWS_CLUSTER_SIZE)", } + # Determine which routes to use - user-provided or defaults + final_routes = [] + final_extra_routes = [] + final_forwarder_type = forwarder_type + + if routes is not None: + final_routes = routes + else: + # Default to OpenAI compatibility routes for VLLM + final_routes = [OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH] + + if extra_routes is not 
None: + final_extra_routes = extra_routes + create_model_bundle_v2_request = CreateModelBundleV2Request( name=endpoint_unique_name, schema_location="TBA", @@ -1101,7 +1144,9 @@ async def create_vllm_multinode_bundle( healthcheck_route="/health", predict_route="/predict", streaming_predict_route="/stream", - routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH], + routes=final_routes, + extra_routes=final_extra_routes, + forwarder_type=final_forwarder_type, env=common_vllm_envs, worker_command=worker_command, worker_env=common_vllm_envs, @@ -1343,6 +1388,10 @@ async def execute( chat_template_override=request.chat_template_override, nodes_per_worker=request.nodes_per_worker, additional_args=request.model_dump(exclude_none=True), + # Pass route configuration to bundle creation + routes=request.routes, + extra_routes=request.extra_routes, + forwarder_type=request.forwarder_type, ) validate_resource_requests( bundle=bundle, From e9de35ba7348d6ff7d946ba89f1eedf523ef7d80 Mon Sep 17 00:00:00 2001 From: meher-m Date: Tue, 14 Oct 2025 15:41:59 +0000 Subject: [PATCH 2/3] reformat --- .../llmengine/data_types/model_endpoints.py | 7 +- examples/multi_route_client_example.py | 63 +++++++++-------- examples/multi_route_fastapi_server.py | 67 +++++++++++-------- .../common/dtos/model_endpoints.py | 7 +- 4 files changed, 77 insertions(+), 67 deletions(-) diff --git a/clients/python/llmengine/data_types/model_endpoints.py b/clients/python/llmengine/data_types/model_endpoints.py index 3d3756b35..d6f23aa19 100644 --- a/clients/python/llmengine/data_types/model_endpoints.py +++ b/clients/python/llmengine/data_types/model_endpoints.py @@ -69,16 +69,15 @@ class CreateLLMEndpointRequest(VLLMEndpointAdditionalArgs, BaseModel): default=None, description="List of additional routes to forward to the user's service. " "These routes will be added alongside the default /predict route. " - "Requires passthrough forwarder type." + "Requires passthrough forwarder type.", ) extra_routes: Optional[List[str]] = Field( - default=None, - description="Legacy field for additional routes. Use 'routes' instead." + default=None, description="Legacy field for additional routes. Use 'routes' instead." ) forwarder_type: Optional[str] = Field( default=None, description="Type of forwarder to use. Set to 'passthrough' to enable " - "multiple route forwarding to your FastAPI service." + "multiple route forwarding to your FastAPI service.", ) diff --git a/examples/multi_route_client_example.py b/examples/multi_route_client_example.py index f5c6d8f90..2ca433f5e 100644 --- a/examples/multi_route_client_example.py +++ b/examples/multi_route_client_example.py @@ -13,6 +13,7 @@ import requests import time + def create_multi_route_endpoint(): """ Create a model endpoint with multiple routes using the new passthrough forwarder. 
@@ -20,11 +21,11 @@ def create_multi_route_endpoint(): # Define the routes we want to expose from our FastAPI server custom_routes = [ - "/v1/chat/completions", # OpenAI-compatible chat endpoint - "/v1/completions", # OpenAI-compatible completions endpoint - "/analyze", # Custom analysis endpoint - "/custom/endpoint", # Custom GET endpoint - "/batch/process", # Batch processing endpoint + "/v1/chat/completions", # OpenAI-compatible chat endpoint + "/v1/completions", # OpenAI-compatible completions endpoint + "/analyze", # Custom analysis endpoint + "/custom/endpoint", # Custom GET endpoint + "/batch/process", # Batch processing endpoint ] print("Creating model endpoint with multiple routes...") @@ -35,24 +36,20 @@ def create_multi_route_endpoint(): name="multi-route-fastapi-example", model="llama-2-7b", # This is just for the bundle creation, our custom server will handle the logic inference_framework_image_tag="latest", - # Hardware configuration cpus=4, memory="8Gi", storage="20Gi", gpus=1, gpu_type="nvidia-ampere-a10", - # Scaling configuration min_workers=1, max_workers=3, per_worker=10, endpoint_type=ModelEndpointType.STREAMING, - # NEW: Multi-route configuration - routes=custom_routes, # List of routes to forward - forwarder_type="passthrough", # Enable passthrough forwarding - + routes=custom_routes, # List of routes to forward + forwarder_type="passthrough", # Enable passthrough forwarding # Other settings public_inference=False, labels={"example": "multi-route", "type": "fastapi"}, @@ -61,6 +58,7 @@ def create_multi_route_endpoint(): print(f"Endpoint created! Task ID: {response.endpoint_creation_task_id}") return response.endpoint_creation_task_id + def test_multi_route_endpoint(endpoint_name: str, base_url: str): """ Test the multi-route endpoint by making requests to different routes. 
@@ -74,7 +72,7 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): "name": "Traditional Predict", "method": "POST", "url": f"{base_url}/predict", - "data": {"text": "Hello world", "model": "custom"} + "data": {"text": "Hello world", "model": "custom"}, }, { "name": "OpenAI Chat Completions", @@ -83,8 +81,8 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): "data": { "messages": [{"role": "user", "content": "Hello, how are you?"}], "model": "gpt-3.5-turbo", - "max_tokens": 50 - } + "max_tokens": 50, + }, }, { "name": "OpenAI Completions", @@ -93,27 +91,27 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): "data": { "prompt": "The future of AI is", "model": "text-davinci-003", - "max_tokens": 50 - } + "max_tokens": 50, + }, }, { "name": "Custom Analysis", "method": "POST", "url": f"{base_url}/analyze", - "data": {"text": "This is a good example of multi-route functionality"} + "data": {"text": "This is a good example of multi-route functionality"}, }, { "name": "Custom GET Endpoint", "method": "GET", "url": f"{base_url}/custom/endpoint", - "data": None + "data": None, }, { "name": "Batch Processing", "method": "POST", "url": f"{base_url}/batch/process", - "data": {"texts": ["First text", "Second text", "Third text"]} - } + "data": {"texts": ["First text", "Second text", "Third text"]}, + }, ] # Execute test cases @@ -122,10 +120,10 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): print(f"URL: {test_case['url']}") try: - if test_case['method'] == 'GET': - response = requests.get(test_case['url']) + if test_case["method"] == "GET": + response = requests.get(test_case["url"]) else: - response = requests.post(test_case['url'], json=test_case['data']) + response = requests.post(test_case["url"], json=test_case["data"]) print(f"Status: {response.status_code}") if response.status_code == 200: @@ -137,6 +135,7 @@ def test_multi_route_endpoint(endpoint_name: str, base_url: str): except requests.exceptions.RequestException as e: print(f"Request failed: {e}") + def main(): """ Main example workflow. @@ -146,7 +145,8 @@ def main(): print("Launch Multi-Route FastAPI Server Example") print("=" * 60) - print("""\ + print( + """\ This example demonstrates the new multi-route passthrough functionality in Launch. 
Instead of being limited to a single /predict endpoint, you can now: @@ -160,7 +160,8 @@ def main(): - Support for GET, POST, PUT, DELETE, PATCH, HEAD, OPTIONS - OpenAI-compatible endpoints alongside custom routes - Easy migration of existing FastAPI applications -""") +""" + ) # Step 1: Create the multi-route endpoint task_id = create_multi_route_endpoint() @@ -185,20 +186,17 @@ def main(): curl_examples = [ { "name": "Traditional predict", - "cmd": f'curl -X POST {base_url}/predict -H "Content-Type: application/json" -d \'{{"text": "Hello world", "model": "custom"}}\'' + "cmd": f'curl -X POST {base_url}/predict -H "Content-Type: application/json" -d \'{{"text": "Hello world", "model": "custom"}}\'', }, { "name": "OpenAI chat", - "cmd": f'curl -X POST {base_url}/v1/chat/completions -H "Content-Type: application/json" -d \'{{"messages": [{{"role": "user", "content": "Hello!"}}], "model": "gpt-3.5-turbo"}}\'' + "cmd": f'curl -X POST {base_url}/v1/chat/completions -H "Content-Type: application/json" -d \'{{"messages": [{{"role": "user", "content": "Hello!"}}], "model": "gpt-3.5-turbo"}}\'', }, { "name": "Custom analysis", - "cmd": f'curl -X POST {base_url}/analyze -H "Content-Type: application/json" -d \'{{"text": "This is amazing!"}}\'' + "cmd": f'curl -X POST {base_url}/analyze -H "Content-Type: application/json" -d \'{{"text": "This is amazing!"}}\'', }, - { - "name": "Custom GET endpoint", - "cmd": f'curl -X GET {base_url}/custom/endpoint' - } + {"name": "Custom GET endpoint", "cmd": f"curl -X GET {base_url}/custom/endpoint"}, ] for example in curl_examples: @@ -212,5 +210,6 @@ def main(): # Uncomment the following line to run actual tests if you have a deployed endpoint # test_multi_route_endpoint(endpoint_name, base_url) + if __name__ == "__main__": main() diff --git a/examples/multi_route_fastapi_server.py b/examples/multi_route_fastapi_server.py index 40f43f16e..e0c06a11a 100644 --- a/examples/multi_route_fastapi_server.py +++ b/examples/multi_route_fastapi_server.py @@ -17,44 +17,53 @@ # FastAPI server with multiple routes app = FastAPI(title="Multi-Route Example Server", version="1.0.0") + # Data models class PredictRequest(BaseModel): text: str model: Optional[str] = "default" + class PredictResponse(BaseModel): result: str model: str route: str + class HealthResponse(BaseModel): status: str routes: List[str] + class ChatMessage(BaseModel): role: str content: str + class ChatRequest(BaseModel): messages: List[ChatMessage] model: Optional[str] = "gpt-3.5-turbo" max_tokens: Optional[int] = 100 + class ChatResponse(BaseModel): choices: List[Dict[str, Any]] model: str usage: Dict[str, int] + class CompletionRequest(BaseModel): prompt: str model: Optional[str] = "text-davinci-003" max_tokens: Optional[int] = 100 + class CompletionResponse(BaseModel): choices: List[Dict[str, str]] model: str usage: Dict[str, int] + # Health check endpoint (required by Launch) @app.get("/health", response_model=HealthResponse) @app.get("/readyz", response_model=HealthResponse) @@ -67,62 +76,63 @@ def health_check(): "/v1/chat/completions", "/v1/completions", "/analyze", - "/custom/endpoint" - ] + "/custom/endpoint", + ], ) + # Traditional predict endpoint @app.post("/predict", response_model=PredictResponse) def predict(request: PredictRequest): """Traditional ML prediction endpoint.""" return PredictResponse( - result=f"Processed text: {request.text}", - model=request.model, - route="/predict" + result=f"Processed text: {request.text}", model=request.model, route="/predict" ) + # OpenAI-compatible chat 
completions endpoint @app.post("/v1/chat/completions", response_model=ChatResponse) def chat_completions(request: ChatRequest): """OpenAI-compatible chat completions endpoint.""" # Simple echo implementation for example - last_message = request.messages[-1] if request.messages else ChatMessage(role="user", content="") + last_message = ( + request.messages[-1] if request.messages else ChatMessage(role="user", content="") + ) return ChatResponse( - choices=[{ - "message": { - "role": "assistant", - "content": f"Echo: {last_message.content}" - }, - "finish_reason": "stop", - "index": 0 - }], + choices=[ + { + "message": {"role": "assistant", "content": f"Echo: {last_message.content}"}, + "finish_reason": "stop", + "index": 0, + } + ], model=request.model, usage={ "prompt_tokens": len(last_message.content.split()), "completion_tokens": len(last_message.content.split()) + 1, - "total_tokens": len(last_message.content.split()) * 2 + 1 - } + "total_tokens": len(last_message.content.split()) * 2 + 1, + }, ) + # OpenAI-compatible completions endpoint @app.post("/v1/completions", response_model=CompletionResponse) def completions(request: CompletionRequest): """OpenAI-compatible completions endpoint.""" return CompletionResponse( - choices=[{ - "text": f" -> Completion for: {request.prompt}", - "finish_reason": "stop", - "index": 0 - }], + choices=[ + {"text": f" -> Completion for: {request.prompt}", "finish_reason": "stop", "index": 0} + ], model=request.model, usage={ "prompt_tokens": len(request.prompt.split()), "completion_tokens": 10, - "total_tokens": len(request.prompt.split()) + 10 - } + "total_tokens": len(request.prompt.split()) + 10, + }, ) + # Custom analysis endpoint @app.post("/analyze") def analyze_text(data: Dict[str, Any]): @@ -135,12 +145,13 @@ def analyze_text(data: Dict[str, Any]): "analysis": { "word_count": len(text.split()), "char_count": len(text), - "sentiment": "positive" if "good" in text.lower() else "neutral" + "sentiment": "positive" if "good" in text.lower() else "neutral", }, "text": text, - "route": "/analyze" + "route": "/analyze", } + # Another custom endpoint @app.get("/custom/endpoint") def custom_endpoint(): @@ -148,9 +159,10 @@ def custom_endpoint(): return { "message": "This is a custom endpoint accessible via passthrough routing", "methods_supported": ["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS"], - "route": "/custom/endpoint" + "route": "/custom/endpoint", } + # Batch processing endpoint @app.post("/batch/process") def batch_process(data: Dict[str, List[str]]): @@ -159,9 +171,10 @@ def batch_process(data: Dict[str, List[str]]): return { "results": [f"Processed: {text}" for text in texts], "count": len(texts), - "route": "/batch/process" + "route": "/batch/process", } + if __name__ == "__main__": # Run the server uvicorn.run(app, host="0.0.0.0", port=5005) diff --git a/model-engine/model_engine_server/common/dtos/model_endpoints.py b/model-engine/model_engine_server/common/dtos/model_endpoints.py index 775fc3b40..18d0aa66f 100644 --- a/model-engine/model_engine_server/common/dtos/model_endpoints.py +++ b/model-engine/model_engine_server/common/dtos/model_endpoints.py @@ -78,16 +78,15 @@ class CreateModelEndpointV1Request(BaseModel): default=None, description="List of additional routes to forward to the user's service. " "These routes will be added alongside the default /predict route. " - "Requires passthrough forwarder type." 
+ "Requires passthrough forwarder type.", ) extra_routes: Optional[List[str]] = Field( - default=None, - description="Legacy field for additional routes. Use 'routes' instead." + default=None, description="Legacy field for additional routes. Use 'routes' instead." ) forwarder_type: Optional[str] = Field( default=None, description="Type of forwarder to use. Set to 'passthrough' to enable " - "multiple route forwarding to your FastAPI service." + "multiple route forwarding to your FastAPI service.", ) From 91b2006221d64ec609cebafb3ceb7c6961937342 Mon Sep 17 00:00:00 2001 From: meher-m Date: Tue, 14 Oct 2025 15:48:33 +0000 Subject: [PATCH 3/3] reformat isort --- examples/multi_route_client_example.py | 7 ++++--- examples/multi_route_fastapi_server.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/multi_route_client_example.py b/examples/multi_route_client_example.py index 2ca433f5e..56caa5c4b 100644 --- a/examples/multi_route_client_example.py +++ b/examples/multi_route_client_example.py @@ -7,11 +7,12 @@ natural paths rather than being restricted to just /predict. """ +import time + +import requests from llmengine import Model -from llmengine.data_types.model_endpoints import CreateLLMEndpointRequest from llmengine.data_types.core import ModelEndpointType -import requests -import time +from llmengine.data_types.model_endpoints import CreateLLMEndpointRequest def create_multi_route_endpoint(): diff --git a/examples/multi_route_fastapi_server.py b/examples/multi_route_fastapi_server.py index e0c06a11a..0f0ad0e7d 100644 --- a/examples/multi_route_fastapi_server.py +++ b/examples/multi_route_fastapi_server.py @@ -9,10 +9,11 @@ restriction, but now can be accessed through their natural paths. """ +from typing import Any, Dict, List, Optional + +import uvicorn from fastapi import FastAPI, HTTPException from pydantic import BaseModel -from typing import Dict, List, Optional, Any -import uvicorn # FastAPI server with multiple routes app = FastAPI(title="Multi-Route Example Server", version="1.0.0")
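
Note on the route-defaulting logic above: the same "user-provided or defaults"
block is duplicated between create_vllm_bundle and create_vllm_multinode_bundle.
Below is a minimal sketch of a shared helper (the function name and the literal
constant values are assumptions; the patch itself keeps the logic inline, and
the OPENAI_* constants already exist in llm_model_endpoint_use_cases.py):

from typing import List, Optional, Tuple

# Assumed values; in the patch these constants come from the surrounding module.
OPENAI_CHAT_COMPLETION_PATH = "/v1/chat/completions"
OPENAI_COMPLETION_PATH = "/v1/completions"


def resolve_route_config(
    routes: Optional[List[str]],
    extra_routes: Optional[List[str]],
) -> Tuple[List[str], List[str]]:
    """Return (routes, extra_routes), defaulting routes to the OpenAI paths."""
    final_routes = (
        routes
        if routes is not None
        # Default to OpenAI compatibility routes for vLLM, as in the patch.
        else [OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH]
    )
    final_extra_routes = extra_routes if extra_routes is not None else []
    return final_routes, final_extra_routes


# Example: user-provided routes win; extra_routes defaults to an empty list.
# resolve_route_config(["/analyze"], None) == (["/analyze"], [])
# resolve_route_config(None, ["/metrics"])
#   == (["/v1/chat/completions", "/v1/completions"], ["/metrics"])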