Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions components/src/dynamo/sglang/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,23 @@ async def init(runtime: DistributedRuntime, config: Config):
await _handle_non_leader_node(engine, generate_endpoint)
return

# Register engine routes for profiling
async def start_profile_handler(body: dict) -> dict:
    """Handle /engine/start_profile requests.

    Args:
        body: Decoded JSON payload of the request. Its keys are forwarded
            verbatim as keyword arguments to
            ``engine.tokenizer_manager.start_profile``. A missing or empty
            payload is treated as "no options".

    Returns:
        A ``{"status": "ok", ...}`` acknowledgement once the profiler call
        has completed. Exceptions raised by the engine propagate to the
        caller unchanged.
    """
    # A request sent without a JSON payload would otherwise crash on
    # ``**None``; fall back to an empty option set instead.
    profile_options = body or {}
    await engine.tokenizer_manager.start_profile(**profile_options)
    return {"status": "ok", "message": "Profiling started"}

async def stop_profile_handler(body: dict) -> dict:
    """Handle /engine/stop_profile requests.

    ``body`` is accepted so the handler has the same signature as the other
    engine-route handlers, but it is not read.

    Returns:
        A ``{"status": "ok", ...}`` acknowledgement after
        ``engine.tokenizer_manager.stop_profile`` has completed.
    """
    tokenizer_manager = engine.tokenizer_manager
    await tokenizer_manager.stop_profile()
    acknowledgement = {"status": "ok", "message": "Profiling stopped"}
    return acknowledgement

runtime.register_engine_route("start_profile", start_profile_handler)
runtime.register_engine_route("stop_profile", stop_profile_handler)
logging.info(
"Registered engine routes: /engine/start_profile, /engine/stop_profile"
)

prefill_client = None
prefill_router_client = None
if config.serving_mode == DisaggregationMode.DECODE:
Expand Down Expand Up @@ -225,6 +242,23 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
await _handle_non_leader_node(engine, generate_endpoint)
return

# Register engine routes for profiling
async def start_profile_handler(body: dict) -> dict:
    """Handle /engine/start_profile requests on the prefill worker.

    Args:
        body: Decoded JSON payload of the request. Its keys are forwarded
            verbatim as keyword arguments to
            ``engine.tokenizer_manager.start_profile``. A missing or empty
            payload is treated as "no options".

    Returns:
        A ``{"status": "ok", ...}`` acknowledgement once the profiler call
        has completed. Exceptions raised by the engine propagate to the
        caller unchanged.
    """
    # A request sent without a JSON payload would otherwise crash on
    # ``**None``; fall back to an empty option set instead.
    profile_options = body or {}
    await engine.tokenizer_manager.start_profile(**profile_options)
    return {"status": "ok", "message": "Profiling started"}

async def stop_profile_handler(body: dict) -> dict:
    """Handle /engine/stop_profile requests on the prefill worker.

    ``body`` is accepted so the handler has the same signature as the other
    engine-route handlers, but it is not read.

    Returns:
        A ``{"status": "ok", ...}`` acknowledgement after
        ``engine.tokenizer_manager.stop_profile`` has completed.
    """
    tokenizer_manager = engine.tokenizer_manager
    await tokenizer_manager.stop_profile()
    acknowledgement = {"status": "ok", "message": "Profiling stopped"}
    return acknowledgement

runtime.register_engine_route("start_profile", start_profile_handler)
runtime.register_engine_route("stop_profile", stop_profile_handler)
logging.info(
"Registered engine routes: /engine/start_profile, /engine/stop_profile"
)

# Perform dummy warmup for prefill worker to avoid initial TTFT hit
# Only needed on leader node that handles requests
await _warmup_prefill_engine(engine, server_args)
Expand Down
41 changes: 26 additions & 15 deletions components/src/dynamo/vllm/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,9 +467,6 @@ async def init(runtime: DistributedRuntime, config: Config):

generate_endpoint = component.endpoint(config.endpoint)
clear_endpoint = component.endpoint("clear_kv_blocks")
load_lora_endpoint = component.endpoint("load_lora")
unload_lora_endpoint = component.endpoint("unload_lora")
list_loras_endpoint = component.endpoint("list_loras")

factory = StatLoggerFactory(
component,
Expand Down Expand Up @@ -529,6 +526,32 @@ async def init(runtime: DistributedRuntime, config: Config):

setup_metrics_collection(config, generate_endpoint, logger)

# Register engine routes for LoRA management (accessible via /engine/v1/*)
async def _first_lora_result(stream, operation: str) -> dict:
    """Return the first item produced by ``stream`` and then close it.

    Args:
        stream: Async iterable returned by one of the LoRA methods on
            ``handler``.
        operation: Name of the LoRA operation, used only to build the
            fallback error message.

    Returns:
        The first yielded item, or an error dict when the stream finishes
        without yielding anything.
    """
    try:
        async for result in stream:
            return result
        return {
            "status": "error",
            "message": f"No response from {operation} handler",
        }
    finally:
        # Returning from inside ``async for`` leaves an async generator
        # suspended; close it here instead of relying on garbage collection.
        aclose = getattr(stream, "aclose", None)
        if aclose is not None:
            await aclose()

async def load_lora_handler(body: dict) -> dict:
    """Handle /engine/v1/load_lora requests"""
    return await _first_lora_result(handler.load_lora(body), "load_lora")

async def unload_lora_handler(body: dict) -> dict:
    """Handle /engine/v1/unload_lora requests"""
    return await _first_lora_result(handler.unload_lora(body), "unload_lora")

async def list_loras_handler(body: dict) -> dict:
    """Handle /engine/v1/list_loras requests"""
    return await _first_lora_result(handler.list_loras(body), "list_loras")

runtime.register_engine_route("v1/load_lora", load_lora_handler)
runtime.register_engine_route("v1/unload_lora", unload_lora_handler)
runtime.register_engine_route("v1/list_loras", list_loras_handler)
logger.info(
"Registered engine routes: /engine/v1/load_lora, /engine/v1/unload_lora, /engine/v1/list_loras"
)

if not config.engine_args.data_parallel_rank: # if rank is 0 or None then register
# Parse endpoint types from --dyn-endpoint-types flag
model_type = parse_endpoint_types(config.dyn_endpoint_types)
Expand Down Expand Up @@ -570,18 +593,6 @@ async def init(runtime: DistributedRuntime, config: Config):
handler.clear_kv_blocks,
metrics_labels=[("model", config.served_model_name or config.model)],
),
load_lora_endpoint.serve_endpoint(
handler.load_lora,
metrics_labels=[("model", config.served_model_name or config.model)],
),
unload_lora_endpoint.serve_endpoint(
handler.unload_lora,
metrics_labels=[("model", config.served_model_name or config.model)],
),
list_loras_endpoint.serve_endpoint(
handler.list_loras,
metrics_labels=[("model", config.served_model_name or config.model)],
),
)
logger.debug("serve_endpoint completed for decode worker")
except Exception as e:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func TestLoadLoRA(t *testing.T) {
return
}
// Verify Content-Type header
if r.Header.Get("Content-Type") != "application/json" {
if r.Header.Get("Content-Type") != contentTypeJSON {
t.Errorf("expected Content-Type application/json, got %s", r.Header.Get("Content-Type"))
}
w.WriteHeader(http.StatusOK)
Expand Down
26 changes: 19 additions & 7 deletions deploy/cloud/operator/internal/modelendpoint/lora.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"
)

const contentTypeJSON = "application/json"

// loadLoRA loads a LoRA model on a single endpoint
func (c *Client) loadLoRA(ctx context.Context, address, modelName, sourceURI string) error {
logs := log.FromContext(ctx)
Expand All @@ -47,8 +49,8 @@ func (c *Client) loadLoRA(ctx context.Context, address, modelName, sourceURI str
}

// Build URL robustly using url.JoinPath to handle trailing slashes
// Pass path segments without leading slash to preserve any existing path in address (e.g., /v1)
apiURL, err := url.JoinPath(address, "v1", "loras")
// LoRA management uses /engine/v1/* routes
apiURL, err := url.JoinPath(address, "engine", "v1", "load_lora")
if err != nil {
return fmt.Errorf("failed to construct load LoRA URL: %w", err)
}
Expand All @@ -57,7 +59,7 @@ func (c *Client) loadLoRA(ctx context.Context, address, modelName, sourceURI str
if err != nil {
return fmt.Errorf("failed to create load LoRA request: %w", err)
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Content-Type", contentTypeJSON)

resp, err := c.httpClient.Do(req)
if err != nil {
Expand All @@ -83,19 +85,29 @@ func (c *Client) loadLoRA(ctx context.Context, address, modelName, sourceURI str
func (c *Client) unloadLoRA(ctx context.Context, address, modelName string) error {
logs := log.FromContext(ctx)

// Build URL robustly using url.JoinPath to handle trailing slashes and encode modelName
// Pass path segments without leading slash to preserve any existing path in address (e.g., /v1)
apiURL, err := url.JoinPath(address, "v1", "loras", modelName)
// Build URL robustly using url.JoinPath to handle trailing slashes
// LoRA management uses /engine/v1/* routes
apiURL, err := url.JoinPath(address, "engine", "v1", "unload_lora")
if err != nil {
logs.V(1).Info("Failed to construct unload LoRA URL", "error", err)
return fmt.Errorf("failed to construct unload LoRA URL: %w", err)
}

req, err := http.NewRequestWithContext(ctx, "DELETE", apiURL, nil)
// Build request body with lora_name
unloadReq := map[string]interface{}{
"lora_name": modelName,
}
unloadBody, err := json.Marshal(unloadReq)
if err != nil {
return fmt.Errorf("failed to marshal unload LoRA request: %w", err)
}

req, err := http.NewRequestWithContext(ctx, "POST", apiURL, bytes.NewBuffer(unloadBody))
if err != nil {
logs.V(1).Info("Failed to create unload LoRA request", "error", err)
return fmt.Errorf("failed to create unload LoRA request: %w", err)
}
req.Header.Set("Content-Type", contentTypeJSON)

resp, err := c.httpClient.Do(req)
if err != nil {
Expand Down
58 changes: 37 additions & 21 deletions deploy/cloud/operator/internal/modelendpoint/lora_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,17 @@ func TestLoadLoRA_URLConstruction(t *testing.T) {
{
name: "address without trailing slash",
baseAddress: "http://10.0.1.5:9090",
expectedURLPath: "/v1/loras",
expectedURLPath: "/engine/v1/load_lora",
},
{
name: "address with trailing slash",
baseAddress: "http://10.0.1.5:9090/",
expectedURLPath: "/v1/loras",
expectedURLPath: "/engine/v1/load_lora",
},
{
name: "address with path",
baseAddress: "http://10.0.1.5:9090/api",
expectedURLPath: "/api/v1/loras",
expectedURLPath: "/api/engine/v1/load_lora",
},
}

Expand Down Expand Up @@ -105,7 +105,7 @@ func TestLoadLoRA_RequestBody(t *testing.T) {
body, _ := io.ReadAll(r.Body)
_ = json.Unmarshal(body, &capturedBody)

if r.Header.Get("Content-Type") != "application/json" {
if r.Header.Get("Content-Type") != contentTypeJSON {
t.Errorf("expected Content-Type application/json, got %s", r.Header.Get("Content-Type"))
}

Expand Down Expand Up @@ -210,26 +210,26 @@ func TestLoadLoRA_ResponseHandling(t *testing.T) {
}
}

func TestUnloadLoRA_URLConstruction(t *testing.T) {
func TestUnloadLoRA_RequestBody(t *testing.T) {
tests := []struct {
name string
modelName string
expectedURLPath string
name string
modelName string
expectedLoraName string
}{
{
name: "simple model name",
modelName: "my-lora",
expectedURLPath: "/v1/loras/my-lora",
name: "simple model name",
modelName: "my-lora",
expectedLoraName: "my-lora",
},
{
name: "model name with special chars",
modelName: "my-lora-v1.0",
expectedURLPath: "/v1/loras/my-lora-v1.0",
name: "model name with special chars",
modelName: "my-lora-v1.0",
expectedLoraName: "my-lora-v1.0",
},
{
name: "model name with slashes (URL encoded)",
modelName: "org/model",
expectedURLPath: "/v1/loras/org/model",
name: "model name with slashes",
modelName: "org/model",
expectedLoraName: "org/model",
},
}

Expand All @@ -238,9 +238,17 @@ func TestUnloadLoRA_URLConstruction(t *testing.T) {
// Create a test server that captures the request
var capturedPath string
var capturedMethod string
var capturedBody map[string]interface{}
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
capturedPath = r.URL.Path
capturedMethod = r.Method
body, _ := io.ReadAll(r.Body)
_ = json.Unmarshal(body, &capturedBody)

if r.Header.Get("Content-Type") != contentTypeJSON {
t.Errorf("expected Content-Type application/json, got %s", r.Header.Get("Content-Type"))
}

w.WriteHeader(http.StatusOK)
}))
defer server.Close()
Expand All @@ -254,12 +262,20 @@ func TestUnloadLoRA_URLConstruction(t *testing.T) {
t.Fatalf("unexpected error: %v", err)
}

if capturedMethod != "DELETE" {
t.Errorf("expected DELETE method, got %s", capturedMethod)
// Verify HTTP method is POST
if capturedMethod != "POST" {
t.Errorf("expected POST method, got %s", capturedMethod)
}

if capturedPath != tt.expectedURLPath {
t.Errorf("expected URL path %s, got %s", tt.expectedURLPath, capturedPath)
// Verify URL path is correct
expectedPath := "/engine/v1/unload_lora"
if capturedPath != expectedPath {
t.Errorf("expected URL path %s, got %s", expectedPath, capturedPath)
}

// Verify request body contains correct lora_name
if capturedBody["lora_name"] != tt.expectedLoraName {
t.Errorf("expected lora_name %s in body, got %v", tt.expectedLoraName, capturedBody["lora_name"])
}
})
}
Expand Down
44 changes: 44 additions & 0 deletions docs/backends/sglang/profiling.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->

# Profiling SGLang Workers in Dynamo

Dynamo exposes profiling endpoints for SGLang workers via the system server's `/engine/*` routes. This allows you to start and stop PyTorch profiling on running inference workers without restarting them.

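These endpoints wrap SGLang's internal `TokenizerManager.start_profile()` and `stop_profile()` methods. The JSON body sent to `/engine/start_profile` is forwarded as keyword arguments to `start_profile()`, so its keys must match that method's parameter names. See SGLang's documentation for the full list of supported parameters.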

## Quick Start

1. **Start profiling:**

```bash
curl -X POST http://localhost:9090/engine/start_profile \
-H "Content-Type: application/json" \
-d '{"output_dir": "/tmp/profiler_output"}'
```

2. **Run some inference requests to generate profiling data**

3. **Stop profiling:**

```bash
curl -X POST http://localhost:9090/engine/stop_profile
```

4. **View the traces:**

The profiler outputs Chrome trace files in the specified `output_dir`. You can view them using:
- Chrome's `chrome://tracing`
- [Perfetto UI](https://ui.perfetto.dev/)
- TensorBoard with the PyTorch Profiler plugin

## Test Script

A test script is provided at [`examples/backends/sglang/test_sglang_profile.py`](../../../examples/backends/sglang/test_sglang_profile.py) that demonstrates the full profiling workflow:

```bash
python examples/backends/sglang/test_sglang_profile.py
```

1 change: 1 addition & 0 deletions docs/hidden_toctree.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
backends/sglang/expert-distribution-eplb.md
backends/sglang/gpt-oss.md
backends/sglang/multimodal_epd.md
backends/sglang/profiling.md
backends/sglang/sgl-hicache-example.md
backends/sglang/sglang-disaggregation.md
backends/sglang/prometheus.md
Expand Down
Loading
Loading