ai-dynamo · Aphoh · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025 · Nov 26, 2025
@@ -123,6 +123,23 @@ async def init(runtime: DistributedRuntime, config: Config):
         await _handle_non_leader_node(engine, generate_endpoint)
         return
 
+    # Register engine routes for profiling
+    async def start_profile_handler(body: dict) -> dict:
+        """Handle /engine/start_profile requests"""
+        await engine.tokenizer_manager.start_profile(**body)
+        return {"status": "ok", "message": "Profiling started"}
+
+    async def stop_profile_handler(body: dict) -> dict:
+        """Handle /engine/stop_profile requests"""
+        await engine.tokenizer_manager.stop_profile()
+        return {"status": "ok", "message": "Profiling stopped"}
+
+    runtime.register_engine_route("start_profile", start_profile_handler)
+    runtime.register_engine_route("stop_profile", stop_profile_handler)
+    logging.info(
+        "Registered engine routes: /engine/start_profile, /engine/stop_profile"
+    )
+
     prefill_client = None
     prefill_router_client = None
     if config.serving_mode == DisaggregationMode.DECODE:
@@ -225,6 +242,23 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
         await _handle_non_leader_node(engine, generate_endpoint)
         return
 
+    # Register engine routes for profiling
+    async def start_profile_handler(body: dict) -> dict:
+        """Handle /engine/start_profile requests"""
+        await engine.tokenizer_manager.start_profile(**body)
+        return {"status": "ok", "message": "Profiling started"}
+
+    async def stop_profile_handler(body: dict) -> dict:
+        """Handle /engine/stop_profile requests"""
+        await engine.tokenizer_manager.stop_profile()
+        return {"status": "ok", "message": "Profiling stopped"}
+
+    runtime.register_engine_route("start_profile", start_profile_handler)
+    runtime.register_engine_route("stop_profile", stop_profile_handler)
+    logging.info(
+        "Registered engine routes: /engine/start_profile, /engine/stop_profile"
+    )
+
     # Perform dummy warmup for prefill worker to avoid initial TTFT hit
     # Only needed on leader node that handles requests
     await _warmup_prefill_engine(engine, server_args)

@@ -467,9 +467,6 @@ async def init(runtime: DistributedRuntime, config: Config):
 
     generate_endpoint = component.endpoint(config.endpoint)
     clear_endpoint = component.endpoint("clear_kv_blocks")
-    load_lora_endpoint = component.endpoint("load_lora")
-    unload_lora_endpoint = component.endpoint("unload_lora")
-    list_loras_endpoint = component.endpoint("list_loras")
 
     factory = StatLoggerFactory(
         component,
@@ -529,6 +526,32 @@ async def init(runtime: DistributedRuntime, config: Config):
 
     setup_metrics_collection(config, generate_endpoint, logger)
 
+    # Register engine routes for LoRA management (accessible via /engine/v1/*)
+    async def load_lora_handler(body: dict) -> dict:
+        """Handle /engine/v1/load_lora requests"""
+        async for result in handler.load_lora(body):
+            return result
+        return {"status": "error", "message": "No response from load_lora handler"}
+
+    async def unload_lora_handler(body: dict) -> dict:
+        """Handle /engine/v1/unload_lora requests"""
+        async for result in handler.unload_lora(body):
+            return result
+        return {"status": "error", "message": "No response from unload_lora handler"}
+
+    async def list_loras_handler(body: dict) -> dict:
+        """Handle /engine/v1/list_loras requests"""
+        async for result in handler.list_loras(body):
+            return result
+        return {"status": "error", "message": "No response from list_loras handler"}
+
+    runtime.register_engine_route("v1/load_lora", load_lora_handler)
+    runtime.register_engine_route("v1/unload_lora", unload_lora_handler)
+    runtime.register_engine_route("v1/list_loras", list_loras_handler)
+    logger.info(
+        "Registered engine routes: /engine/v1/load_lora, /engine/v1/unload_lora, /engine/v1/list_loras"
+    )
+
     if not config.engine_args.data_parallel_rank:  # if rank is 0 or None then register
         # Parse endpoint types from --dyn-endpoint-types flag
         model_type = parse_endpoint_types(config.dyn_endpoint_types)
@@ -570,18 +593,6 @@ async def init(runtime: DistributedRuntime, config: Config):
                 handler.clear_kv_blocks,
                 metrics_labels=[("model", config.served_model_name or config.model)],
             ),
-            load_lora_endpoint.serve_endpoint(
-                handler.load_lora,
-                metrics_labels=[("model", config.served_model_name or config.model)],
-            ),
-            unload_lora_endpoint.serve_endpoint(
-                handler.unload_lora,
-                metrics_labels=[("model", config.served_model_name or config.model)],
-            ),
-            list_loras_endpoint.serve_endpoint(
-                handler.list_loras,
-                metrics_labels=[("model", config.served_model_name or config.model)],
-            ),
         )
         logger.debug("serve_endpoint completed for decode worker")
     except Exception as e:

@@ -39,7 +39,7 @@ func TestLoadLoRA(t *testing.T) {
 			return
 		}
 		// Verify Content-Type header
-		if r.Header.Get("Content-Type") != "application/json" {
+		if r.Header.Get("Content-Type") != contentTypeJSON {
 			t.Errorf("expected Content-Type application/json, got %s", r.Header.Get("Content-Type"))
 		}
 		w.WriteHeader(http.StatusOK)

@@ -29,6 +29,8 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log"
 )
 
+// contentTypeJSON is the Content-Type header value set on the LoRA load and
+// unload requests built in this file.
+const contentTypeJSON = "application/json"
+
 // loadLoRA loads a LoRA model on a single endpoint
 func (c *Client) loadLoRA(ctx context.Context, address, modelName, sourceURI string) error {
 	logs := log.FromContext(ctx)
@@ -47,8 +49,8 @@ func (c *Client) loadLoRA(ctx context.Context, address, modelName, sourceURI str
 	}
 
 	// Build URL robustly using url.JoinPath to handle trailing slashes
-	// Pass path segments without leading slash to preserve any existing path in address (e.g., /v1)
-	apiURL, err := url.JoinPath(address, "v1", "loras")
+	// LoRA management uses /engine/v1/* routes
+	apiURL, err := url.JoinPath(address, "engine", "v1", "load_lora")
 	if err != nil {
 		return fmt.Errorf("failed to construct load LoRA URL: %w", err)
 	}
@@ -57,7 +59,7 @@ func (c *Client) loadLoRA(ctx context.Context, address, modelName, sourceURI str
 	if err != nil {
 		return fmt.Errorf("failed to create load LoRA request: %w", err)
 	}
-	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Content-Type", contentTypeJSON)
 
 	resp, err := c.httpClient.Do(req)
 	if err != nil {
@@ -83,19 +85,29 @@ func (c *Client) loadLoRA(ctx context.Context, address, modelName, sourceURI str
 func (c *Client) unloadLoRA(ctx context.Context, address, modelName string) error {
 	logs := log.FromContext(ctx)
 
-	// Build URL robustly using url.JoinPath to handle trailing slashes and encode modelName
-	// Pass path segments without leading slash to preserve any existing path in address (e.g., /v1)
-	apiURL, err := url.JoinPath(address, "v1", "loras", modelName)
+	// Build URL robustly using url.JoinPath to handle trailing slashes
+	// LoRA management uses /engine/v1/* routes
+	apiURL, err := url.JoinPath(address, "engine", "v1", "unload_lora")
 	if err != nil {
 		logs.V(1).Info("Failed to construct unload LoRA URL", "error", err)
 		return fmt.Errorf("failed to construct unload LoRA URL: %w", err)
 	}
 
-	req, err := http.NewRequestWithContext(ctx, "DELETE", apiURL, nil)
+	// Build request body with lora_name
+	unloadReq := map[string]interface{}{
+		"lora_name": modelName,
+	}
+	unloadBody, err := json.Marshal(unloadReq)
+	if err != nil {
+		return fmt.Errorf("failed to marshal unload LoRA request: %w", err)
+	}
+
+	req, err := http.NewRequestWithContext(ctx, "POST", apiURL, bytes.NewBuffer(unloadBody))
 	if err != nil {
 		logs.V(1).Info("Failed to create unload LoRA request", "error", err)
 		return fmt.Errorf("failed to create unload LoRA request: %w", err)
 	}
+	req.Header.Set("Content-Type", contentTypeJSON)
 
 	resp, err := c.httpClient.Do(req)
 	if err != nil {

@@ -36,17 +36,17 @@ func TestLoadLoRA_URLConstruction(t *testing.T) {
 		{
 			name:            "address without trailing slash",
 			baseAddress:     "http://10.0.1.5:9090",
-			expectedURLPath: "/v1/loras",
+			expectedURLPath: "/engine/v1/load_lora",
 		},
 		{
 			name:            "address with trailing slash",
 			baseAddress:     "http://10.0.1.5:9090/",
-			expectedURLPath: "/v1/loras",
+			expectedURLPath: "/engine/v1/load_lora",
 		},
 		{
 			name:            "address with path",
 			baseAddress:     "http://10.0.1.5:9090/api",
-			expectedURLPath: "/api/v1/loras",
+			expectedURLPath: "/api/engine/v1/load_lora",
 		},
 	}
 
@@ -105,7 +105,7 @@ func TestLoadLoRA_RequestBody(t *testing.T) {
 				body, _ := io.ReadAll(r.Body)
 				_ = json.Unmarshal(body, &capturedBody)
 
-				if r.Header.Get("Content-Type") != "application/json" {
+				if r.Header.Get("Content-Type") != contentTypeJSON {
 					t.Errorf("expected Content-Type application/json, got %s", r.Header.Get("Content-Type"))
 				}
 
@@ -210,26 +210,26 @@ func TestLoadLoRA_ResponseHandling(t *testing.T) {
 	}
 }
 
-func TestUnloadLoRA_URLConstruction(t *testing.T) {
+func TestUnloadLoRA_RequestBody(t *testing.T) {
 	tests := []struct {
-		name            string
-		modelName       string
-		expectedURLPath string
+		name             string
+		modelName        string
+		expectedLoraName string
 	}{
 		{
-			name:            "simple model name",
-			modelName:       "my-lora",
-			expectedURLPath: "/v1/loras/my-lora",
+			name:             "simple model name",
+			modelName:        "my-lora",
+			expectedLoraName: "my-lora",
 		},
 		{
-			name:            "model name with special chars",
-			modelName:       "my-lora-v1.0",
-			expectedURLPath: "/v1/loras/my-lora-v1.0",
+			name:             "model name with special chars",
+			modelName:        "my-lora-v1.0",
+			expectedLoraName: "my-lora-v1.0",
 		},
 		{
-			name:            "model name with slashes (URL encoded)",
-			modelName:       "org/model",
-			expectedURLPath: "/v1/loras/org/model",
+			name:             "model name with slashes",
+			modelName:        "org/model",
+			expectedLoraName: "org/model",
 		},
 	}
 
@@ -238,9 +238,17 @@ func TestUnloadLoRA_URLConstruction(t *testing.T) {
 			// Create a test server that captures the request
 			var capturedPath string
 			var capturedMethod string
+			var capturedBody map[string]interface{}
 			server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 				capturedPath = r.URL.Path
 				capturedMethod = r.Method
+				body, _ := io.ReadAll(r.Body)
+				_ = json.Unmarshal(body, &capturedBody)
+
+				if r.Header.Get("Content-Type") != contentTypeJSON {
+					t.Errorf("expected Content-Type application/json, got %s", r.Header.Get("Content-Type"))
+				}
+
 				w.WriteHeader(http.StatusOK)
 			}))
 			defer server.Close()
@@ -254,12 +262,20 @@ func TestUnloadLoRA_URLConstruction(t *testing.T) {
 				t.Fatalf("unexpected error: %v", err)
 			}
 
-			if capturedMethod != "DELETE" {
-				t.Errorf("expected DELETE method, got %s", capturedMethod)
+			// Verify HTTP method is POST
+			if capturedMethod != "POST" {
+				t.Errorf("expected POST method, got %s", capturedMethod)
 			}
 
-			if capturedPath != tt.expectedURLPath {
-				t.Errorf("expected URL path %s, got %s", tt.expectedURLPath, capturedPath)
+			// Verify URL path is correct
+			expectedPath := "/engine/v1/unload_lora"
+			if capturedPath != expectedPath {
+				t.Errorf("expected URL path %s, got %s", expectedPath, capturedPath)
+			}
+
+			// Verify request body contains correct lora_name
+			if capturedBody["lora_name"] != tt.expectedLoraName {
+				t.Errorf("expected lora_name %s in body, got %v", tt.expectedLoraName, capturedBody["lora_name"])
 			}
 		})
 	}

diff --git a/docs/backends/sglang/profiling.md b/docs/backends/sglang/profiling.md
@@ -0,0 +1,44 @@
+<!--
+SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+-->
+
+# Profiling SGLang Workers in Dynamo
+
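+Dynamo exposes profiling endpoints for SGLang workers via the system server's `/engine/*` routes. This allows you to start and stop PyTorch profiling on running inference workers without restarting them. The examples below assume the system server is reachable at `localhost:9090`; adjust the host and port to match your deployment.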
+
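+These endpoints wrap SGLang's internal `TokenizerManager.start_profile()` and `stop_profile()` methods. The JSON body sent to `/engine/start_profile` is forwarded to `start_profile()` as keyword arguments; see SGLang's documentation for the full list of supported parameters.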
+
+## Quick Start
+
+1. **Start profiling:**
+
+```bash
+curl -X POST http://localhost:9090/engine/start_profile \
+  -H "Content-Type: application/json" \
+  -d '{"output_dir": "/tmp/profiler_output"}'
+```
+
+2. **Run some inference requests to generate profiling data**
+
+3. **Stop profiling:**
+
+```bash
+curl -X POST http://localhost:9090/engine/stop_profile
+```
+
+4. **View the traces:**
+
+The profiler writes Chrome trace files to the `output_dir` passed to `/engine/start_profile`. You can view them with any of the following:
+- Chrome's `chrome://tracing`
+- [Perfetto UI](https://ui.perfetto.dev/)
+- TensorBoard with the PyTorch Profiler plugin
+
+## Test Script
+
+A test script is provided at [`examples/backends/sglang/test_sglang_profile.py`](../../../examples/backends/sglang/test_sglang_profile.py) that demonstrates the full profiling workflow:
+
+```bash
+python examples/backends/sglang/test_sglang_profile.py
+```
+
diff --git a/docs/hidden_toctree.rst b/docs/hidden_toctree.rst
@@ -61,6 +61,7 @@
    backends/sglang/expert-distribution-eplb.md
    backends/sglang/gpt-oss.md
    backends/sglang/multimodal_epd.md
+   backends/sglang/profiling.md
    backends/sglang/sgl-hicache-example.md
    backends/sglang/sglang-disaggregation.md
    backends/sglang/prometheus.md