From 78ba25d0e146984da2ecda1a7e7759e55784adeb Mon Sep 17 00:00:00 2001 From: Tobrun Van Nuland Date: Thu, 18 Sep 2025 18:03:18 +0200 Subject: [PATCH] Manage the vLLM lifecycles --- inference/monitor_vllm.sh | 264 ++++++++++++++++++++++++ inference/run_multi_react.py | 102 +++++++++- inference/run_react_infer.sh | 104 +++++++++- inference/vllm_manager.py | 380 +++++++++++++++++++++++++++++++++++ 4 files changed, 835 insertions(+), 15 deletions(-) create mode 100755 inference/monitor_vllm.sh create mode 100755 inference/vllm_manager.py diff --git a/inference/monitor_vllm.sh b/inference/monitor_vllm.sh new file mode 100755 index 00000000..56748b60 --- /dev/null +++ b/inference/monitor_vllm.sh @@ -0,0 +1,264 @@ +#!/bin/bash + +# VLLM Process Monitor and Cleanup Script +# This script helps monitor VLLM processes and provides quick cleanup options + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +STATE_FILE="$SCRIPT_DIR/vllm_processes.json" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +print_header() { + echo -e "${BLUE}===========================================${NC}" + echo -e "${BLUE} VLLM Process Monitor${NC}" + echo -e "${BLUE}===========================================${NC}" +} + +print_usage() { + cat << EOF +Usage: $0 [COMMAND] + +Commands: + status Show status of managed VLLM processes + list List all VLLM processes (managed and orphaned) + cleanup Clean up all VLLM processes + kill Force kill all VLLM processes + gpu Show GPU memory usage + ports Show which ports are in use + help Show this help message + +Examples: + $0 status # Show managed process status + $0 list # List all VLLM processes + $0 cleanup # Clean shutdown of all processes + $0 kill # Force kill all VLLM processes + $0 gpu # Show GPU memory usage +EOF +} + +show_status() { + print_header + echo "Managed VLLM Process Status:" + echo "----------------------------" + + if [ -f "$SCRIPT_DIR/vllm_manager.py" ]; then + python3 "$SCRIPT_DIR/vllm_manager.py" status --state-file "$STATE_FILE" + else + echo -e "${RED}Error: vllm_manager.py not found${NC}" + return 1 + fi +} + +list_all_processes() { + print_header + echo "All VLLM Processes:" + echo "-------------------" + + echo -e "${YELLOW}Searching for VLLM processes...${NC}" + + # Find all VLLM related processes + VLLM_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) + + if [ -z "$VLLM_PIDS" ]; then + echo -e "${GREEN}No VLLM processes found${NC}" + return 0 + fi + + echo -e "PID\tCOMMAND" + echo -e "---\t-------" + + for pid in $VLLM_PIDS; do + if [ -e /proc/$pid ]; then + cmd=$(ps -p $pid -o command= 2>/dev/null | cut -c1-80) + echo -e "$pid\t$cmd" + fi + done + + echo + echo -e "${YELLOW}Total VLLM processes: $(echo $VLLM_PIDS | wc -w)${NC}" +} + +cleanup_processes() { + print_header + echo "Cleaning up VLLM processes..." + echo "------------------------------" + + if [ -f "$SCRIPT_DIR/vllm_manager.py" ]; then + python3 "$SCRIPT_DIR/vllm_manager.py" cleanup --state-file "$STATE_FILE" + else + echo -e "${YELLOW}Warning: vllm_manager.py not found, using fallback method${NC}" + force_kill_processes + fi +} + +force_kill_processes() { + print_header + echo "Force killing VLLM processes..." 
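+    # Escalation strategy: SIGTERM every matched PID, give them 5 seconds to
+    # shut down gracefully, then SIGKILL whatever is still alive.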
+ echo "-------------------------------" + + # Find all VLLM processes + VLLM_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) + + if [ -z "$VLLM_PIDS" ]; then + echo -e "${GREEN}No VLLM processes found${NC}" + return 0 + fi + + echo -e "${YELLOW}Found VLLM processes: $VLLM_PIDS${NC}" + + # Kill processes + for pid in $VLLM_PIDS; do + if [ -e /proc/$pid ]; then + echo "Killing process $pid..." + kill -TERM $pid 2>/dev/null || true + fi + done + + # Wait a bit for graceful shutdown + sleep 5 + + # Force kill any remaining processes + REMAINING_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) + if [ -n "$REMAINING_PIDS" ]; then + echo -e "${YELLOW}Force killing remaining processes: $REMAINING_PIDS${NC}" + for pid in $REMAINING_PIDS; do + if [ -e /proc/$pid ]; then + kill -KILL $pid 2>/dev/null || true + fi + done + fi + + # Clean up state file + rm -f "$STATE_FILE" + + echo -e "${GREEN}Cleanup completed${NC}" +} + +show_gpu_usage() { + print_header + echo "GPU Memory Usage:" + echo "-----------------" + + if command -v nvidia-smi >/dev/null 2>&1; then + nvidia-smi --query-gpu=index,name,memory.used,memory.total,utilization.gpu --format=csv,noheader,nounits | \ + awk -F', ' 'BEGIN {printf "%-5s %-20s %-10s %-10s %-8s\n", "GPU", "Name", "Used(MB)", "Total(MB)", "Usage%"} + {printf "%-5s %-20s %-10s %-10s %-8s\n", $1, substr($2,1,20), $3, $4, $5}' + else + echo -e "${RED}Error: nvidia-smi not found${NC}" + return 1 + fi + + echo + echo "VLLM processes using GPU:" + echo "-------------------------" + + VLLM_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) + if [ -n "$VLLM_PIDS" ]; then + for pid in $VLLM_PIDS; do + if [ -e /proc/$pid ]; then + gpu_mem=$(nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits 2>/dev/null | grep "^$pid," || true) + if [ -n "$gpu_mem" ]; then + mem=$(echo "$gpu_mem" | cut -d',' -f2) + echo "PID $pid: ${mem}MB" + fi + fi + done + else + echo -e "${GREEN}No VLLM processes found${NC}" + fi +} + +show_ports() { + print_header + echo "Port Usage (6001-6008):" + echo "------------------------" + + for port in {6001..6008}; do + if netstat -tuln 2>/dev/null | grep ":$port " >/dev/null; then + pid=$(lsof -ti:$port 2>/dev/null || echo "unknown") + if [ "$pid" != "unknown" ] && [ -n "$pid" ]; then + cmd=$(ps -p $pid -o command= 2>/dev/null | cut -c1-40 || echo "unknown") + echo -e "${RED}Port $port: OCCUPIED (PID: $pid) - $cmd${NC}" + else + echo -e "${RED}Port $port: OCCUPIED${NC}" + fi + else + echo -e "${GREEN}Port $port: FREE${NC}" + fi + done + + echo + echo "All listening ports on localhost:" + echo "----------------------------------" + netstat -tuln 2>/dev/null | grep "127.0.0.1" | head -20 +} + +interactive_menu() { + while true; do + print_header + echo "Select an option:" + echo "1) Show status" + echo "2) List all processes" + echo "3) Show GPU usage" + echo "4) Show port usage" + echo "5) Cleanup processes" + echo "6) Force kill processes" + echo "7) Exit" + echo + read -p "Enter choice [1-7]: " choice + + case $choice in + 1) show_status ;; + 2) list_all_processes ;; + 3) show_gpu_usage ;; + 4) show_ports ;; + 5) cleanup_processes ;; + 6) force_kill_processes ;; + 7) echo "Goodbye!"; exit 0 ;; + *) echo -e "${RED}Invalid option. Please try again.${NC}" ;; + esac + + echo + read -p "Press Enter to continue..." 
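+        # Clear the screen before redrawing the menu on the next pass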
+        clear
+    done
+}
+
+# Main script logic
+case "${1:-}" in
+    status)
+        show_status
+        ;;
+    list)
+        list_all_processes
+        ;;
+    cleanup)
+        cleanup_processes
+        ;;
+    kill)
+        force_kill_processes
+        ;;
+    gpu)
+        show_gpu_usage
+        ;;
+    ports)
+        show_ports
+        ;;
+    help|--help|-h)
+        print_usage
+        ;;
+    "")
+        interactive_menu
+        ;;
+    *)
+        echo -e "${RED}Error: Unknown command '$1'${NC}"
+        echo
+        print_usage
+        exit 1
+        ;;
+esac
\ No newline at end of file
diff --git a/inference/run_multi_react.py b/inference/run_multi_react.py
index 1056a0a7..b487ea94 100644
--- a/inference/run_multi_react.py
+++ b/inference/run_multi_react.py
@@ -9,6 +9,60 @@ from react_agent import MultiTurnReactAgent
 import time
 import math
+import signal
+import subprocess
+import sys
+import atexit
+from pathlib import Path
+
+# Global variables for cleanup
+executor = None
+vllm_state_file = None
+cleanup_enabled = True
+_cleanup_done = False
+
+def cleanup_resources():
+    """Cleanup resources and VLLM processes."""
+    global _cleanup_done
+    # Guard against running twice: the signal handler and atexit both call this.
+    if _cleanup_done or not cleanup_enabled:
+        return
+    _cleanup_done = True
+
+    print("\nCleaning up resources...")
+
+    # Cancel running tasks
+    if executor:
+        try:
+            print("Shutting down thread pool executor...")
+            executor.shutdown(wait=False, cancel_futures=True)
+        except Exception as e:
+            print(f"Error shutting down executor: {e}")
+
+    # Clean up VLLM processes if we started them
+    if vllm_state_file and vllm_state_file.exists():
+        try:
+            script_dir = Path(__file__).parent
+            vllm_manager = script_dir / "vllm_manager.py"
+            if vllm_manager.exists():
+                print("Cleaning up VLLM processes...")
+                subprocess.run([
+                    sys.executable, str(vllm_manager), "cleanup",
+                    "--state-file", str(vllm_state_file)
+                ], check=False)
+            else:
+                print("VLLM manager not found, trying fallback cleanup...")
+                subprocess.run(["pkill", "-f", "vllm serve"], check=False)
+                subprocess.run(["pkill", "-f", "vllm.entrypoints.openai.api_server"], check=False)
+        except Exception as e:
+            print(f"Error during VLLM cleanup: {e}")
+
+    print("Cleanup completed")
+
+def signal_handler(signum, frame):
+    """Handle interrupt signals gracefully."""
+    print(f"\nReceived signal {signum}, initiating graceful shutdown...")
+    cleanup_resources()
+    sys.exit(128 + signum)
+
+# Register cleanup and signal handlers
+atexit.register(cleanup_resources)
+signal.signal(signal.SIGINT, signal_handler)
+signal.signal(signal.SIGTERM, signal_handler)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -22,8 +76,14 @@
     parser.add_argument("--roll_out_count", type=int, default=3)
     parser.add_argument("--total_splits", type=int, default=1)
     parser.add_argument("--worker_split", type=int, default=1)
+    # BooleanOptionalAction (Python 3.9+) also generates --no-cleanup-on-exit;
+    # a plain store_true with default=True could never actually be disabled.
+    parser.add_argument("--cleanup-on-exit", action=argparse.BooleanOptionalAction, default=True,
+                        help="Cleanup VLLM processes on exit (default: True)")
     args = parser.parse_args()
 
+    # Honor the flag; plain assignment at module scope updates the global.
+    cleanup_enabled = args.cleanup_on_exit
+
+    # Initialize VLLM state file path
+    script_dir = Path(__file__).parent
+    vllm_state_file = script_dir / "vllm_processes.json"
+
     model = args.model
     output_base = args.output
     roll_out_count = args.roll_out_count
@@ -36,12 +96,12 @@
         exit(1)
 
     model_name = os.path.basename(model.rstrip('/'))
-    
+
     model_dir = os.path.join(output_base, f"{model_name}_sglang")
     dataset_dir = os.path.join(model_dir, args.dataset)
-    
+
     os.makedirs(dataset_dir, exist_ok=True)
-    
+
     print(f"Model name: {model_name}")
     print(f"Data set name: {args.dataset}")
     print(f"Output directory: {dataset_dir}")
@@ -75,10 +135,10 @@
     items_per_split = math.ceil(total_items / total_splits)
     start_idx = (worker_split - 1) * items_per_split
     end_idx = min(worker_split * items_per_split, total_items)
-    
+
     # Split the dataset
     items = items[start_idx:end_idx]
-    
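+    # Report the contiguous slice this worker (1-based worker_split) will process.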
+
     print(f"Total items in dataset: {total_items}")
     print(f"Processing items {start_idx} to {end_idx-1} ({len(items)} items)")
 
@@ -87,7 +147,7 @@
         output_files = {i: os.path.join(dataset_dir, f"iter{i}_split{worker_split}of{total_splits}.jsonl") for i in range(1, roll_out_count + 1)}
     else:
         output_files = {i: os.path.join(dataset_dir, f"iter{i}.jsonl") for i in range(1, roll_out_count + 1)}
-    
+
     processed_queries_per_rollout = {}
 
     for rollout_idx in range(1, roll_out_count + 1):
@@ -171,7 +231,11 @@
     write_locks = {i: threading.Lock() for i in range(1, roll_out_count + 1)}
 
-    with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
+    # Build the executor by hand (no with-block) so the module-level
+    # cleanup_resources() can reach it. This code runs at module scope, so
+    # plain assignment already rebinds the global `executor`; a `global`
+    # statement here would be a SyntaxError, since `executor` is assigned
+    # earlier in this same block.
+    executor = ThreadPoolExecutor(max_workers=args.max_workers)
+
+    # Pre-bind so the except blocks below can iterate it even if submission fails.
+    future_to_task = {}
+
+    try:
         future_to_task = {
             executor.submit(
                 test_agent._run,
@@ -224,6 +288,28 @@
                 with open(output_file, "a", encoding="utf-8") as f:
                     f.write(json.dumps(error_result, ensure_ascii=False) + "\n")
 
-    print("\nAll tasks completed!")
+        print("\nAll tasks completed!")
+
+    except KeyboardInterrupt:
+        print("\nKeyboard interrupt received, canceling tasks...")
+        # Cancel all futures
+        for future in future_to_task:
+            future.cancel()
+        raise
+    except Exception as e:
+        print(f"\nError during task execution: {e}")
+        # Cancel all futures
+        for future in future_to_task:
+            future.cancel()
+        raise
+    finally:
+        # Clean shutdown of executor
+        if executor:
+            try:
+                executor.shutdown(wait=True, cancel_futures=True)
+            except Exception as e:
+                print(f"Error shutting down executor: {e}")
+            finally:
+                executor = None
 
     print(f"\nAll {roll_out_count} rollouts completed!")
\ No newline at end of file
diff --git a/inference/run_react_infer.sh b/inference/run_react_infer.sh
index b4e865b7..dd867792 100644
--- a/inference/run_react_infer.sh
+++ b/inference/run_react_infer.sh
@@ -1,5 +1,59 @@
 #!/bin/bash
 
+# VLLM Process Management
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+STATE_FILE="$SCRIPT_DIR/vllm_processes.json"
+
+# Cleanup function
+cleanup_vllm() {
+    echo "Cleaning up VLLM processes..."
+
+    # First try to kill processes using stored PIDs
+    if [ -f "$SCRIPT_DIR/vllm_pids.txt" ]; then
+        echo "Using stored PIDs for cleanup..."
+        STORED_PIDS=$(cat "$SCRIPT_DIR/vllm_pids.txt" 2>/dev/null || echo "")
+        if [ -n "$STORED_PIDS" ]; then
+            for pid in $STORED_PIDS; do
+                if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+                    echo "Killing PID $pid..."
+                    kill -TERM "$pid" 2>/dev/null || true
+                fi
+            done
+
+            # Wait a bit for graceful shutdown
+            sleep 3
+
+            # Force kill any remaining
+            for pid in $STORED_PIDS; do
+                if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+                    echo "Force killing PID $pid..."
+                    kill -KILL "$pid" 2>/dev/null || true
+                fi
+            done
+        fi
+        rm -f "$SCRIPT_DIR/vllm_pids.txt"
+    fi
+
+    # Fallback: Use Python manager if available for comprehensive cleanup
+    if [ -f "$SCRIPT_DIR/vllm_manager.py" ]; then
+        python3 "$SCRIPT_DIR/vllm_manager.py" cleanup --state-file "$STATE_FILE" 2>/dev/null || true
+    fi
+
+    # Final fallback: Manual process killing
+    echo "Final cleanup check..."
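+    # Last resort: pattern-match any vLLM processes the PID file and the
+    # Python manager may have missed.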
+ pkill -f "vllm serve" 2>/dev/null || true + pkill -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true + + # Clean up state files + rm -f "$STATE_FILE" "$SCRIPT_DIR/vllm_pids.txt" +} + +# Set up signal traps +trap 'echo "Received SIGINT, cleaning up..."; cleanup_vllm; exit 130' INT +trap 'echo "Received SIGTERM, cleaning up..."; cleanup_vllm; exit 143' TERM +trap 'cleanup_vllm' EXIT + export TORCHDYNAMO_VERBOSE=1 export TORCHDYNAMO_DISABLE=1 export NCCL_IB_TC=16 @@ -27,7 +81,7 @@ export MODEL_PATH=/your/model/path export DATASET=your_dataset_name export OUTPUT_PATH=/your/output/path export ROLLOUT_COUNT=3 # eval avg@3 -export TEMPERATURE=0.85 +export TEMPERATURE=0.85 export PRESENCE_PENALTY=1.1 export MAX_WORKERS=30 @@ -70,15 +124,52 @@ export IDP_KEY_SECRET=your_idp_key_secret ### 1. start server ### ###################################### +# You can customize the VLLM server startup by: +# 1. Commenting out GPU lines you don't have +# 2. Modifying VLLM parameters as needed +# 3. Changing ports if required +# The cleanup will work regardless of how you start the servers + echo "Starting VLLM servers..." + +# Store PIDs for cleanup tracking +VLLM_PIDS=() + +# GPU 0 - Port 6001 CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6001 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 1 - Port 6002 CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6002 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 2 - Port 6003 CUDA_VISIBLE_DEVICES=2 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6003 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 3 - Port 6004 CUDA_VISIBLE_DEVICES=3 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6004 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 4 - Port 6005 CUDA_VISIBLE_DEVICES=4 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6005 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 5 - Port 6006 CUDA_VISIBLE_DEVICES=5 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6006 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 6 - Port 6007 CUDA_VISIBLE_DEVICES=6 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6007 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 7 - Port 6008 CUDA_VISIBLE_DEVICES=7 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6008 --disable-log-requests & +VLLM_PIDS+=($!) + +# Save PIDs to state file for cleanup +echo "Started VLLM processes with PIDs: ${VLLM_PIDS[@]}" +echo "${VLLM_PIDS[@]}" > "$SCRIPT_DIR/vllm_pids.txt" ####################################################### ### 2. Waiting for the server port to be ready ### @@ -99,7 +190,7 @@ echo "Waiting for servers to start..." while true; do all_ready=true - + for port in "${main_ports[@]}"; do if [ "${server_status[$port]}" = "false" ]; then if curl -s -f http://localhost:$port/v1/models > /dev/null 2>&1; then @@ -110,27 +201,26 @@ while true; do fi fi done - + if [ "$all_ready" = "true" ]; then echo "All servers are ready for inference!" break fi - + current_time=$(date +%s) elapsed=$((current_time - start_time)) if [ $elapsed -gt $timeout ]; then echo -e "\nError: Server startup timeout after ${timeout} seconds" - + for port in "${main_ports[@]}"; do if [ "${server_status[$port]}" = "false" ]; then echo "Main model server (port $port) failed to start" fi done - exit 1 fi - + printf 'Waiting for servers to start .....' 
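+    # Poll every 10 seconds; each pass re-checks only ports not yet marked ready.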
sleep 10 done diff --git a/inference/vllm_manager.py b/inference/vllm_manager.py new file mode 100755 index 00000000..b12c5ab5 --- /dev/null +++ b/inference/vllm_manager.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python3 +""" +VLLM Server Process Manager + +This utility manages VLLM server processes for the DeepResearch inference pipeline. +It provides robust process lifecycle management with proper cleanup on exit. +""" + +import os +import sys +import json +import signal +import subprocess +import time +import argparse +from pathlib import Path +from typing import List, Dict, Optional +import psutil +import atexit + +class VLLMManager: + def __init__(self, state_file: str = "vllm_processes.json"): + self.state_file = Path(state_file) + self.processes: Dict[int, subprocess.Popen] = {} + self.ports = [6001, 6002, 6003, 6004, 6005, 6006, 6007, 6008] + + # Register cleanup on exit + atexit.register(self.cleanup_all) + + # Handle signals + signal.signal(signal.SIGTERM, self._signal_handler) + signal.signal(signal.SIGINT, self._signal_handler) + + def _signal_handler(self, signum, frame): + print(f"\nReceived signal {signum}, cleaning up VLLM processes...") + self.cleanup_all() + sys.exit(0) + + def load_state(self) -> Dict: + """Load process state from file.""" + if not self.state_file.exists(): + return {} + + try: + with open(self.state_file, 'r') as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as e: + print(f"Warning: Could not load state file: {e}") + return {} + + def save_state(self, state: Dict): + """Save process state to file.""" + try: + with open(self.state_file, 'w') as f: + json.dump(state, f, indent=2) + except IOError as e: + print(f"Warning: Could not save state file: {e}") + + def is_port_in_use(self, port: int) -> bool: + """Check if a port is already in use.""" + try: + # Check if any process is using this port + for conn in psutil.net_connections(): + if conn.laddr.port == port: + return True + return False + except (psutil.AccessDenied, psutil.NoSuchProcess): + return False + + def find_vllm_processes(self) -> List[psutil.Process]: + """Find all running VLLM processes.""" + vllm_processes = [] + for proc in psutil.process_iter(['pid', 'name', 'cmdline']): + try: + cmdline_str = ' '.join(proc.info['cmdline']) if proc.info['cmdline'] else '' + if cmdline_str and ('vllm serve' in cmdline_str or 'vllm.entrypoints.openai.api_server' in cmdline_str): + vllm_processes.append(proc) + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + return vllm_processes + + def start_servers(self, model_path: str, gpu_devices: List[int] = None) -> bool: + """Start VLLM servers on specified GPUs.""" + if gpu_devices is None: + gpu_devices = list(range(8)) # Default to GPUs 0-7 + + if len(gpu_devices) > len(self.ports): + print(f"Warning: More GPUs ({len(gpu_devices)}) than ports ({len(self.ports)})") + gpu_devices = gpu_devices[:len(self.ports)] + + print(f"Starting VLLM servers on {len(gpu_devices)} GPUs...") + + state = self.load_state() + started_processes = {} + + for i, (gpu_id, port) in enumerate(zip(gpu_devices, self.ports)): + if self.is_port_in_use(port): + print(f"Port {port} is already in use, skipping GPU {gpu_id}") + continue + + print(f"Starting VLLM server on GPU {gpu_id}, port {port}...") + + env = os.environ.copy() + env['CUDA_VISIBLE_DEVICES'] = str(gpu_id) + + cmd = [ + 'vllm', 'serve', model_path, + '--host', '0.0.0.0', + '--port', str(port), + '--disable-log-requests' + ] + + try: + # Start process in new process group + process = 
subprocess.Popen(
+                    cmd,
+                    env=env,
+                    # Redirect output to a per-port log file (the name is a
+                    # local convention) instead of PIPE: nothing drains these
+                    # pipes, so a chatty server would eventually fill the
+                    # buffer and block.
+                    stdout=open(f"vllm_server_{port}.log", "w"),
+                    stderr=subprocess.STDOUT,
+                    preexec_fn=os.setsid
+                )
+
+                self.processes[port] = process
+                started_processes[str(port)] = {
+                    'pid': process.pid,
+                    'gpu_id': gpu_id,
+                    'model_path': model_path,
+                    'started_at': time.time()
+                }
+
+                print(f"Started VLLM server on port {port} (PID: {process.pid})")
+
+            except Exception as e:
+                print(f"Failed to start VLLM server on GPU {gpu_id}: {e}")
+                # Record anything that did start so cleanup can still find it
+                state.update(started_processes)
+                self.save_state(state)
+                return False
+
+        # Save state
+        state.update(started_processes)
+        self.save_state(state)
+
+        return len(started_processes) > 0
+
+    def wait_for_servers(self, timeout: int = 600) -> bool:
+        """Wait for all servers to be ready."""
+        print("Waiting for servers to start...")
+        start_time = time.time()
+
+        state = self.load_state()
+        ports_to_check = [int(port) for port in state.keys()]
+
+        ready_ports = set()
+
+        while time.time() - start_time < timeout:
+            all_ready = True
+
+            for port in ports_to_check:
+                if port in ready_ports:
+                    continue
+
+                try:
+                    import requests
+                    response = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
+                    if response.status_code == 200:
+                        print(f"Server on port {port} is ready!")
+                        ready_ports.add(port)
+                    else:
+                        all_ready = False
+                except Exception:
+                    all_ready = False
+
+            if len(ready_ports) == len(ports_to_check):
+                print("All servers are ready!")
+                return True
+
+            if not all_ready:
+                print(".", end="", flush=True)
+                time.sleep(10)
+
+        print(f"\nTimeout: Only {len(ready_ports)}/{len(ports_to_check)} servers ready")
+        return len(ready_ports) > 0
+
+    def stop_servers(self, ports: List[int] = None) -> bool:
+        """Stop VLLM servers on specified ports."""
+        state = self.load_state()
+
+        if ports is None:
+            ports = [int(port) for port in state.keys()]
+
+        success = True
+        for port in ports:
+            if not self._stop_server(port, state):
+                success = False
+
+        # Clean up state file
+        remaining_state = {k: v for k, v in state.items() if int(k) not in ports}
+        self.save_state(remaining_state)
+
+        return success
+
+    def _stop_server(self, port: int, state: Dict) -> bool:
+        """Stop a specific VLLM server."""
+        port_str = str(port)
+
+        if port_str not in state:
+            print(f"No server recorded on port {port}")
+            return True
+
+        pid = state[port_str]['pid']
+
+        try:
+            # First try graceful shutdown
+            process = psutil.Process(pid)
+            print(f"Stopping VLLM server on port {port} (PID: {pid})...")
+
+            # Try SIGTERM first
+            process.terminate()
+
+            # Wait up to 30 seconds for graceful shutdown
+            try:
+                process.wait(timeout=30)
+                print(f"Server on port {port} stopped gracefully")
+                return True
+            except psutil.TimeoutExpired:
+                print(f"Server on port {port} didn't stop gracefully, using SIGKILL...")
+                process.kill()
+                process.wait(timeout=10)
+                print(f"Server on port {port} killed")
+                return True
+
+        except psutil.NoSuchProcess:
+            print(f"Process {pid} already dead")
+            return True
+        except Exception as e:
+            print(f"Error stopping server on port {port}: {e}")
+            return False
+
+    def cleanup_all(self):
+        """Clean up all VLLM processes."""
+        print("Cleaning up all VLLM processes...")
+
+        # First try to stop servers from state
+        state = self.load_state()
+        if state:
+            self.stop_servers()
+
+        # Then find and kill any remaining VLLM processes
+        orphan_processes = self.find_vllm_processes()
+
+        for proc in orphan_processes:
+            try:
+                print(f"Killing orphan VLLM process {proc.pid}")
+                proc.terminate()
+                proc.wait(timeout=10)
+            except (psutil.NoSuchProcess, psutil.TimeoutExpired):
+                try:
+                    proc.kill()
+                except psutil.NoSuchProcess:
+                    pass
+            except Exception as 
e: + print(f"Error killing process {proc.pid}: {e}") + + # Clean up state file + if self.state_file.exists(): + self.state_file.unlink() + + print("Cleanup completed") + + def status(self) -> Dict: + """Get status of all managed servers.""" + state = self.load_state() + status_info = {} + + for port_str, info in state.items(): + port = int(port_str) + pid = info['pid'] + + try: + process = psutil.Process(pid) + is_running = process.is_running() + + # Check if port is responsive + port_responsive = False + try: + import requests + response = requests.get(f"http://localhost:{port}/v1/models", timeout=2) + port_responsive = response.status_code == 200 + except Exception: + pass + + status_info[port] = { + 'pid': pid, + 'gpu_id': info.get('gpu_id', 'unknown'), + 'model_path': info.get('model_path', 'unknown'), + 'running': is_running, + 'responsive': port_responsive, + 'uptime': time.time() - info.get('started_at', 0) + } + except psutil.NoSuchProcess: + status_info[port] = { + 'pid': pid, + 'gpu_id': info.get('gpu_id', 'unknown'), + 'model_path': info.get('model_path', 'unknown'), + 'running': False, + 'responsive': False, + 'uptime': 0 + } + + return status_info + + +def main(): + parser = argparse.ArgumentParser(description="VLLM Server Process Manager") + parser.add_argument('command', choices=['start', 'stop', 'status', 'cleanup'], + help='Command to execute') + parser.add_argument('--model', type=str, required=False, + help='Model path (required for start command)') + parser.add_argument('--gpus', type=str, default='0,1,2,3,4,5,6,7', + help='Comma-separated list of GPU IDs') + parser.add_argument('--ports', type=str, + help='Comma-separated list of ports to operate on') + parser.add_argument('--timeout', type=int, default=600, + help='Timeout for server startup (seconds)') + parser.add_argument('--state-file', type=str, default='vllm_processes.json', + help='State file path') + + args = parser.parse_args() + + manager = VLLMManager(args.state_file) + + if args.command == 'start': + if not args.model: + print("Error: --model is required for start command") + sys.exit(1) + + gpu_devices = [int(x.strip()) for x in args.gpus.split(',')] + + if manager.start_servers(args.model, gpu_devices): + if manager.wait_for_servers(args.timeout): + print("All servers started successfully!") + sys.exit(0) + else: + print("Some servers failed to start") + sys.exit(1) + else: + print("Failed to start servers") + sys.exit(1) + + elif args.command == 'stop': + ports = None + if args.ports: + ports = [int(x.strip()) for x in args.ports.split(',')] + + if manager.stop_servers(ports): + print("Servers stopped successfully") + sys.exit(0) + else: + print("Some servers failed to stop") + sys.exit(1) + + elif args.command == 'status': + status_info = manager.status() + if not status_info: + print("No managed servers found") + else: + print("VLLM Server Status:") + print("-" * 80) + for port, info in status_info.items(): + status = "RUNNING" if info['running'] else "STOPPED" + responsive = "RESPONSIVE" if info['responsive'] else "NOT RESPONSIVE" + uptime_str = f"{info['uptime']:.1f}s" if info['uptime'] > 0 else "N/A" + print(f"Port {port:4d}: {status:8s} | {responsive:14s} | " + f"PID {info['pid']:6d} | GPU {info['gpu_id']} | Uptime {uptime_str}") + + elif args.command == 'cleanup': + manager.cleanup_all() + print("Cleanup completed") + + +if __name__ == '__main__': + main() \ No newline at end of file