From 78ba25d0e146984da2ecda1a7e7759e55784adeb Mon Sep 17 00:00:00 2001 From: Tobrun Van Nuland Date: Thu, 18 Sep 2025 18:03:18 +0200 Subject: [PATCH] Manage the vLLM lifecycles --- inference/monitor_vllm.sh | 264 ++++++++++++++++++++++++ inference/run_multi_react.py | 102 +++++++++- inference/run_react_infer.sh | 104 +++++++++- inference/vllm_manager.py | 380 +++++++++++++++++++++++++++++++++++ 4 files changed, 835 insertions(+), 15 deletions(-) create mode 100755 inference/monitor_vllm.sh create mode 100755 inference/vllm_manager.py diff --git a/inference/monitor_vllm.sh b/inference/monitor_vllm.sh new file mode 100755 index 00000000..56748b60 --- /dev/null +++ b/inference/monitor_vllm.sh @@ -0,0 +1,264 @@ +#!/bin/bash + +# VLLM Process Monitor and Cleanup Script +# This script helps monitor VLLM processes and provides quick cleanup options + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +STATE_FILE="$SCRIPT_DIR/vllm_processes.json" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +print_header() { + echo -e "${BLUE}===========================================${NC}" + echo -e "${BLUE} VLLM Process Monitor${NC}" + echo -e "${BLUE}===========================================${NC}" +} + +print_usage() { + cat << EOF +Usage: $0 [COMMAND] + +Commands: + status Show status of managed VLLM processes + list List all VLLM processes (managed and orphaned) + cleanup Clean up all VLLM processes + kill Force kill all VLLM processes + gpu Show GPU memory usage + ports Show which ports are in use + help Show this help message + +Examples: + $0 status # Show managed process status + $0 list # List all VLLM processes + $0 cleanup # Clean shutdown of all processes + $0 kill # Force kill all VLLM processes + $0 gpu # Show GPU memory usage +EOF +} + +show_status() { + print_header + echo "Managed VLLM Process Status:" + echo "----------------------------" + + if [ -f "$SCRIPT_DIR/vllm_manager.py" ]; then + python3 "$SCRIPT_DIR/vllm_manager.py" status --state-file "$STATE_FILE" + else + echo -e "${RED}Error: vllm_manager.py not found${NC}" + return 1 + fi +} + +list_all_processes() { + print_header + echo "All VLLM Processes:" + echo "-------------------" + + echo -e "${YELLOW}Searching for VLLM processes...${NC}" + + # Find all VLLM related processes + VLLM_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) + + if [ -z "$VLLM_PIDS" ]; then + echo -e "${GREEN}No VLLM processes found${NC}" + return 0 + fi + + echo -e "PID\tCOMMAND" + echo -e "---\t-------" + + for pid in $VLLM_PIDS; do + if [ -e /proc/$pid ]; then + cmd=$(ps -p $pid -o command= 2>/dev/null | cut -c1-80) + echo -e "$pid\t$cmd" + fi + done + + echo + echo -e "${YELLOW}Total VLLM processes: $(echo $VLLM_PIDS | wc -w)${NC}" +} + +cleanup_processes() { + print_header + echo "Cleaning up VLLM processes..." + echo "------------------------------" + + if [ -f "$SCRIPT_DIR/vllm_manager.py" ]; then + python3 "$SCRIPT_DIR/vllm_manager.py" cleanup --state-file "$STATE_FILE" + else + echo -e "${YELLOW}Warning: vllm_manager.py not found, using fallback method${NC}" + force_kill_processes + fi +} + +force_kill_processes() { + print_header + echo "Force killing VLLM processes..." 
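+    # Escalation strategy: SIGTERM every matched PID, give them 5 seconds to
+    # shut down gracefully, then SIGKILL whatever is still alive.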
+ echo "-------------------------------" + + # Find all VLLM processes + VLLM_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) + + if [ -z "$VLLM_PIDS" ]; then + echo -e "${GREEN}No VLLM processes found${NC}" + return 0 + fi + + echo -e "${YELLOW}Found VLLM processes: $VLLM_PIDS${NC}" + + # Kill processes + for pid in $VLLM_PIDS; do + if [ -e /proc/$pid ]; then + echo "Killing process $pid..." + kill -TERM $pid 2>/dev/null || true + fi + done + + # Wait a bit for graceful shutdown + sleep 5 + + # Force kill any remaining processes + REMAINING_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) + if [ -n "$REMAINING_PIDS" ]; then + echo -e "${YELLOW}Force killing remaining processes: $REMAINING_PIDS${NC}" + for pid in $REMAINING_PIDS; do + if [ -e /proc/$pid ]; then + kill -KILL $pid 2>/dev/null || true + fi + done + fi + + # Clean up state file + rm -f "$STATE_FILE" + + echo -e "${GREEN}Cleanup completed${NC}" +} + +show_gpu_usage() { + print_header + echo "GPU Memory Usage:" + echo "-----------------" + + if command -v nvidia-smi >/dev/null 2>&1; then + nvidia-smi --query-gpu=index,name,memory.used,memory.total,utilization.gpu --format=csv,noheader,nounits | \ + awk -F', ' 'BEGIN {printf "%-5s %-20s %-10s %-10s %-8s\n", "GPU", "Name", "Used(MB)", "Total(MB)", "Usage%"} + {printf "%-5s %-20s %-10s %-10s %-8s\n", $1, substr($2,1,20), $3, $4, $5}' + else + echo -e "${RED}Error: nvidia-smi not found${NC}" + return 1 + fi + + echo + echo "VLLM processes using GPU:" + echo "-------------------------" + + VLLM_PIDS=$(pgrep -f "vllm" 2>/dev/null || true) + if [ -n "$VLLM_PIDS" ]; then + for pid in $VLLM_PIDS; do + if [ -e /proc/$pid ]; then + gpu_mem=$(nvidia-smi --query-compute-apps=pid,used_memory --format=csv,noheader,nounits 2>/dev/null | grep "^$pid," || true) + if [ -n "$gpu_mem" ]; then + mem=$(echo "$gpu_mem" | cut -d',' -f2) + echo "PID $pid: ${mem}MB" + fi + fi + done + else + echo -e "${GREEN}No VLLM processes found${NC}" + fi +} + +show_ports() { + print_header + echo "Port Usage (6001-6008):" + echo "------------------------" + + for port in {6001..6008}; do + if netstat -tuln 2>/dev/null | grep ":$port " >/dev/null; then + pid=$(lsof -ti:$port 2>/dev/null || echo "unknown") + if [ "$pid" != "unknown" ] && [ -n "$pid" ]; then + cmd=$(ps -p $pid -o command= 2>/dev/null | cut -c1-40 || echo "unknown") + echo -e "${RED}Port $port: OCCUPIED (PID: $pid) - $cmd${NC}" + else + echo -e "${RED}Port $port: OCCUPIED${NC}" + fi + else + echo -e "${GREEN}Port $port: FREE${NC}" + fi + done + + echo + echo "All listening ports on localhost:" + echo "----------------------------------" + netstat -tuln 2>/dev/null | grep "127.0.0.1" | head -20 +} + +interactive_menu() { + while true; do + print_header + echo "Select an option:" + echo "1) Show status" + echo "2) List all processes" + echo "3) Show GPU usage" + echo "4) Show port usage" + echo "5) Cleanup processes" + echo "6) Force kill processes" + echo "7) Exit" + echo + read -p "Enter choice [1-7]: " choice + + case $choice in + 1) show_status ;; + 2) list_all_processes ;; + 3) show_gpu_usage ;; + 4) show_ports ;; + 5) cleanup_processes ;; + 6) force_kill_processes ;; + 7) echo "Goodbye!"; exit 0 ;; + *) echo -e "${RED}Invalid option. Please try again.${NC}" ;; + esac + + echo + read -p "Press Enter to continue..." 
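+        # Clear the screen before redrawing the menu on the next pass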
+        clear
+    done
+}
+
+# Main script logic
+case "${1:-}" in
+    status)
+        show_status
+        ;;
+    list)
+        list_all_processes
+        ;;
+    cleanup)
+        cleanup_processes
+        ;;
+    kill)
+        force_kill_processes
+        ;;
+    gpu)
+        show_gpu_usage
+        ;;
+    ports)
+        show_ports
+        ;;
+    help|--help|-h)
+        print_usage
+        ;;
+    "")
+        interactive_menu
+        ;;
+    *)
+        echo -e "${RED}Error: Unknown command '$1'${NC}"
+        echo
+        print_usage
+        exit 1
+        ;;
+esac
\ No newline at end of file
diff --git a/inference/run_multi_react.py b/inference/run_multi_react.py
index 1056a0a7..b487ea94 100644
--- a/inference/run_multi_react.py
+++ b/inference/run_multi_react.py
@@ -9,6 +9,60 @@ from react_agent import MultiTurnReactAgent
 import time
 import math
+import signal
+import subprocess
+import sys
+import atexit
+from pathlib import Path
+
+# Global variables for cleanup
+executor = None
+vllm_state_file = None
+cleanup_enabled = True
+_cleanup_done = False
+
+def cleanup_resources():
+    """Cleanup resources and VLLM processes."""
+    global _cleanup_done
+    # Guard against running twice: the signal handler and atexit both call this.
+    if _cleanup_done or not cleanup_enabled:
+        return
+    _cleanup_done = True
+
+    print("\nCleaning up resources...")
+
+    # Cancel running tasks
+    if executor:
+        try:
+            print("Shutting down thread pool executor...")
+            executor.shutdown(wait=False, cancel_futures=True)
+        except Exception as e:
+            print(f"Error shutting down executor: {e}")
+
+    # Clean up VLLM processes if we started them
+    if vllm_state_file and vllm_state_file.exists():
+        try:
+            script_dir = Path(__file__).parent
+            vllm_manager = script_dir / "vllm_manager.py"
+            if vllm_manager.exists():
+                print("Cleaning up VLLM processes...")
+                subprocess.run([
+                    sys.executable, str(vllm_manager), "cleanup",
+                    "--state-file", str(vllm_state_file)
+                ], check=False)
+            else:
+                print("VLLM manager not found, trying fallback cleanup...")
+                subprocess.run(["pkill", "-f", "vllm serve"], check=False)
+                subprocess.run(["pkill", "-f", "vllm.entrypoints.openai.api_server"], check=False)
+        except Exception as e:
+            print(f"Error during VLLM cleanup: {e}")
+
+    print("Cleanup completed")
+
+def signal_handler(signum, frame):
+    """Handle interrupt signals gracefully."""
+    print(f"\nReceived signal {signum}, initiating graceful shutdown...")
+    cleanup_resources()
+    sys.exit(128 + signum)
+
+# Register cleanup and signal handlers
+atexit.register(cleanup_resources)
+signal.signal(signal.SIGINT, signal_handler)
+signal.signal(signal.SIGTERM, signal_handler)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -22,8 +76,14 @@
     parser.add_argument("--roll_out_count", type=int, default=3)
     parser.add_argument("--total_splits", type=int, default=1)
     parser.add_argument("--worker_split", type=int, default=1)
+    # BooleanOptionalAction (Python 3.9+) also generates --no-cleanup-on-exit;
+    # a plain store_true with default=True could never actually be disabled.
+    parser.add_argument("--cleanup-on-exit", action=argparse.BooleanOptionalAction, default=True,
+                        help="Cleanup VLLM processes on exit (default: True)")
     args = parser.parse_args()
 
+    # Honor the flag; plain assignment at module scope updates the global.
+    cleanup_enabled = args.cleanup_on_exit
+
+    # Initialize VLLM state file path
+    script_dir = Path(__file__).parent
+    vllm_state_file = script_dir / "vllm_processes.json"
+
     model = args.model
     output_base = args.output
     roll_out_count = args.roll_out_count
@@ -36,12 +96,12 @@
         exit(1)
 
     model_name = os.path.basename(model.rstrip('/'))
-    
+
     model_dir = os.path.join(output_base, f"{model_name}_sglang")
     dataset_dir = os.path.join(model_dir, args.dataset)
-    
+
     os.makedirs(dataset_dir, exist_ok=True)
-    
+
     print(f"Model name: {model_name}")
     print(f"Data set name: {args.dataset}")
     print(f"Output directory: {dataset_dir}")
@@ -75,10 +135,10 @@
     items_per_split = math.ceil(total_items / total_splits)
     start_idx = (worker_split - 1) * items_per_split
     end_idx = min(worker_split * items_per_split, total_items)
-    
+
     # Split the dataset
     items = items[start_idx:end_idx]
-    
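+    # Report the contiguous slice this worker (1-based worker_split) will process.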
+
     print(f"Total items in dataset: {total_items}")
     print(f"Processing items {start_idx} to {end_idx-1} ({len(items)} items)")
 
@@ -87,7 +147,7 @@
         output_files = {i: os.path.join(dataset_dir, f"iter{i}_split{worker_split}of{total_splits}.jsonl") for i in range(1, roll_out_count + 1)}
     else:
         output_files = {i: os.path.join(dataset_dir, f"iter{i}.jsonl") for i in range(1, roll_out_count + 1)}
-    
+
     processed_queries_per_rollout = {}
 
     for rollout_idx in range(1, roll_out_count + 1):
@@ -171,7 +231,11 @@
     write_locks = {i: threading.Lock() for i in range(1, roll_out_count + 1)}
 
-    with ThreadPoolExecutor(max_workers=args.max_workers) as executor:
+    # Build the executor by hand (no with-block) so the module-level
+    # cleanup_resources() can reach it. This code runs at module scope, so
+    # plain assignment already rebinds the global `executor`; a `global`
+    # statement here would be a SyntaxError, since `executor` is assigned
+    # earlier in this same block.
+    executor = ThreadPoolExecutor(max_workers=args.max_workers)
+
+    # Pre-bind so the except blocks below can iterate it even if submission fails.
+    future_to_task = {}
+
+    try:
         future_to_task = {
             executor.submit(
                 test_agent._run,
@@ -224,6 +288,28 @@
                 with open(output_file, "a", encoding="utf-8") as f:
                     f.write(json.dumps(error_result, ensure_ascii=False) + "\n")
 
-    print("\nAll tasks completed!")
+        print("\nAll tasks completed!")
+
+    except KeyboardInterrupt:
+        print("\nKeyboard interrupt received, canceling tasks...")
+        # Cancel all futures
+        for future in future_to_task:
+            future.cancel()
+        raise
+    except Exception as e:
+        print(f"\nError during task execution: {e}")
+        # Cancel all futures
+        for future in future_to_task:
+            future.cancel()
+        raise
+    finally:
+        # Clean shutdown of executor
+        if executor:
+            try:
+                executor.shutdown(wait=True, cancel_futures=True)
+            except Exception as e:
+                print(f"Error shutting down executor: {e}")
+            finally:
+                executor = None
 
     print(f"\nAll {roll_out_count} rollouts completed!")
\ No newline at end of file
diff --git a/inference/run_react_infer.sh b/inference/run_react_infer.sh
index b4e865b7..dd867792 100644
--- a/inference/run_react_infer.sh
+++ b/inference/run_react_infer.sh
@@ -1,5 +1,59 @@
 #!/bin/bash
 
+# VLLM Process Management
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+STATE_FILE="$SCRIPT_DIR/vllm_processes.json"
+
+# Cleanup function
+cleanup_vllm() {
+    echo "Cleaning up VLLM processes..."
+
+    # First try to kill processes using stored PIDs
+    if [ -f "$SCRIPT_DIR/vllm_pids.txt" ]; then
+        echo "Using stored PIDs for cleanup..."
+        STORED_PIDS=$(cat "$SCRIPT_DIR/vllm_pids.txt" 2>/dev/null || echo "")
+        if [ -n "$STORED_PIDS" ]; then
+            for pid in $STORED_PIDS; do
+                if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+                    echo "Killing PID $pid..."
+                    kill -TERM "$pid" 2>/dev/null || true
+                fi
+            done
+
+            # Wait a bit for graceful shutdown
+            sleep 3
+
+            # Force kill any remaining
+            for pid in $STORED_PIDS; do
+                if [ -n "$pid" ] && kill -0 "$pid" 2>/dev/null; then
+                    echo "Force killing PID $pid..."
+                    kill -KILL "$pid" 2>/dev/null || true
+                fi
+            done
+        fi
+        rm -f "$SCRIPT_DIR/vllm_pids.txt"
+    fi
+
+    # Fallback: Use Python manager if available for comprehensive cleanup
+    if [ -f "$SCRIPT_DIR/vllm_manager.py" ]; then
+        python3 "$SCRIPT_DIR/vllm_manager.py" cleanup --state-file "$STATE_FILE" 2>/dev/null || true
+    fi
+
+    # Final fallback: Manual process killing
+    echo "Final cleanup check..."
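+    # Last resort: pattern-match any vLLM processes the PID file and the
+    # Python manager may have missed.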
+ pkill -f "vllm serve" 2>/dev/null || true + pkill -f "vllm.entrypoints.openai.api_server" 2>/dev/null || true + + # Clean up state files + rm -f "$STATE_FILE" "$SCRIPT_DIR/vllm_pids.txt" +} + +# Set up signal traps +trap 'echo "Received SIGINT, cleaning up..."; cleanup_vllm; exit 130' INT +trap 'echo "Received SIGTERM, cleaning up..."; cleanup_vllm; exit 143' TERM +trap 'cleanup_vllm' EXIT + export TORCHDYNAMO_VERBOSE=1 export TORCHDYNAMO_DISABLE=1 export NCCL_IB_TC=16 @@ -27,7 +81,7 @@ export MODEL_PATH=/your/model/path export DATASET=your_dataset_name export OUTPUT_PATH=/your/output/path export ROLLOUT_COUNT=3 # eval avg@3 -export TEMPERATURE=0.85 +export TEMPERATURE=0.85 export PRESENCE_PENALTY=1.1 export MAX_WORKERS=30 @@ -70,15 +124,52 @@ export IDP_KEY_SECRET=your_idp_key_secret ### 1. start server ### ###################################### +# You can customize the VLLM server startup by: +# 1. Commenting out GPU lines you don't have +# 2. Modifying VLLM parameters as needed +# 3. Changing ports if required +# The cleanup will work regardless of how you start the servers + echo "Starting VLLM servers..." + +# Store PIDs for cleanup tracking +VLLM_PIDS=() + +# GPU 0 - Port 6001 CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6001 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 1 - Port 6002 CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6002 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 2 - Port 6003 CUDA_VISIBLE_DEVICES=2 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6003 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 3 - Port 6004 CUDA_VISIBLE_DEVICES=3 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6004 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 4 - Port 6005 CUDA_VISIBLE_DEVICES=4 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6005 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 5 - Port 6006 CUDA_VISIBLE_DEVICES=5 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6006 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 6 - Port 6007 CUDA_VISIBLE_DEVICES=6 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6007 --disable-log-requests & +VLLM_PIDS+=($!) + +# GPU 7 - Port 6008 CUDA_VISIBLE_DEVICES=7 vllm serve $MODEL_PATH --host 0.0.0.0 --port 6008 --disable-log-requests & +VLLM_PIDS+=($!) + +# Save PIDs to state file for cleanup +echo "Started VLLM processes with PIDs: ${VLLM_PIDS[@]}" +echo "${VLLM_PIDS[@]}" > "$SCRIPT_DIR/vllm_pids.txt" ####################################################### ### 2. Waiting for the server port to be ready ### @@ -99,7 +190,7 @@ echo "Waiting for servers to start..." while true; do all_ready=true - + for port in "${main_ports[@]}"; do if [ "${server_status[$port]}" = "false" ]; then if curl -s -f http://localhost:$port/v1/models > /dev/null 2>&1; then @@ -110,27 +201,26 @@ while true; do fi fi done - + if [ "$all_ready" = "true" ]; then echo "All servers are ready for inference!" break fi - + current_time=$(date +%s) elapsed=$((current_time - start_time)) if [ $elapsed -gt $timeout ]; then echo -e "\nError: Server startup timeout after ${timeout} seconds" - + for port in "${main_ports[@]}"; do if [ "${server_status[$port]}" = "false" ]; then echo "Main model server (port $port) failed to start" fi done - exit 1 fi - + printf 'Waiting for servers to start .....' 
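+    # Poll every 10 seconds; each pass re-checks only ports not yet marked ready.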
sleep 10 done diff --git a/inference/vllm_manager.py b/inference/vllm_manager.py new file mode 100755 index 00000000..b12c5ab5 --- /dev/null +++ b/inference/vllm_manager.py @@ -0,0 +1,380 @@ +#!/usr/bin/env python3 +""" +VLLM Server Process Manager + +This utility manages VLLM server processes for the DeepResearch inference pipeline. +It provides robust process lifecycle management with proper cleanup on exit. +""" + +import os +import sys +import json +import signal +import subprocess +import time +import argparse +from pathlib import Path +from typing import List, Dict, Optional +import psutil +import atexit + +class VLLMManager: + def __init__(self, state_file: str = "vllm_processes.json"): + self.state_file = Path(state_file) + self.processes: Dict[int, subprocess.Popen] = {} + self.ports = [6001, 6002, 6003, 6004, 6005, 6006, 6007, 6008] + + # Register cleanup on exit + atexit.register(self.cleanup_all) + + # Handle signals + signal.signal(signal.SIGTERM, self._signal_handler) + signal.signal(signal.SIGINT, self._signal_handler) + + def _signal_handler(self, signum, frame): + print(f"\nReceived signal {signum}, cleaning up VLLM processes...") + self.cleanup_all() + sys.exit(0) + + def load_state(self) -> Dict: + """Load process state from file.""" + if not self.state_file.exists(): + return {} + + try: + with open(self.state_file, 'r') as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as e: + print(f"Warning: Could not load state file: {e}") + return {} + + def save_state(self, state: Dict): + """Save process state to file.""" + try: + with open(self.state_file, 'w') as f: + json.dump(state, f, indent=2) + except IOError as e: + print(f"Warning: Could not save state file: {e}") + + def is_port_in_use(self, port: int) -> bool: + """Check if a port is already in use.""" + try: + # Check if any process is using this port + for conn in psutil.net_connections(): + if conn.laddr.port == port: + return True + return False + except (psutil.AccessDenied, psutil.NoSuchProcess): + return False + + def find_vllm_processes(self) -> List[psutil.Process]: + """Find all running VLLM processes.""" + vllm_processes = [] + for proc in psutil.process_iter(['pid', 'name', 'cmdline']): + try: + cmdline_str = ' '.join(proc.info['cmdline']) if proc.info['cmdline'] else '' + if cmdline_str and ('vllm serve' in cmdline_str or 'vllm.entrypoints.openai.api_server' in cmdline_str): + vllm_processes.append(proc) + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + return vllm_processes + + def start_servers(self, model_path: str, gpu_devices: List[int] = None) -> bool: + """Start VLLM servers on specified GPUs.""" + if gpu_devices is None: + gpu_devices = list(range(8)) # Default to GPUs 0-7 + + if len(gpu_devices) > len(self.ports): + print(f"Warning: More GPUs ({len(gpu_devices)}) than ports ({len(self.ports)})") + gpu_devices = gpu_devices[:len(self.ports)] + + print(f"Starting VLLM servers on {len(gpu_devices)} GPUs...") + + state = self.load_state() + started_processes = {} + + for i, (gpu_id, port) in enumerate(zip(gpu_devices, self.ports)): + if self.is_port_in_use(port): + print(f"Port {port} is already in use, skipping GPU {gpu_id}") + continue + + print(f"Starting VLLM server on GPU {gpu_id}, port {port}...") + + env = os.environ.copy() + env['CUDA_VISIBLE_DEVICES'] = str(gpu_id) + + cmd = [ + 'vllm', 'serve', model_path, + '--host', '0.0.0.0', + '--port', str(port), + '--disable-log-requests' + ] + + try: + # Start process in new process group + process = 
subprocess.Popen(
+                    cmd,
+                    env=env,
+                    # Redirect output to a per-port log file (the name is a
+                    # local convention) instead of PIPE: nothing drains these
+                    # pipes, so a chatty server would eventually fill the
+                    # buffer and block.
+                    stdout=open(f"vllm_server_{port}.log", "w"),
+                    stderr=subprocess.STDOUT,
+                    preexec_fn=os.setsid
+                )
+
+                self.processes[port] = process
+                started_processes[str(port)] = {
+                    'pid': process.pid,
+                    'gpu_id': gpu_id,
+                    'model_path': model_path,
+                    'started_at': time.time()
+                }
+
+                print(f"Started VLLM server on port {port} (PID: {process.pid})")
+
+            except Exception as e:
+                print(f"Failed to start VLLM server on GPU {gpu_id}: {e}")
+                # Record anything that did start so cleanup can still find it
+                state.update(started_processes)
+                self.save_state(state)
+                return False
+
+        # Save state
+        state.update(started_processes)
+        self.save_state(state)
+
+        return len(started_processes) > 0
+
+    def wait_for_servers(self, timeout: int = 600) -> bool:
+        """Wait for all servers to be ready."""
+        print("Waiting for servers to start...")
+        start_time = time.time()
+
+        state = self.load_state()
+        ports_to_check = [int(port) for port in state.keys()]
+
+        ready_ports = set()
+
+        while time.time() - start_time < timeout:
+            all_ready = True
+
+            for port in ports_to_check:
+                if port in ready_ports:
+                    continue
+
+                try:
+                    import requests
+                    response = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
+                    if response.status_code == 200:
+                        print(f"Server on port {port} is ready!")
+                        ready_ports.add(port)
+                    else:
+                        all_ready = False
+                except Exception:
+                    all_ready = False
+
+            if len(ready_ports) == len(ports_to_check):
+                print("All servers are ready!")
+                return True
+
+            if not all_ready:
+                print(".", end="", flush=True)
+                time.sleep(10)
+
+        print(f"\nTimeout: Only {len(ready_ports)}/{len(ports_to_check)} servers ready")
+        return len(ready_ports) > 0
+
+    def stop_servers(self, ports: List[int] = None) -> bool:
+        """Stop VLLM servers on specified ports."""
+        state = self.load_state()
+
+        if ports is None:
+            ports = [int(port) for port in state.keys()]
+
+        success = True
+        for port in ports:
+            if not self._stop_server(port, state):
+                success = False
+
+        # Clean up state file
+        remaining_state = {k: v for k, v in state.items() if int(k) not in ports}
+        self.save_state(remaining_state)
+
+        return success
+
+    def _stop_server(self, port: int, state: Dict) -> bool:
+        """Stop a specific VLLM server."""
+        port_str = str(port)
+
+        if port_str not in state:
+            print(f"No server recorded on port {port}")
+            return True
+
+        pid = state[port_str]['pid']
+
+        try:
+            # First try graceful shutdown
+            process = psutil.Process(pid)
+            print(f"Stopping VLLM server on port {port} (PID: {pid})...")
+
+            # Try SIGTERM first
+            process.terminate()
+
+            # Wait up to 30 seconds for graceful shutdown
+            try:
+                process.wait(timeout=30)
+                print(f"Server on port {port} stopped gracefully")
+                return True
+            except psutil.TimeoutExpired:
+                print(f"Server on port {port} didn't stop gracefully, using SIGKILL...")
+                process.kill()
+                process.wait(timeout=10)
+                print(f"Server on port {port} killed")
+                return True
+
+        except psutil.NoSuchProcess:
+            print(f"Process {pid} already dead")
+            return True
+        except Exception as e:
+            print(f"Error stopping server on port {port}: {e}")
+            return False
+
+    def cleanup_all(self):
+        """Clean up all VLLM processes."""
+        print("Cleaning up all VLLM processes...")
+
+        # First try to stop servers from state
+        state = self.load_state()
+        if state:
+            self.stop_servers()
+
+        # Then find and kill any remaining VLLM processes
+        orphan_processes = self.find_vllm_processes()
+
+        for proc in orphan_processes:
+            try:
+                print(f"Killing orphan VLLM process {proc.pid}")
+                proc.terminate()
+                proc.wait(timeout=10)
+            except (psutil.NoSuchProcess, psutil.TimeoutExpired):
+                try:
+                    proc.kill()
+                except psutil.NoSuchProcess:
+                    pass
+            except Exception as 
e: + print(f"Error killing process {proc.pid}: {e}") + + # Clean up state file + if self.state_file.exists(): + self.state_file.unlink() + + print("Cleanup completed") + + def status(self) -> Dict: + """Get status of all managed servers.""" + state = self.load_state() + status_info = {} + + for port_str, info in state.items(): + port = int(port_str) + pid = info['pid'] + + try: + process = psutil.Process(pid) + is_running = process.is_running() + + # Check if port is responsive + port_responsive = False + try: + import requests + response = requests.get(f"http://localhost:{port}/v1/models", timeout=2) + port_responsive = response.status_code == 200 + except Exception: + pass + + status_info[port] = { + 'pid': pid, + 'gpu_id': info.get('gpu_id', 'unknown'), + 'model_path': info.get('model_path', 'unknown'), + 'running': is_running, + 'responsive': port_responsive, + 'uptime': time.time() - info.get('started_at', 0) + } + except psutil.NoSuchProcess: + status_info[port] = { + 'pid': pid, + 'gpu_id': info.get('gpu_id', 'unknown'), + 'model_path': info.get('model_path', 'unknown'), + 'running': False, + 'responsive': False, + 'uptime': 0 + } + + return status_info + + +def main(): + parser = argparse.ArgumentParser(description="VLLM Server Process Manager") + parser.add_argument('command', choices=['start', 'stop', 'status', 'cleanup'], + help='Command to execute') + parser.add_argument('--model', type=str, required=False, + help='Model path (required for start command)') + parser.add_argument('--gpus', type=str, default='0,1,2,3,4,5,6,7', + help='Comma-separated list of GPU IDs') + parser.add_argument('--ports', type=str, + help='Comma-separated list of ports to operate on') + parser.add_argument('--timeout', type=int, default=600, + help='Timeout for server startup (seconds)') + parser.add_argument('--state-file', type=str, default='vllm_processes.json', + help='State file path') + + args = parser.parse_args() + + manager = VLLMManager(args.state_file) + + if args.command == 'start': + if not args.model: + print("Error: --model is required for start command") + sys.exit(1) + + gpu_devices = [int(x.strip()) for x in args.gpus.split(',')] + + if manager.start_servers(args.model, gpu_devices): + if manager.wait_for_servers(args.timeout): + print("All servers started successfully!") + sys.exit(0) + else: + print("Some servers failed to start") + sys.exit(1) + else: + print("Failed to start servers") + sys.exit(1) + + elif args.command == 'stop': + ports = None + if args.ports: + ports = [int(x.strip()) for x in args.ports.split(',')] + + if manager.stop_servers(ports): + print("Servers stopped successfully") + sys.exit(0) + else: + print("Some servers failed to stop") + sys.exit(1) + + elif args.command == 'status': + status_info = manager.status() + if not status_info: + print("No managed servers found") + else: + print("VLLM Server Status:") + print("-" * 80) + for port, info in status_info.items(): + status = "RUNNING" if info['running'] else "STOPPED" + responsive = "RESPONSIVE" if info['responsive'] else "NOT RESPONSIVE" + uptime_str = f"{info['uptime']:.1f}s" if info['uptime'] > 0 else "N/A" + print(f"Port {port:4d}: {status:8s} | {responsive:14s} | " + f"PID {info['pid']:6d} | GPU {info['gpu_id']} | Uptime {uptime_str}") + + elif args.command == 'cleanup': + manager.cleanup_all() + print("Cleanup completed") + + +if __name__ == '__main__': + main() \ No newline at end of file