Skip to content
80 changes: 73 additions & 7 deletions src/core/containers/runtime/providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,48 +138,72 @@ def start_container(
port: Port to expose (if None, finds available port)
env_vars: Environment variables for the container
**kwargs: Additional Docker run options
- memory_gb: Memory limit in GB (default: 16GB)
- command_override: List of command args to override container CMD

Returns:
Base URL to connect to the container
"""
import subprocess
import time
import logging

logger = logging.getLogger(__name__)

# Find available port if not specified
if port is None:
port = self._find_available_port()

# Use default memory limit if not specified
memory_gb = kwargs.get("memory_gb", 16)

# Generate container name
self._container_name = self._generate_container_name(image)

# Build docker run command
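# Use host networking for better performance and consistency with podman.
# NOTE: with --network host the container shares the host's network stack and
# Docker discards any -p port mapping, so the server inside the container must
# itself listen on `port` (see the PORT environment variable passed below).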
# NOTE: Do NOT use --rm initially - if container fails to start, we need logs
cmd = [
"docker", "run",
"-d", # Detached
"--name", self._container_name,
"-p", f"{port}:8000", # Map port
"--network", "host", # Use host network
"--memory", f"{memory_gb}g", # Limit container memory
"--memory-swap", f"{memory_gb}g", # Prevent swap usage (set equal to --memory)
"--oom-kill-disable=false", # Allow OOM killer (exit gracefully)
]

# Add environment variables
if env_vars:
for key, value in env_vars.items():
cmd.extend(["-e", f"{key}={value}"])

# Pass custom port via environment variable instead of overriding command
# This allows the container to use its proper entrypoint/CMD
if port != 8000:
cmd.extend(["-e", f"PORT={port}"])

# Add image
cmd.append(image)

# Add command override if provided (explicit override by user)
if "command_override" in kwargs:
cmd.extend(kwargs["command_override"])

# Run container
try:
logger.debug(f"Starting container with command: {' '.join(cmd)}")
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
self._container_id = result.stdout.strip()
logger.debug(f"Container started with ID: {self._container_id}")
except subprocess.CalledProcessError as e:
error_msg = f"Failed to start Docker container.\nCommand: {' '.join(cmd)}\nExit code: {e.returncode}\nStderr: {e.stderr}\nStdout: {e.stdout}"
raise RuntimeError(error_msg) from e

# Wait a moment for container to start
time.sleep(1)

base_url = f"http://localhost:{port}"
base_url = f"http://127.0.0.1:{port}"
return base_url

def stop_container(self) -> None:
Expand Down Expand Up @@ -227,23 +251,65 @@ def wait_for_ready(self, base_url: str, timeout_s: float = 30.0) -> None:
"""
import time
import requests
import subprocess
import logging

start_time = time.time()
health_url = f"{base_url}/health"
last_error = None

while time.time() - start_time < timeout_s:
try:
response = requests.get(health_url, timeout=2.0)
if response.status_code == 200:
return
except requests.RequestException:
pass
except requests.RequestException as e:
last_error = str(e)

time.sleep(0.5)

raise TimeoutError(
f"Container at {base_url} did not become ready within {timeout_s}s"
)
# If we timeout, provide diagnostic information
error_msg = f"Container at {base_url} did not become ready within {timeout_s}s"

if self._container_id:
try:
# First check if container exists
inspect_result = subprocess.run(
["docker", "inspect", self._container_id],
capture_output=True,
text=True,
timeout=5,
)

if inspect_result.returncode != 0:
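# docker inspect failed, so the container no longer exists. The run command built
# in start_container does not pass --rm, so it was presumably removed by something
# else (e.g. manual cleanup) after exiting.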
error_msg += f"\n\nContainer was auto-removed (likely exited immediately)."
error_msg += f"\nThis typically means:"
error_msg += f"\n 1. The container image has an error in its startup script"
error_msg += f"\n 2. Required dependencies are missing in the container"
error_msg += f"\n 3. Port {base_url.split(':')[-1]} might be in use by another process"
error_msg += f"\n 4. Container command/entrypoint is misconfigured"
error_msg += f"\nTry running the container manually to debug:"
error_msg += f"\n docker run -it --rm <IMAGE_NAME>"
else:
# Container exists, try to get logs
result = subprocess.run(
["docker", "logs", "--tail", "50", self._container_id],
capture_output=True,
text=True,
timeout=5,
)
if result.stdout or result.stderr:
error_msg += f"\n\nContainer logs (last 50 lines):\n{result.stdout}\n{result.stderr}"
except subprocess.TimeoutExpired:
error_msg += f"\n\nTimeout while trying to inspect container"
except Exception as e:
error_msg += f"\n\nFailed to get container diagnostics: {e}"

if last_error:
error_msg += f"\n\nLast connection error: {last_error}"

raise TimeoutError(error_msg)

def _find_available_port(self) -> int:
"""
Expand Down
12 changes: 8 additions & 4 deletions src/core/http_env_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,18 @@ def from_docker_image(
if provider is None:
provider = LocalDockerProvider()

# Extract timeout_s from kwargs for wait_for_ready, with a default
timeout_s = kwargs.pop('timeout_s', 30.0)
request_timeout_s = kwargs.pop('request_timeout_s', 15.0)

# 1. Start container with optional kwargs (e.g., env_vars, port)
base_url = provider.start_container(image, **kwargs)

# 2. Wait for server to be ready
provider.wait_for_ready(base_url)
# 2. Wait for server to be ready with the specified timeout
provider.wait_for_ready(base_url, timeout_s=timeout_s)

# 3. Create and return client instance with provider reference
return cls(base_url=base_url, provider=provider)
# 3. Create and return client instance with provider reference and request timeout
return cls(base_url=base_url, request_timeout_s=request_timeout_s, provider=provider)

@classmethod
def from_hub(cls: Type[EnvClientT], repo_id: str, provider: Optional["ContainerProvider"] = None, **kwargs: Any) -> EnvClientT:
Expand Down
5 changes: 4 additions & 1 deletion src/core/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@

from .git_server_client import GitServerClient, RepoInfo
from .local_python_executor import PyExecutor
from .local_julia_executor import JuliaExecutor


# Public names of this package: the ones exported by a star-import.
# Each entry is brought into scope by the relative imports above.
__all__ = [
    "PyExecutor",
    "JuliaExecutor",
    "GitServerClient",
    "RepoInfo",
]
Loading