 import time
 import warnings
 from functools import partial
-from typing import Any, Callable, TypeGuard
+from typing import Any, Callable, TypeGuard, cast

 import httpx

@@ -100,6 +100,8 @@ def __init__(

         self.model_info: ModelInfo | None = None
         self._function_calling_active: bool = False
+        self._max_input_tokens: int | None = self.config.max_input_tokens
+        self._max_output_tokens: int | None = self.config.max_output_tokens
         self.retry_listener = retry_listener
         if self.config.log_completions:
             if self.config.log_completions_folder is None:
@@ -139,15 +141,6 @@ def __init__(
             # openai doesn't expose top_p, but litellm does
             kwargs["top_p"] = self.config.top_p

-        # Handle OpenHands provider - rewrite to litellm_proxy
-        if self.config.model.startswith("openhands/"):
-            model_name = self.config.model.removeprefix("openhands/")
-            self.config.model = f"litellm_proxy/{model_name}"
-            self.config.base_url = "https://llm-proxy.app.all-hands.dev/"
-            logger.debug(
-                f"Rewrote openhands/{model_name} to {self.config.model} with base URL {self.config.base_url}"  # noqa: E501
-            )
-
         features = get_features(self.config.model)
         if features.supports_reasoning_effort:
             # For Gemini models, only map 'low' to optimized thinking budget
@@ -229,7 +222,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
             if "stream" in kwargs and kwargs["stream"]:
                 raise ValueError("Streaming is not supported in LLM class.")

-            messages_kwarg: list[dict[str, Any]] | dict[str, Any] = []
+            messages_kwarg: (
+                dict[str, Any] | Message | list[dict[str, Any]] | list[Message]
+            ) = []
             mock_function_calling = not self.is_function_calling_active()

             # some callers might send the model and messages directly
@@ -248,9 +243,19 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
                 messages_kwarg = kwargs["messages"]

             # ensure we work with a list of messages
-            messages: list[dict[str, Any]] = (
+            messages_list = (
                 messages_kwarg if isinstance(messages_kwarg, list) else [messages_kwarg]
             )
+            # format Message objects to dict if needed
+            messages: list[dict] = []
+            if messages_list and isinstance(messages_list[0], Message):
+                messages = self.format_messages_for_llm(
+                    cast(list[Message], messages_list)
+                )
+            else:
+                messages = cast(list[dict[str, Any]], messages_list)
+
+            kwargs["messages"] = messages

             # handle conversion of to non-function calling messages if needed
             original_fncall_messages = copy.deepcopy(messages)
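The hunk above widens what the wrapper accepts: callers may now pass a single dict, a single Message, or a list of either, and the wrapper normalizes everything to the list-of-dicts shape before writing it back into kwargs["messages"]. Below is a minimal, self-contained sketch of that normalization under assumed names: SimpleMessage and its to_dict method stand in for the SDK's Message type and format_messages_for_llm, which are not reproduced here.

    # Illustrative sketch only; SimpleMessage/to_dict are hypothetical stand-ins
    # for the SDK's Message and format_messages_for_llm.
    from __future__ import annotations

    from dataclasses import dataclass
    from typing import Any, cast


    @dataclass
    class SimpleMessage:
        role: str
        content: str

        def to_dict(self) -> dict[str, Any]:
            return {"role": self.role, "content": self.content}


    def normalize_messages(
        messages_kwarg: dict[str, Any]
        | SimpleMessage
        | list[dict[str, Any]]
        | list[SimpleMessage],
    ) -> list[dict[str, Any]]:
        # single item -> list, mirroring the wrapper above
        messages_list = (
            messages_kwarg if isinstance(messages_kwarg, list) else [messages_kwarg]
        )
        # Message-like objects are formatted to dicts; plain dicts pass through
        if messages_list and isinstance(messages_list[0], SimpleMessage):
            return [m.to_dict() for m in cast(list[SimpleMessage], messages_list)]
        return cast(list[dict[str, Any]], messages_list)


    assert normalize_messages({"role": "user", "content": "hi"}) == [
        {"role": "user", "content": "hi"}
    ]
    assert normalize_messages([SimpleMessage("user", "hi")]) == [
        {"role": "user", "content": "hi"}
    ]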
@@ -408,6 +413,14 @@ def _all_choices(

         self._completion = wrapper

+    @property
+    def max_input_tokens(self) -> int | None:
+        return self._max_input_tokens
+
+    @property
+    def max_output_tokens(self) -> int | None:
+        return self._max_output_tokens
+
     @property
     def completion(self) -> Callable:
         """Decorator for the litellm completion function.
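The new max_input_tokens and max_output_tokens properties expose the limits that __init__ now caches on private attributes, so the resolved values no longer have to be written back onto the shared config object. A stripped-down sketch of that accessor pattern follows; TokenLimits is an illustrative name, not part of the SDK.

    # Illustrative only: mirrors the private-attribute-plus-property pattern above.
    from __future__ import annotations


    class TokenLimits:
        def __init__(
            self, max_input_tokens: int | None, max_output_tokens: int | None
        ) -> None:
            # cached once at construction, analogous to self._max_*_tokens in __init__
            self._max_input_tokens = max_input_tokens
            self._max_output_tokens = max_output_tokens

        @property
        def max_input_tokens(self) -> int | None:
            return self._max_input_tokens

        @property
        def max_output_tokens(self) -> int | None:
            return self._max_output_tokens


    limits = TokenLimits(None, 4096)
    assert limits.max_input_tokens is None
    assert limits.max_output_tokens == 4096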
@@ -483,41 +496,34 @@ def init_model_info(self) -> None:
             f"Model info: {json.dumps({'model': self.config.model, 'base_url': self.config.base_url}, indent=2)}"  # noqa: E501
         )

-        if self.config.model.startswith("huggingface"):
-            # HF doesn't support the OpenAI default value for top_p (1)
-            logger.debug(
-                f"Setting top_p to 0.9 for Hugging Face model: {self.config.model}"
-            )
-            self.config.top_p = 0.9 if self.config.top_p == 1 else self.config.top_p
-
         # Set max_input_tokens from model info if not explicitly set
         if (
-            self.config.max_input_tokens is None
+            self._max_input_tokens is None
             and self.model_info is not None
            and "max_input_tokens" in self.model_info
             and isinstance(self.model_info["max_input_tokens"], int)
         ):
-            self.config.max_input_tokens = self.model_info["max_input_tokens"]
+            self._max_input_tokens = self.model_info["max_input_tokens"]

         # Set max_output_tokens from model info if not explicitly set
-        if self.config.max_output_tokens is None:
+        if self._max_output_tokens is None:
             # Special case for Claude 3.7 Sonnet models
             if any(
                 model in self.config.model
                 for model in ["claude-3-7-sonnet", "claude-3.7-sonnet"]
             ):
-                self.config.max_output_tokens = 64000  # litellm set max to 128k, but that requires a header to be set  # noqa: E501
+                self._max_output_tokens = 64000  # litellm set max to 128k, but that requires a header to be set  # noqa: E501
             # Try to get from model info
             elif self.model_info is not None:
                 # max_output_tokens has precedence over max_tokens
                 if "max_output_tokens" in self.model_info and isinstance(
                     self.model_info["max_output_tokens"], int
                 ):
-                    self.config.max_output_tokens = self.model_info["max_output_tokens"]
+                    self._max_output_tokens = self.model_info["max_output_tokens"]
                 elif "max_tokens" in self.model_info and isinstance(
                     self.model_info["max_tokens"], int
                 ):
-                    self.config.max_output_tokens = self.model_info["max_tokens"]
+                    self._max_output_tokens = self.model_info["max_tokens"]

         # Initialize function calling using centralized model features
         features = get_features(self.config.model)
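With the config writes removed, init_model_info now resolves the output-token limit into self._max_output_tokens in a fixed order: an explicitly configured value wins, then the Claude 3.7 Sonnet special case, then the model info fields. Here is a self-contained sketch of that resolution order with the instance state flattened into plain arguments; resolve_max_output_tokens is an illustrative helper, not code from this PR.

    # Illustrative helper showing the fallback order described above.
    from __future__ import annotations

    from typing import Any


    def resolve_max_output_tokens(
        configured: int | None,
        model: str,
        model_info: dict[str, Any] | None,
    ) -> int | None:
        if configured is not None:
            return configured  # an explicitly configured limit always wins
        if any(m in model for m in ["claude-3-7-sonnet", "claude-3.7-sonnet"]):
            return 64000  # litellm allows up to 128k, but that needs an extra header
        if model_info is not None:
            # max_output_tokens has precedence over max_tokens
            if isinstance(model_info.get("max_output_tokens"), int):
                return model_info["max_output_tokens"]
            if isinstance(model_info.get("max_tokens"), int):
                return model_info["max_tokens"]
        return None


    assert resolve_max_output_tokens(None, "claude-3-7-sonnet-20250219", None) == 64000
    assert resolve_max_output_tokens(None, "gpt-4o", {"max_tokens": 16384}) == 16384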