30 changes: 23 additions & 7 deletions openhands-sdk/openhands/sdk/llm/llm.py
@@ -73,7 +73,7 @@
from openhands.sdk.llm.options.chat_options import select_chat_options
from openhands.sdk.llm.options.responses_options import select_responses_options
from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot
from openhands.sdk.llm.utils.model_features import get_features
from openhands.sdk.llm.utils.model_features import get_default_temperature, get_features
from openhands.sdk.llm.utils.retry_mixin import RetryMixin
from openhands.sdk.llm.utils.telemetry import Telemetry
from openhands.sdk.logger import ENV_LOG_DIR, get_logger
@@ -149,7 +149,14 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
description="Approx max chars in each event/content sent to the LLM.",
)

temperature: float | None = Field(default=0.0, ge=0)
temperature: float | None = Field(
default=None,
ge=0,
description=(
"Sampling temperature for response generation. "
"Defaults to 0 for most models and provider default for reasoning models."
),
)
top_p: float | None = Field(default=1.0, ge=0, le=1)
top_k: float | None = Field(default=None, ge=0)

@@ -375,9 +382,13 @@ def _set_env_side_effects(self):
# Capabilities + model info
self._init_model_info_and_caps()

if self.temperature is None:
self.temperature = get_default_temperature(self.model)

logger.debug(
f"LLM ready: model={self.model} base_url={self.base_url} "
f"reasoning_effort={self.reasoning_effort}"
f"reasoning_effort={self.reasoning_effort} "
f"temperature={self.temperature}"
)
return self
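
Note: the net effect of the resolution above, as a minimal sketch. This is
not part of the diff; it assumes LLM is importable from openhands.sdk.llm
and constructible with just a model name (any other required fields omitted
for brevity), with validation running _set_env_side_effects as wired here:

    from openhands.sdk.llm import LLM

    # temperature left unset (None) is resolved per model at init
    assert LLM(model="kimi-k2-thinking").temperature == 1.0
    assert LLM(model="gpt-4o").temperature == 0.0

    # an explicitly passed value always wins over the per-model default
    assert LLM(model="kimi-k2-thinking", temperature=0.2).temperature == 0.2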

@@ -826,7 +837,12 @@ def _init_model_info_and_caps(self) -> None:
if self.max_output_tokens is None:
if any(
m in self.model
for m in ["claude-3-7-sonnet", "claude-3.7-sonnet", "claude-sonnet-4"]
for m in [
"claude-3-7-sonnet",
"claude-3.7-sonnet",
"claude-sonnet-4",
"kimi-k2-thinking",
]
):
self.max_output_tokens = (
64000 # practical cap (litellm may allow 128k with header)
@@ -932,9 +948,9 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]:
message.cache_enabled = self.is_caching_prompt_active()
message.vision_enabled = self.vision_is_active()
message.function_calling_enabled = self.native_tool_calling
message.force_string_serializer = get_features(
self.model
).force_string_serializer
model_features = get_features(self.model)
message.force_string_serializer = model_features.force_string_serializer
message.send_reasoning_content = model_features.send_reasoning_content

formatted_messages = [message.to_chat_dict() for message in messages]

21 changes: 19 additions & 2 deletions openhands-sdk/openhands/sdk/llm/message.py
@@ -217,8 +217,21 @@ class Message(BaseModel):
# - tool execution result (to LLM)
tool_call_id: str | None = None
name: str | None = None # name of the tool
# force string serializer
force_string_serializer: bool = False
force_string_serializer: bool = Field(
default=False,
description=(
"Force using string content serializer when sending to LLM API. "
"Useful for providers that do not support list content, "
"like HuggingFace and Groq."
),
)
send_reasoning_content: bool = Field(
default=False,
description=(
"Whether to include the full reasoning content when sending to the LLM. "
"Useful for models that support extended reasoning, like Kimi-K2-thinking."
),
)
# reasoning content (from reasoning models like o1, Claude thinking, DeepSeek R1)
reasoning_content: str | None = Field(
default=None,
@@ -279,6 +292,10 @@ def to_chat_dict(self) -> dict[str, Any]:
message_dict["tool_call_id"] = self.tool_call_id
message_dict["name"] = self.name

# Required for models like kimi-k2-thinking
if self.send_reasoning_content and self.reasoning_content:
message_dict["reasoning_content"] = self.reasoning_content

return message_dict

def _string_serializer(self) -> dict[str, Any]:
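
For illustration, how the new send_reasoning_content gate in to_chat_dict
behaves (a sketch mirroring the tests added later in this PR; Message and
TextContent are the classes shown above):

    from openhands.sdk.llm.message import Message, TextContent

    msg = Message(
        role="assistant",
        content=[TextContent(text="Final answer")],
        reasoning_content="Let me think step by step...",
        send_reasoning_content=True,
    )
    assert msg.to_chat_dict()["reasoning_content"] == "Let me think step by step..."

    # With the flag off (the default), or with an empty/None
    # reasoning_content, the key is omitted entirely.
    msg_off = msg.model_copy(update={"send_reasoning_content": False})
    assert "reasoning_content" not in msg_off.to_chat_dict()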
27 changes: 27 additions & 0 deletions openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -23,6 +23,7 @@ class ModelFeatures:
supports_stop_words: bool
supports_responses_api: bool
force_string_serializer: bool
send_reasoning_content: bool


# Pattern tables capturing current behavior. Keep patterns lowercase.
@@ -99,6 +100,12 @@ class ModelFeatures:
"groq/kimi-k2-instruct", # explicit provider-prefixed IDs
]

# Models for which we should send the full reasoning content
# back in the message input
SEND_REASONING_CONTENT_PATTERNS: list[str] = [
"kimi-k2-thinking",
]


def get_features(model: str) -> ModelFeatures:
"""Get model features."""
@@ -111,4 +118,24 @@ def get_features(model: str) -> ModelFeatures:
),
supports_responses_api=model_matches(model, RESPONSES_API_PATTERNS),
force_string_serializer=model_matches(model, FORCE_STRING_SERIALIZER_PATTERNS),
send_reasoning_content=model_matches(model, SEND_REASONING_CONTENT_PATTERNS),
)


# Default temperature mapping.
# Each entry: (pattern, default_temperature)
DEFAULT_TEMPERATURE_PATTERNS: list[tuple[str, float]] = [
("kimi-k2-thinking", 1.0),
]


def get_default_temperature(model: str) -> float:
"""Return the default temperature for a given model pattern.

Uses case-insensitive substring matching via model_matches.
The last entry with '*' is treated as a wildcard fallback.
"""
for pattern, value in DEFAULT_TEMPERATURE_PATTERNS:
if model_matches(model, [pattern]):
return value
return 0.0
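
Usage of the two new helpers, for reference (mirrors the tests added later
in this PR):

    from openhands.sdk.llm.utils.model_features import (
        get_default_temperature,
        get_features,
    )

    assert get_features("moonshot/kimi-k2-thinking").send_reasoning_content is True
    assert get_features("kimi-k2-instruct").send_reasoning_content is False
    assert get_default_temperature("Kimi-K2-Thinking") == 1.0  # case-insensitive
    assert get_default_temperature("gpt-4o") == 0.0  # fallback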
4 changes: 4 additions & 0 deletions openhands-sdk/openhands/sdk/llm/utils/telemetry.py
@@ -149,6 +149,10 @@ def _record_usage(
if p_details is not None:
cache_read = int(getattr(p_details, "cached_tokens", 0) or 0)

# Kimi-K2-thinking populates the top-level usage.cached_tokens field
if not cache_read and hasattr(usage, "cached_tokens"):
cache_read = int(getattr(usage, "cached_tokens", 0) or 0)

reasoning_tokens = 0
c_details = getattr(usage, "completion_tokens_details", None) or getattr(
usage, "output_tokens_details", None
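
A self-contained sketch of the fallback added above, with SimpleNamespace
standing in for a litellm usage object (litellm normally reports cached
tokens under prompt_tokens_details; Kimi-K2-thinking responses expose a
top-level cached_tokens instead):

    from types import SimpleNamespace

    usage = SimpleNamespace(prompt_tokens_details=None, cached_tokens=42)

    cache_read = 0
    p_details = getattr(usage, "prompt_tokens_details", None)
    if p_details is not None:
        cache_read = int(getattr(p_details, "cached_tokens", 0) or 0)
    if not cache_read and hasattr(usage, "cached_tokens"):
        cache_read = int(getattr(usage, "cached_tokens", 0) or 0)
    assert cache_read == 42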
10 changes: 2 additions & 8 deletions openhands-sdk/openhands/sdk/llm/utils/verified_models.py
@@ -37,19 +37,13 @@
VERIFIED_OPENHANDS_MODELS = [
"claude-sonnet-4-5-20250929",
"claude-haiku-4-5-20251001",
"gpt-5-codex",
"gpt-5-2025-08-07",
"gpt-5-codex",
"kimi-k2-thinking",
"gpt-5-mini-2025-08-07",
"claude-sonnet-4-20250514",
"claude-opus-4-20250514",
"claude-opus-4-1-20250805",
"devstral-small-2507",
"devstral-medium-2507",
"o3",
"o4-mini",
"gemini-2.5-pro",
"kimi-k2-0711-preview",
"qwen3-coder-480b",
]


@@ -65,6 +65,7 @@ def create_completion_result(content: str) -> LLMResponse:
mock_llm.base_url = None
mock_llm.reasoning_effort = None
mock_llm.litellm_extra_body = {}
mock_llm.temperature = 0.0

# Explicitly set pricing attributes required by LLM -> Telemetry wiring
mock_llm.input_cost_per_token = None
1 change: 1 addition & 0 deletions tests/sdk/llm/test_llm_no_response_retry.py
@@ -48,6 +48,7 @@ def base_llm() -> LLM:
num_retries=2,
retry_min_wait=1,
retry_max_wait=2,
temperature=0.0, # Explicitly set to test temperature bump behavior
)


103 changes: 103 additions & 0 deletions tests/sdk/llm/test_message.py
@@ -267,3 +267,106 @@ def test_text_content_truncation_exact_limit():
# Check that text was not truncated
assert len(result) == 1
assert result[0]["text"] == exact_text


def test_message_with_reasoning_content_when_enabled():
"""Test that reasoning_content is included when send_reasoning_content is True."""
from openhands.sdk.llm.message import Message, TextContent

message = Message(
role="assistant",
content=[TextContent(text="Final answer")],
reasoning_content="Let me think step by step...",
send_reasoning_content=True,
)

result = message.to_chat_dict()
assert result["role"] == "assistant"
assert result["content"] == "Final answer"
assert result["reasoning_content"] == "Let me think step by step..."


def test_message_with_reasoning_content_when_disabled():
"""Test that reasoning_content is NOT included when send_reasoning_content is False.""" # noqa: E501
from openhands.sdk.llm.message import Message, TextContent

message = Message(
role="assistant",
content=[TextContent(text="Final answer")],
reasoning_content="Let me think step by step...",
send_reasoning_content=False,
)

result = message.to_chat_dict()
assert result["role"] == "assistant"
assert result["content"] == "Final answer"
assert "reasoning_content" not in result


def test_message_with_reasoning_content_default_disabled():
"""Test that reasoning_content is NOT included by default."""
from openhands.sdk.llm.message import Message, TextContent

message = Message(
role="assistant",
content=[TextContent(text="Final answer")],
reasoning_content="Let me think step by step...",
)

result = message.to_chat_dict()
assert result["role"] == "assistant"
assert result["content"] == "Final answer"
assert "reasoning_content" not in result


def test_message_with_reasoning_content_none():
"""Test that reasoning_content is NOT included when it's None even if enabled."""
from openhands.sdk.llm.message import Message, TextContent

message = Message(
role="assistant",
content=[TextContent(text="Final answer")],
reasoning_content=None,
send_reasoning_content=True,
)

result = message.to_chat_dict()
assert result["role"] == "assistant"
assert result["content"] == "Final answer"
assert "reasoning_content" not in result


def test_message_with_reasoning_content_empty_string():
"""Test that reasoning_content is NOT included when it's an empty string."""
from openhands.sdk.llm.message import Message, TextContent

message = Message(
role="assistant",
content=[TextContent(text="Final answer")],
reasoning_content="",
send_reasoning_content=True,
)

result = message.to_chat_dict()
assert result["role"] == "assistant"
assert result["content"] == "Final answer"
assert "reasoning_content" not in result


def test_message_with_reasoning_content_list_serializer():
"""Test that reasoning_content works with list serializer."""
from openhands.sdk.llm.message import Message, TextContent

message = Message(
role="assistant",
content=[TextContent(text="Final answer")],
reasoning_content="Step by step reasoning",
send_reasoning_content=True,
function_calling_enabled=True, # Forces list serializer
)

result = message.to_chat_dict()
assert result["role"] == "assistant"
assert isinstance(result["content"], list)
assert result["content"][0]["text"] == "Final answer"
assert result["reasoning_content"] == "Step by step reasoning"
69 changes: 69 additions & 0 deletions tests/sdk/llm/test_model_features.py
@@ -1,6 +1,7 @@
import pytest

from openhands.sdk.llm.utils.model_features import (
get_default_temperature,
get_features,
model_matches,
)
@@ -240,3 +241,71 @@ def test_force_string_serializer_full_model_names():
assert get_features("Kimi K2-Instruct-0905").force_string_serializer is False
# Groq-prefixed Kimi should force string serializer
assert get_features("groq/kimi-k2-instruct-0905").force_string_serializer is True


@pytest.mark.parametrize(
"model,expected_send_reasoning",
[
("kimi-k2-thinking", True),
("kimi-k2-thinking-0905", True),
("Kimi-K2-Thinking", True), # Case insensitive
("moonshot/kimi-k2-thinking", True), # With provider prefix
("kimi-k2-instruct", False), # Different variant
("gpt-4o", False),
("claude-3-5-sonnet", False),
("o1", False),
("unknown-model", False),
],
)
def test_send_reasoning_content_support(model, expected_send_reasoning):
"""Test that models like kimi-k2-thinking require send_reasoning_content."""
features = get_features(model)
assert features.send_reasoning_content is expected_send_reasoning


@pytest.mark.parametrize(
"model,expected_temperature",
[
# kimi-k2-thinking models should default to 1.0
("kimi-k2-thinking", 1.0),
("kimi-k2-thinking-0905", 1.0),
("Kimi-K2-Thinking", 1.0), # Case insensitive
("moonshot/kimi-k2-thinking", 1.0), # With provider prefix
("litellm_proxy/kimi-k2-thinking", 1.0), # With litellm proxy prefix
# All other models should default to 0.0
("kimi-k2-instruct", 0.0), # Different kimi variant
("gpt-4", 0.0),
("gpt-4o", 0.0),
("gpt-4o-mini", 0.0),
("claude-3-5-sonnet", 0.0),
("claude-3-7-sonnet", 0.0),
("gemini-1.5-pro", 0.0),
("gemini-2.5-pro-experimental", 0.0),
("o1", 0.0),
("o1-mini", 0.0),
("o3", 0.0),
("deepseek-chat", 0.0),
("llama-3.1-70b", 0.0),
("azure/gpt-4", 0.0),
("openai/gpt-4o", 0.0),
("anthropic/claude-3-5-sonnet", 0.0),
("unknown-model", 0.0),
],
)
def test_get_default_temperature(model, expected_temperature):
"""Test that get_default_temperature returns correct values for different models."""
assert get_default_temperature(model) == expected_temperature


def test_get_default_temperature_fallback():
"""Test that get_default_temperature returns 0.0 for unknown models."""
assert get_default_temperature("completely-unknown-model-12345") == 0.0
assert get_default_temperature("some-random-model") == 0.0


def test_get_default_temperature_case_insensitive():
"""Test that get_default_temperature is case insensitive."""
assert get_default_temperature("kimi-k2-thinking") == 1.0
assert get_default_temperature("KIMI-K2-THINKING") == 1.0
assert get_default_temperature("Kimi-K2-Thinking") == 1.0
assert get_default_temperature("KiMi-k2-ThInKiNg") == 1.0