diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py
index b865553237..c30e7a54e4 100644
--- a/openhands-sdk/openhands/sdk/llm/llm.py
+++ b/openhands-sdk/openhands/sdk/llm/llm.py
@@ -73,7 +73,7 @@
 from openhands.sdk.llm.options.chat_options import select_chat_options
 from openhands.sdk.llm.options.responses_options import select_responses_options
 from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot
-from openhands.sdk.llm.utils.model_features import get_features
+from openhands.sdk.llm.utils.model_features import get_default_temperature, get_features
 from openhands.sdk.llm.utils.retry_mixin import RetryMixin
 from openhands.sdk.llm.utils.telemetry import Telemetry
 from openhands.sdk.logger import ENV_LOG_DIR, get_logger
@@ -149,7 +149,14 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
         description="Approx max chars in each event/content sent to the LLM.",
     )
 
-    temperature: float | None = Field(default=0.0, ge=0)
+    temperature: float | None = Field(
+        default=None,
+        ge=0,
+        description=(
+            "Sampling temperature for response generation. Defaults to 0 for "
+            "most models and to the provider default for reasoning models."
+        ),
+    )
     top_p: float | None = Field(default=1.0, ge=0, le=1)
     top_k: float | None = Field(default=None, ge=0)
 
@@ -375,9 +382,13 @@ def _set_env_side_effects(self):
         # Capabilities + model info
         self._init_model_info_and_caps()
 
+        if self.temperature is None:
+            self.temperature = get_default_temperature(self.model)
+
         logger.debug(
             f"LLM ready: model={self.model} base_url={self.base_url} "
-            f"reasoning_effort={self.reasoning_effort}"
+            f"reasoning_effort={self.reasoning_effort} "
+            f"temperature={self.temperature}"
         )
         return self
 
@@ -826,7 +837,12 @@ def _init_model_info_and_caps(self) -> None:
         if self.max_output_tokens is None:
             if any(
                 m in self.model
-                for m in ["claude-3-7-sonnet", "claude-3.7-sonnet", "claude-sonnet-4"]
+                for m in [
+                    "claude-3-7-sonnet",
+                    "claude-3.7-sonnet",
+                    "claude-sonnet-4",
+                    "kimi-k2-thinking",
+                ]
             ):
                 self.max_output_tokens = (
                     64000  # practical cap (litellm may allow 128k with header)
@@ -932,9 +948,9 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]:
             message.cache_enabled = self.is_caching_prompt_active()
             message.vision_enabled = self.vision_is_active()
             message.function_calling_enabled = self.native_tool_calling
-            message.force_string_serializer = get_features(
-                self.model
-            ).force_string_serializer
+            model_features = get_features(self.model)
+            message.force_string_serializer = model_features.force_string_serializer
+            message.send_reasoning_content = model_features.send_reasoning_content
 
         formatted_messages = [message.to_chat_dict() for message in messages]
 
diff --git a/openhands-sdk/openhands/sdk/llm/message.py b/openhands-sdk/openhands/sdk/llm/message.py
index fb70135ac2..d14d3a20a3 100644
--- a/openhands-sdk/openhands/sdk/llm/message.py
+++ b/openhands-sdk/openhands/sdk/llm/message.py
@@ -217,8 +217,21 @@ class Message(BaseModel):
     # - tool execution result (to LLM)
     tool_call_id: str | None = None
     name: str | None = None  # name of the tool
-    # force string serializer
-    force_string_serializer: bool = False
+    force_string_serializer: bool = Field(
+        default=False,
+        description=(
+            "Force using the string content serializer when sending to the LLM API. "
+            "Useful for providers that do not support list content, "
+            "like HuggingFace and Groq."
+        ),
+    )
+    send_reasoning_content: bool = Field(
+        default=False,
+        description=(
+            "Whether to include the full reasoning content when sending to the LLM. "
+            "Useful for models that support extended reasoning, like Kimi-K2-thinking."
+        ),
+    )
     # reasoning content (from reasoning models like o1, Claude thinking, DeepSeek R1)
     reasoning_content: str | None = Field(
         default=None,
@@ -279,6 +292,10 @@ def to_chat_dict(self) -> dict[str, Any]:
             message_dict["tool_call_id"] = self.tool_call_id
             message_dict["name"] = self.name
 
+        # Required for models like kimi-k2-thinking
+        if self.send_reasoning_content and self.reasoning_content:
+            message_dict["reasoning_content"] = self.reasoning_content
+
         return message_dict
 
     def _string_serializer(self) -> dict[str, Any]:
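Illustrative sketch (not part of the patch): how the llm.py and message.py hunks above compose. format_messages_for_llm() now copies both flags from get_features() onto each Message, and to_chat_dict() echoes reasoning_content back only when the flag is set and the content is non-empty. The model ID and reasoning text below are placeholders.

    from openhands.sdk.llm.message import Message, TextContent
    from openhands.sdk.llm.utils.model_features import get_features

    features = get_features("moonshot/kimi-k2-thinking")

    msg = Message(
        role="assistant",
        content=[TextContent(text="Final answer")],
        reasoning_content="Intermediate chain of thought...",
    )
    # Same assignments format_messages_for_llm() performs per message:
    msg.force_string_serializer = features.force_string_serializer
    msg.send_reasoning_content = features.send_reasoning_content  # True for kimi-k2-thinking

    payload = msg.to_chat_dict()
    # payload carries "reasoning_content" for kimi-k2-thinking;
    # for other models the key is omitted entirely.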
diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
index ccc1290b37..00056706cf 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -23,6 +23,7 @@ class ModelFeatures:
     supports_stop_words: bool
     supports_responses_api: bool
     force_string_serializer: bool
+    send_reasoning_content: bool
 
 
 # Pattern tables capturing current behavior. Keep patterns lowercase.
@@ -99,6 +100,12 @@ class ModelFeatures:
     "groq/kimi-k2-instruct",  # explicit provider-prefixed IDs
 ]
 
+# Models for which the full reasoning content should be sent
+# back in the message input
+SEND_REASONING_CONTENT_PATTERNS: list[str] = [
+    "kimi-k2-thinking",
+]
+
 
 def get_features(model: str) -> ModelFeatures:
     """Get model features."""
@@ -111,4 +118,23 @@ class ModelFeatures:
         ),
         supports_responses_api=model_matches(model, RESPONSES_API_PATTERNS),
         force_string_serializer=model_matches(model, FORCE_STRING_SERIALIZER_PATTERNS),
+        send_reasoning_content=model_matches(model, SEND_REASONING_CONTENT_PATTERNS),
     )
+
+
+# Default temperature mapping.
+# Each entry: (pattern, default_temperature)
+DEFAULT_TEMPERATURE_PATTERNS: list[tuple[str, float]] = [
+    ("kimi-k2-thinking", 1.0),
+]
+
+
+def get_default_temperature(model: str) -> float:
+    """Return the default temperature for a given model.
+
+    Uses case-insensitive substring matching via model_matches.
+    """
+    for pattern, value in DEFAULT_TEMPERATURE_PATTERNS:
+        if model_matches(model, [pattern]):
+            return value
+    return 0.0
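Illustrative sketch (not part of the patch): get_default_temperature() is a first-match lookup over DEFAULT_TEMPERATURE_PATTERNS, and the _set_env_side_effects hunk earlier resolves an unset temperature through it at validation time. A minimal standalone use, with placeholder model names:

    from openhands.sdk.llm.utils.model_features import get_default_temperature

    assert get_default_temperature("litellm_proxy/kimi-k2-thinking") == 1.0
    assert get_default_temperature("gpt-4o") == 0.0

    # Same fallback the LLM validator applies when temperature is left unset:
    temperature = None  # the field's new default
    model = "kimi-k2-thinking"
    if temperature is None:
        temperature = get_default_temperature(model)  # 1.0 here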
+ """ + for pattern, value in DEFAULT_TEMPERATURE_PATTERNS: + if model_matches(model, [pattern]): + return value + return 0.0 diff --git a/openhands-sdk/openhands/sdk/llm/utils/telemetry.py b/openhands-sdk/openhands/sdk/llm/utils/telemetry.py index 7f0707602f..2e6b1ac785 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/telemetry.py +++ b/openhands-sdk/openhands/sdk/llm/utils/telemetry.py @@ -149,6 +149,10 @@ def _record_usage( if p_details is not None: cache_read = int(getattr(p_details, "cached_tokens", 0) or 0) + # Kimi-K2-thinking populate usage.cached_tokens field + if not cache_read and hasattr(usage, "cached_tokens"): + cache_read = int(getattr(usage, "cached_tokens", 0) or 0) + reasoning_tokens = 0 c_details = getattr(usage, "completion_tokens_details", None) or getattr( usage, "output_tokens_details", None diff --git a/openhands-sdk/openhands/sdk/llm/utils/verified_models.py b/openhands-sdk/openhands/sdk/llm/utils/verified_models.py index c539f36026..b1b53eb1e4 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/verified_models.py +++ b/openhands-sdk/openhands/sdk/llm/utils/verified_models.py @@ -37,19 +37,13 @@ VERIFIED_OPENHANDS_MODELS = [ "claude-sonnet-4-5-20250929", "claude-haiku-4-5-20251001", - "gpt-5-codex", "gpt-5-2025-08-07", + "gpt-5-codex", + "kimi-k2-thinking", "gpt-5-mini-2025-08-07", - "claude-sonnet-4-20250514", - "claude-opus-4-20250514", "claude-opus-4-1-20250805", "devstral-small-2507", "devstral-medium-2507", - "o3", - "o4-mini", - "gemini-2.5-pro", - "kimi-k2-0711-preview", - "qwen3-coder-480b", ] diff --git a/tests/sdk/context/condenser/test_llm_summarizing_condenser.py b/tests/sdk/context/condenser/test_llm_summarizing_condenser.py index 8b8f344b05..18e1c02afa 100644 --- a/tests/sdk/context/condenser/test_llm_summarizing_condenser.py +++ b/tests/sdk/context/condenser/test_llm_summarizing_condenser.py @@ -65,6 +65,7 @@ def create_completion_result(content: str) -> LLMResponse: mock_llm.base_url = None mock_llm.reasoning_effort = None mock_llm.litellm_extra_body = {} + mock_llm.temperature = 0.0 # Explicitly set pricing attributes required by LLM -> Telemetry wiring mock_llm.input_cost_per_token = None diff --git a/tests/sdk/llm/test_llm_no_response_retry.py b/tests/sdk/llm/test_llm_no_response_retry.py index a14e6ecaea..c8f5809554 100644 --- a/tests/sdk/llm/test_llm_no_response_retry.py +++ b/tests/sdk/llm/test_llm_no_response_retry.py @@ -48,6 +48,7 @@ def base_llm() -> LLM: num_retries=2, retry_min_wait=1, retry_max_wait=2, + temperature=0.0, # Explicitly set to test temperature bump behavior ) diff --git a/tests/sdk/llm/test_message.py b/tests/sdk/llm/test_message.py index 2cdb1cd4ec..d87e1e924c 100644 --- a/tests/sdk/llm/test_message.py +++ b/tests/sdk/llm/test_message.py @@ -267,3 +267,106 @@ def test_text_content_truncation_exact_limit(): # Check that text was not truncated assert len(result) == 1 assert result[0]["text"] == exact_text + + +def test_message_with_reasoning_content_when_enabled(): + """Test that reasoning_content is included when send_reasoning_content is True.""" + from openhands.sdk.llm.message import Message, TextContent + + message = Message( + role="assistant", + content=[TextContent(text="Final answer")], + reasoning_content="Let me think step by step...", + send_reasoning_content=True, + ) + + result = message.to_chat_dict() + assert result["role"] == "assistant" + assert result["content"] == "Final answer" + assert result["reasoning_content"] == "Let me think step by step..." 
diff --git a/openhands-sdk/openhands/sdk/llm/utils/verified_models.py b/openhands-sdk/openhands/sdk/llm/utils/verified_models.py
index c539f36026..b1b53eb1e4 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/verified_models.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/verified_models.py
@@ -37,19 +37,13 @@ VERIFIED_OPENHANDS_MODELS = [
     "claude-sonnet-4-5-20250929",
     "claude-haiku-4-5-20251001",
-    "gpt-5-codex",
     "gpt-5-2025-08-07",
+    "gpt-5-codex",
+    "kimi-k2-thinking",
     "gpt-5-mini-2025-08-07",
-    "claude-sonnet-4-20250514",
-    "claude-opus-4-20250514",
     "claude-opus-4-1-20250805",
     "devstral-small-2507",
     "devstral-medium-2507",
-    "o3",
-    "o4-mini",
-    "gemini-2.5-pro",
-    "kimi-k2-0711-preview",
-    "qwen3-coder-480b",
 ]
 
 
diff --git a/tests/sdk/context/condenser/test_llm_summarizing_condenser.py b/tests/sdk/context/condenser/test_llm_summarizing_condenser.py
index 8b8f344b05..18e1c02afa 100644
--- a/tests/sdk/context/condenser/test_llm_summarizing_condenser.py
+++ b/tests/sdk/context/condenser/test_llm_summarizing_condenser.py
@@ -65,6 +65,7 @@ def create_completion_result(content: str) -> LLMResponse:
     mock_llm.base_url = None
     mock_llm.reasoning_effort = None
    mock_llm.litellm_extra_body = {}
+    mock_llm.temperature = 0.0
 
     # Explicitly set pricing attributes required by LLM -> Telemetry wiring
     mock_llm.input_cost_per_token = None
diff --git a/tests/sdk/llm/test_llm_no_response_retry.py b/tests/sdk/llm/test_llm_no_response_retry.py
index a14e6ecaea..c8f5809554 100644
--- a/tests/sdk/llm/test_llm_no_response_retry.py
+++ b/tests/sdk/llm/test_llm_no_response_retry.py
@@ -48,6 +48,7 @@ def base_llm() -> LLM:
         num_retries=2,
         retry_min_wait=1,
         retry_max_wait=2,
+        temperature=0.0,  # Explicitly set to test temperature bump behavior
     )
 
 
diff --git a/tests/sdk/llm/test_message.py b/tests/sdk/llm/test_message.py
index 2cdb1cd4ec..d87e1e924c 100644
--- a/tests/sdk/llm/test_message.py
+++ b/tests/sdk/llm/test_message.py
@@ -267,3 +267,106 @@ def test_text_content_truncation_exact_limit():
     # Check that text was not truncated
     assert len(result) == 1
     assert result[0]["text"] == exact_text
+
+
+def test_message_with_reasoning_content_when_enabled():
+    """Test that reasoning_content is included when send_reasoning_content is True."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Let me think step by step...",
+        send_reasoning_content=True,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert result["reasoning_content"] == "Let me think step by step..."
+
+
+def test_message_with_reasoning_content_when_disabled():
+    """Test that reasoning_content is NOT included when send_reasoning_content is False."""  # noqa: E501
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Let me think step by step...",
+        send_reasoning_content=False,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_default_disabled():
+    """Test that reasoning_content is NOT included by default."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Let me think step by step...",
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_none():
+    """Test that reasoning_content is NOT included when it's None even if enabled."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content=None,
+        send_reasoning_content=True,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_empty_string():
+    """Test that reasoning_content is NOT included when it's an empty string."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="",
+        send_reasoning_content=True,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_list_serializer():
+    """Test that reasoning_content works with list serializer."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Step by step reasoning",
+        send_reasoning_content=True,
+        function_calling_enabled=True,  # Forces list serializer
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert isinstance(result["content"], list)
+    assert result["content"][0]["text"] == "Final answer"
+    assert result["reasoning_content"] == "Step by step reasoning"
diff --git a/tests/sdk/llm/test_model_features.py b/tests/sdk/llm/test_model_features.py
index 7a6c424c96..6f17e205f3 100644
--- a/tests/sdk/llm/test_model_features.py
+++ b/tests/sdk/llm/test_model_features.py
@@ -1,6 +1,7 @@
 import pytest
 
 from openhands.sdk.llm.utils.model_features import (
+    get_default_temperature,
     get_features,
     model_matches,
 )
@@ -240,3 +241,71 @@ def test_force_string_serializer_full_model_names():
     assert get_features("Kimi K2-Instruct-0905").force_string_serializer is False
     # Groq-prefixed Kimi should force string serializer
     assert get_features("groq/kimi-k2-instruct-0905").force_string_serializer is True
+
+
+@pytest.mark.parametrize(
+    "model,expected_send_reasoning",
+    [
+        ("kimi-k2-thinking", True),
+        ("kimi-k2-thinking-0905", True),
+        ("Kimi-K2-Thinking", True),  # Case insensitive
+        ("moonshot/kimi-k2-thinking", True),  # With provider prefix
+        ("kimi-k2-instruct", False),  # Different variant
+        ("gpt-4o", False),
+        ("claude-3-5-sonnet", False),
+        ("o1", False),
+        ("unknown-model", False),
+    ],
+)
+def test_send_reasoning_content_support(model, expected_send_reasoning):
+    """Test that models like kimi-k2-thinking require send_reasoning_content."""
+    features = get_features(model)
+    assert features.send_reasoning_content is expected_send_reasoning
+
+
+@pytest.mark.parametrize(
+    "model,expected_temperature",
+    [
+        # kimi-k2-thinking models should default to 1.0
+        ("kimi-k2-thinking", 1.0),
+        ("kimi-k2-thinking-0905", 1.0),
+        ("Kimi-K2-Thinking", 1.0),  # Case insensitive
+        ("moonshot/kimi-k2-thinking", 1.0),  # With provider prefix
+        ("litellm_proxy/kimi-k2-thinking", 1.0),  # With litellm proxy prefix
+        # All other models should default to 0.0
+        ("kimi-k2-instruct", 0.0),  # Different kimi variant
+        ("gpt-4", 0.0),
+        ("gpt-4o", 0.0),
+        ("gpt-4o-mini", 0.0),
+        ("claude-3-5-sonnet", 0.0),
+        ("claude-3-7-sonnet", 0.0),
+        ("gemini-1.5-pro", 0.0),
+        ("gemini-2.5-pro-experimental", 0.0),
+        ("o1", 0.0),
+        ("o1-mini", 0.0),
+        ("o3", 0.0),
+        ("deepseek-chat", 0.0),
+        ("llama-3.1-70b", 0.0),
+        ("azure/gpt-4", 0.0),
+        ("openai/gpt-4o", 0.0),
+        ("anthropic/claude-3-5-sonnet", 0.0),
+        ("unknown-model", 0.0),
+    ],
+)
+def test_get_default_temperature(model, expected_temperature):
+    """Test that get_default_temperature returns correct values for different models."""
+    assert get_default_temperature(model) == expected_temperature
+
+
+def test_get_default_temperature_fallback():
+    """Test that get_default_temperature returns 0.0 for unknown models."""
+    assert get_default_temperature("completely-unknown-model-12345") == 0.0
+    assert get_default_temperature("some-random-model") == 0.0
+
+
+def test_get_default_temperature_case_insensitive():
+    """Test that get_default_temperature is case insensitive."""
+    assert get_default_temperature("kimi-k2-thinking") == 1.0
+    assert get_default_temperature("KIMI-K2-THINKING") == 1.0
+    assert get_default_temperature("Kimi-K2-Thinking") == 1.0
+    assert get_default_temperature("KiMi-k2-ThInKiNg") == 1.0