
Commit a4f97bd

Support kimi-k2 extended thinking, fix prompt caching stats, fix max output (#1133)
Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 488806e commit a4f97bd

File tree

9 files changed: +248 −17 lines changed

openhands-sdk/openhands/sdk/llm/llm.py

Lines changed: 23 additions & 7 deletions
@@ -73,7 +73,7 @@
 from openhands.sdk.llm.options.chat_options import select_chat_options
 from openhands.sdk.llm.options.responses_options import select_responses_options
 from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot
-from openhands.sdk.llm.utils.model_features import get_features
+from openhands.sdk.llm.utils.model_features import get_default_temperature, get_features
 from openhands.sdk.llm.utils.retry_mixin import RetryMixin
 from openhands.sdk.llm.utils.telemetry import Telemetry
 from openhands.sdk.logger import ENV_LOG_DIR, get_logger
@@ -149,7 +149,14 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
         description="Approx max chars in each event/content sent to the LLM.",
     )

-    temperature: float | None = Field(default=0.0, ge=0)
+    temperature: float | None = Field(
+        default=None,
+        ge=0,
+        description=(
+            "Sampling temperature for response generation. "
+            "Defaults to 0 for most models and provider default for reasoning models."
+        ),
+    )
     top_p: float | None = Field(default=1.0, ge=0, le=1)
     top_k: float | None = Field(default=None, ge=0)

@@ -375,9 +382,13 @@ def _set_env_side_effects(self):
         # Capabilities + model info
         self._init_model_info_and_caps()

+        if self.temperature is None:
+            self.temperature = get_default_temperature(self.model)
+
         logger.debug(
             f"LLM ready: model={self.model} base_url={self.base_url} "
-            f"reasoning_effort={self.reasoning_effort}"
+            f"reasoning_effort={self.reasoning_effort} "
+            f"temperature={self.temperature}"
         )
         return self

@@ -826,7 +837,12 @@ def _init_model_info_and_caps(self) -> None:
         if self.max_output_tokens is None:
             if any(
                 m in self.model
-                for m in ["claude-3-7-sonnet", "claude-3.7-sonnet", "claude-sonnet-4"]
+                for m in [
+                    "claude-3-7-sonnet",
+                    "claude-3.7-sonnet",
+                    "claude-sonnet-4",
+                    "kimi-k2-thinking",
+                ]
             ):
                 self.max_output_tokens = (
                     64000  # practical cap (litellm may allow 128k with header)
@@ -932,9 +948,9 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]:
             message.cache_enabled = self.is_caching_prompt_active()
             message.vision_enabled = self.vision_is_active()
             message.function_calling_enabled = self.native_tool_calling
-            message.force_string_serializer = get_features(
-                self.model
-            ).force_string_serializer
+            model_features = get_features(self.model)
+            message.force_string_serializer = model_features.force_string_serializer
+            message.send_reasoning_content = model_features.send_reasoning_content

         formatted_messages = [message.to_chat_dict() for message in messages]
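In short, temperature now defaults to None and is resolved once the model is known. A minimal sketch of that resolution order, using only the helper added in model_features.py below (assumes the openhands-sdk package is importable):

from openhands.sdk.llm.utils.model_features import get_default_temperature

# Mirrors the new fallback in _set_env_side_effects: an unset temperature
# (None) is filled from the per-model default table.
temperature = None  # field default when the caller does not pass one
if temperature is None:
    temperature = get_default_temperature("kimi-k2-thinking")
assert temperature == 1.0  # reasoning models keep the provider-style default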

openhands-sdk/openhands/sdk/llm/message.py

Lines changed: 19 additions & 2 deletions
@@ -217,8 +217,21 @@ class Message(BaseModel):
     # - tool execution result (to LLM)
     tool_call_id: str | None = None
     name: str | None = None  # name of the tool
-    # force string serializer
-    force_string_serializer: bool = False
+    force_string_serializer: bool = Field(
+        default=False,
+        description=(
+            "Force using string content serializer when sending to LLM API. "
+            "Useful for providers that do not support list content, "
+            "like HuggingFace and Groq."
+        ),
+    )
+    send_reasoning_content: bool = Field(
+        default=False,
+        description=(
+            "Whether to include the full reasoning content when sending to the LLM. "
+            "Useful for models that support extended reasoning, like Kimi-K2-thinking."
+        ),
+    )
     # reasoning content (from reasoning models like o1, Claude thinking, DeepSeek R1)
     reasoning_content: str | None = Field(
         default=None,
@@ -279,6 +292,10 @@ def to_chat_dict(self) -> dict[str, Any]:
             message_dict["tool_call_id"] = self.tool_call_id
             message_dict["name"] = self.name

+        # Required for models like kimi-k2-thinking
+        if self.send_reasoning_content and self.reasoning_content:
+            message_dict["reasoning_content"] = self.reasoning_content
+
         return message_dict

     def _string_serializer(self) -> dict[str, Any]:
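The gating in to_chat_dict means reasoning content is echoed back only when both the flag and a non-empty value are present. A minimal usage sketch, mirroring the new tests in test_message.py below:

from openhands.sdk.llm.message import Message, TextContent

msg = Message(
    role="assistant",
    content=[TextContent(text="Final answer")],
    reasoning_content="Let me think step by step...",
    send_reasoning_content=True,
)

# With the flag set and non-empty reasoning, the chat dict carries it along.
assert msg.to_chat_dict()["reasoning_content"] == "Let me think step by step..."

# Default (flag off) keeps the wire format unchanged.
assert "reasoning_content" not in Message(
    role="assistant", content=[TextContent(text="Final answer")]
).to_chat_dict()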

openhands-sdk/openhands/sdk/llm/utils/model_features.py

Lines changed: 26 additions & 0 deletions
@@ -23,6 +23,7 @@ class ModelFeatures:
     supports_stop_words: bool
     supports_responses_api: bool
     force_string_serializer: bool
+    send_reasoning_content: bool


 # Pattern tables capturing current behavior. Keep patterns lowercase.
@@ -99,6 +100,12 @@
     "groq/kimi-k2-instruct",  # explicit provider-prefixed IDs
 ]

+# Models for which we should send the full reasoning content
+# back in the message input
+SEND_REASONING_CONTENT_PATTERNS: list[str] = [
+    "kimi-k2-thinking",
+]
+

 def get_features(model: str) -> ModelFeatures:
     """Get model features."""
@@ -111,4 +118,23 @@ def get_features(model: str) -> ModelFeatures:
         ),
         supports_responses_api=model_matches(model, RESPONSES_API_PATTERNS),
         force_string_serializer=model_matches(model, FORCE_STRING_SERIALIZER_PATTERNS),
+        send_reasoning_content=model_matches(model, SEND_REASONING_CONTENT_PATTERNS),
     )
+
+
+# Default temperature mapping.
+# Each entry: (pattern, default_temperature)
+DEFAULT_TEMPERATURE_PATTERNS: list[tuple[str, float]] = [
+    ("kimi-k2-thinking", 1.0),
+]
+
+
+def get_default_temperature(model: str) -> float:
+    """Return the default temperature for a given model pattern.
+
+    Uses case-insensitive substring matching via model_matches.
+    """
+    for pattern, value in DEFAULT_TEMPERATURE_PATTERNS:
+        if model_matches(model, [pattern]):
+            return value
+    return 0.0
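Both helpers reuse the same case-insensitive substring matching as the existing feature tables, so provider prefixes are handled for free. A quick sketch of the behavior pinned down by the tests below:

from openhands.sdk.llm.utils.model_features import get_default_temperature, get_features

# Pattern matching is case-insensitive and tolerant of provider prefixes.
assert get_default_temperature("moonshot/kimi-k2-thinking") == 1.0
assert get_default_temperature("KIMI-K2-THINKING") == 1.0
assert get_default_temperature("gpt-4o") == 0.0  # fallback for everything else

assert get_features("kimi-k2-thinking").send_reasoning_content is True
assert get_features("kimi-k2-instruct").send_reasoning_content is False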

openhands-sdk/openhands/sdk/llm/utils/telemetry.py

Lines changed: 4 additions & 0 deletions
@@ -149,6 +149,10 @@ def _record_usage(
         if p_details is not None:
             cache_read = int(getattr(p_details, "cached_tokens", 0) or 0)

+        # Kimi-K2-thinking populates the usage.cached_tokens field
+        if not cache_read and hasattr(usage, "cached_tokens"):
+            cache_read = int(getattr(usage, "cached_tokens", 0) or 0)
+
         reasoning_tokens = 0
         c_details = getattr(usage, "completion_tokens_details", None) or getattr(
             usage, "output_tokens_details", None

openhands-sdk/openhands/sdk/llm/utils/verified_models.py

Lines changed: 2 additions & 8 deletions
@@ -37,19 +37,13 @@
 VERIFIED_OPENHANDS_MODELS = [
     "claude-sonnet-4-5-20250929",
     "claude-haiku-4-5-20251001",
-    "gpt-5-codex",
     "gpt-5-2025-08-07",
+    "gpt-5-codex",
+    "kimi-k2-thinking",
     "gpt-5-mini-2025-08-07",
-    "claude-sonnet-4-20250514",
-    "claude-opus-4-20250514",
     "claude-opus-4-1-20250805",
     "devstral-small-2507",
     "devstral-medium-2507",
-    "o3",
-    "o4-mini",
-    "gemini-2.5-pro",
-    "kimi-k2-0711-preview",
-    "qwen3-coder-480b",
 ]

tests/sdk/context/condenser/test_llm_summarizing_condenser.py

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@ def create_completion_result(content: str) -> LLMResponse:
     mock_llm.base_url = None
     mock_llm.reasoning_effort = None
     mock_llm.litellm_extra_body = {}
+    mock_llm.temperature = 0.0

     # Explicitly set pricing attributes required by LLM -> Telemetry wiring
     mock_llm.input_cost_per_token = None

tests/sdk/llm/test_llm_no_response_retry.py

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ def base_llm() -> LLM:
         num_retries=2,
         retry_min_wait=1,
         retry_max_wait=2,
+        temperature=0.0,  # Explicitly set to test temperature bump behavior
     )

tests/sdk/llm/test_message.py

Lines changed: 103 additions & 0 deletions
@@ -267,3 +267,106 @@ def test_text_content_truncation_exact_limit():
     # Check that text was not truncated
     assert len(result) == 1
     assert result[0]["text"] == exact_text
+
+
+def test_message_with_reasoning_content_when_enabled():
+    """Test that reasoning_content is included when send_reasoning_content is True."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Let me think step by step...",
+        send_reasoning_content=True,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert result["reasoning_content"] == "Let me think step by step..."
+
+
+def test_message_with_reasoning_content_when_disabled():
+    """Test that reasoning_content is NOT included when send_reasoning_content is False."""  # noqa: E501
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Let me think step by step...",
+        send_reasoning_content=False,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_default_disabled():
+    """Test that reasoning_content is NOT included by default."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Let me think step by step...",
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_none():
+    """Test that reasoning_content is NOT included when it's None even if enabled."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content=None,
+        send_reasoning_content=True,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_empty_string():
+    """Test that reasoning_content is NOT included when it's an empty string."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="",
+        send_reasoning_content=True,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_list_serializer():
+    """Test that reasoning_content works with list serializer."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Step by step reasoning",
+        send_reasoning_content=True,
+        function_calling_enabled=True,  # Forces list serializer
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert isinstance(result["content"], list)
+    assert result["content"][0]["text"] == "Final answer"
+    assert result["reasoning_content"] == "Step by step reasoning"

tests/sdk/llm/test_model_features.py

Lines changed: 69 additions & 0 deletions
@@ -1,6 +1,7 @@
 import pytest

 from openhands.sdk.llm.utils.model_features import (
+    get_default_temperature,
     get_features,
     model_matches,
 )
@@ -240,3 +241,71 @@ def test_force_string_serializer_full_model_names():
     assert get_features("Kimi K2-Instruct-0905").force_string_serializer is False
     # Groq-prefixed Kimi should force string serializer
     assert get_features("groq/kimi-k2-instruct-0905").force_string_serializer is True
+
+
+@pytest.mark.parametrize(
+    "model,expected_send_reasoning",
+    [
+        ("kimi-k2-thinking", True),
+        ("kimi-k2-thinking-0905", True),
+        ("Kimi-K2-Thinking", True),  # Case insensitive
+        ("moonshot/kimi-k2-thinking", True),  # With provider prefix
+        ("kimi-k2-instruct", False),  # Different variant
+        ("gpt-4o", False),
+        ("claude-3-5-sonnet", False),
+        ("o1", False),
+        ("unknown-model", False),
+    ],
+)
+def test_send_reasoning_content_support(model, expected_send_reasoning):
+    """Test that models like kimi-k2-thinking require send_reasoning_content."""
+    features = get_features(model)
+    assert features.send_reasoning_content is expected_send_reasoning
+
+
+@pytest.mark.parametrize(
+    "model,expected_temperature",
+    [
+        # kimi-k2-thinking models should default to 1.0
+        ("kimi-k2-thinking", 1.0),
+        ("kimi-k2-thinking-0905", 1.0),
+        ("Kimi-K2-Thinking", 1.0),  # Case insensitive
+        ("moonshot/kimi-k2-thinking", 1.0),  # With provider prefix
+        ("litellm_proxy/kimi-k2-thinking", 1.0),  # With litellm proxy prefix
+        # All other models should default to 0.0
+        ("kimi-k2-instruct", 0.0),  # Different kimi variant
+        ("gpt-4", 0.0),
+        ("gpt-4o", 0.0),
+        ("gpt-4o-mini", 0.0),
+        ("claude-3-5-sonnet", 0.0),
+        ("claude-3-7-sonnet", 0.0),
+        ("gemini-1.5-pro", 0.0),
+        ("gemini-2.5-pro-experimental", 0.0),
+        ("o1", 0.0),
+        ("o1-mini", 0.0),
+        ("o3", 0.0),
+        ("deepseek-chat", 0.0),
+        ("llama-3.1-70b", 0.0),
+        ("azure/gpt-4", 0.0),
+        ("openai/gpt-4o", 0.0),
+        ("anthropic/claude-3-5-sonnet", 0.0),
+        ("unknown-model", 0.0),
+    ],
+)
+def test_get_default_temperature(model, expected_temperature):
+    """Test that get_default_temperature returns correct values for different models."""
+    assert get_default_temperature(model) == expected_temperature
+
+
+def test_get_default_temperature_fallback():
+    """Test that get_default_temperature returns 0.0 for unknown models."""
+    assert get_default_temperature("completely-unknown-model-12345") == 0.0
+    assert get_default_temperature("some-random-model") == 0.0
+
+
+def test_get_default_temperature_case_insensitive():
+    """Test that get_default_temperature is case insensitive."""
+    assert get_default_temperature("kimi-k2-thinking") == 1.0
+    assert get_default_temperature("KIMI-K2-THINKING") == 1.0
+    assert get_default_temperature("Kimi-K2-Thinking") == 1.0
+    assert get_default_temperature("KiMi-k2-ThInKiNg") == 1.0