
Commit a4f97bd

Support kimi-k2 extended thinking, fix prompt caching stats, fix max output (#1133)
Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 488806e commit a4f97bd

File tree

9 files changed: +248 −17 lines changed

openhands-sdk/openhands/sdk/llm/llm.py

Lines changed: 23 additions & 7 deletions
@@ -73,7 +73,7 @@
 from openhands.sdk.llm.options.chat_options import select_chat_options
 from openhands.sdk.llm.options.responses_options import select_responses_options
 from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot
-from openhands.sdk.llm.utils.model_features import get_features
+from openhands.sdk.llm.utils.model_features import get_default_temperature, get_features
 from openhands.sdk.llm.utils.retry_mixin import RetryMixin
 from openhands.sdk.llm.utils.telemetry import Telemetry
 from openhands.sdk.logger import ENV_LOG_DIR, get_logger
@@ -149,7 +149,14 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
         description="Approx max chars in each event/content sent to the LLM.",
     )

-    temperature: float | None = Field(default=0.0, ge=0)
+    temperature: float | None = Field(
+        default=None,
+        ge=0,
+        description=(
+            "Sampling temperature for response generation. "
+            "Defaults to 0 for most models and provider default for reasoning models."
+        ),
+    )
     top_p: float | None = Field(default=1.0, ge=0, le=1)
     top_k: float | None = Field(default=None, ge=0)

@@ -375,9 +382,13 @@ def _set_env_side_effects(self):
         # Capabilities + model info
         self._init_model_info_and_caps()

+        if self.temperature is None:
+            self.temperature = get_default_temperature(self.model)
+
         logger.debug(
             f"LLM ready: model={self.model} base_url={self.base_url} "
-            f"reasoning_effort={self.reasoning_effort}"
+            f"reasoning_effort={self.reasoning_effort} "
+            f"temperature={self.temperature}"
         )
         return self

@@ -826,7 +837,12 @@ def _init_model_info_and_caps(self) -> None:
         if self.max_output_tokens is None:
             if any(
                 m in self.model
-                for m in ["claude-3-7-sonnet", "claude-3.7-sonnet", "claude-sonnet-4"]
+                for m in [
+                    "claude-3-7-sonnet",
+                    "claude-3.7-sonnet",
+                    "claude-sonnet-4",
+                    "kimi-k2-thinking",
+                ]
             ):
                 self.max_output_tokens = (
                     64000  # practical cap (litellm may allow 128k with header)
@@ -932,9 +948,9 @@ def format_messages_for_llm(self, messages: list[Message]) -> list[dict]:
             message.cache_enabled = self.is_caching_prompt_active()
             message.vision_enabled = self.vision_is_active()
             message.function_calling_enabled = self.native_tool_calling
-            message.force_string_serializer = get_features(
-                self.model
-            ).force_string_serializer
+            model_features = get_features(self.model)
+            message.force_string_serializer = model_features.force_string_serializer
+            message.send_reasoning_content = model_features.send_reasoning_content

         formatted_messages = [message.to_chat_dict() for message in messages]
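In short, temperature now defaults to None and is resolved once the model is known. A minimal sketch of that resolution order, using only the helper added in model_features.py below (assumes the openhands-sdk package is importable):

from openhands.sdk.llm.utils.model_features import get_default_temperature

# Mirrors the new fallback in _set_env_side_effects: an unset temperature
# (None) is filled from the per-model default table.
temperature = None  # field default when the caller does not pass one
if temperature is None:
    temperature = get_default_temperature("kimi-k2-thinking")
assert temperature == 1.0  # reasoning models keep the provider-style default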

openhands-sdk/openhands/sdk/llm/message.py

Lines changed: 19 additions & 2 deletions
@@ -217,8 +217,21 @@ class Message(BaseModel):
     # - tool execution result (to LLM)
     tool_call_id: str | None = None
     name: str | None = None  # name of the tool
-    # force string serializer
-    force_string_serializer: bool = False
+    force_string_serializer: bool = Field(
+        default=False,
+        description=(
+            "Force using string content serializer when sending to LLM API. "
+            "Useful for providers that do not support list content, "
+            "like HuggingFace and Groq."
+        ),
+    )
+    send_reasoning_content: bool = Field(
+        default=False,
+        description=(
+            "Whether to include the full reasoning content when sending to the LLM. "
+            "Useful for models that support extended reasoning, like Kimi-K2-thinking."
+        ),
+    )
     # reasoning content (from reasoning models like o1, Claude thinking, DeepSeek R1)
     reasoning_content: str | None = Field(
         default=None,
@@ -279,6 +292,10 @@ def to_chat_dict(self) -> dict[str, Any]:
             message_dict["tool_call_id"] = self.tool_call_id
             message_dict["name"] = self.name

+        # Required for models like kimi-k2-thinking
+        if self.send_reasoning_content and self.reasoning_content:
+            message_dict["reasoning_content"] = self.reasoning_content
+
         return message_dict

     def _string_serializer(self) -> dict[str, Any]:
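The gating in to_chat_dict means reasoning content is echoed back only when both the flag and a non-empty value are present. A minimal usage sketch, mirroring the new tests in test_message.py below:

from openhands.sdk.llm.message import Message, TextContent

msg = Message(
    role="assistant",
    content=[TextContent(text="Final answer")],
    reasoning_content="Let me think step by step...",
    send_reasoning_content=True,
)

# With the flag set and non-empty reasoning, the chat dict carries it along.
assert msg.to_chat_dict()["reasoning_content"] == "Let me think step by step..."

# Default (flag off) keeps the wire format unchanged.
assert "reasoning_content" not in Message(
    role="assistant", content=[TextContent(text="Final answer")]
).to_chat_dict()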

openhands-sdk/openhands/sdk/llm/utils/model_features.py

Lines changed: 26 additions & 0 deletions
@@ -23,6 +23,7 @@ class ModelFeatures:
     supports_stop_words: bool
     supports_responses_api: bool
     force_string_serializer: bool
+    send_reasoning_content: bool


 # Pattern tables capturing current behavior. Keep patterns lowercase.
@@ -99,6 +100,12 @@
     "groq/kimi-k2-instruct",  # explicit provider-prefixed IDs
 ]

+# Models for which we should send the full reasoning content
+# back in the message input
+SEND_REASONING_CONTENT_PATTERNS: list[str] = [
+    "kimi-k2-thinking",
+]
+

 def get_features(model: str) -> ModelFeatures:
     """Get model features."""
@@ -111,4 +118,23 @@ def get_features(model: str) -> ModelFeatures:
         ),
         supports_responses_api=model_matches(model, RESPONSES_API_PATTERNS),
         force_string_serializer=model_matches(model, FORCE_STRING_SERIALIZER_PATTERNS),
+        send_reasoning_content=model_matches(model, SEND_REASONING_CONTENT_PATTERNS),
     )
+
+
+# Default temperature mapping.
+# Each entry: (pattern, default_temperature)
+DEFAULT_TEMPERATURE_PATTERNS: list[tuple[str, float]] = [
+    ("kimi-k2-thinking", 1.0),
+]
+
+
+def get_default_temperature(model: str) -> float:
+    """Return the default temperature for a given model pattern.
+
+    Uses case-insensitive substring matching via model_matches.
+    """
+    for pattern, value in DEFAULT_TEMPERATURE_PATTERNS:
+        if model_matches(model, [pattern]):
+            return value
+    return 0.0
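Both helpers reuse the same case-insensitive substring matching as the existing feature tables, so provider prefixes are handled for free. A quick sketch of the behavior pinned down by the tests below:

from openhands.sdk.llm.utils.model_features import get_default_temperature, get_features

# Pattern matching is case-insensitive and tolerant of provider prefixes.
assert get_default_temperature("moonshot/kimi-k2-thinking") == 1.0
assert get_default_temperature("KIMI-K2-THINKING") == 1.0
assert get_default_temperature("gpt-4o") == 0.0  # fallback for everything else

assert get_features("kimi-k2-thinking").send_reasoning_content is True
assert get_features("kimi-k2-instruct").send_reasoning_content is False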

openhands-sdk/openhands/sdk/llm/utils/telemetry.py

Lines changed: 4 additions & 0 deletions
@@ -149,6 +149,10 @@ def _record_usage(
         if p_details is not None:
             cache_read = int(getattr(p_details, "cached_tokens", 0) or 0)

+        # Kimi-K2-thinking populates the usage.cached_tokens field
+        if not cache_read and hasattr(usage, "cached_tokens"):
+            cache_read = int(getattr(usage, "cached_tokens", 0) or 0)
+
         reasoning_tokens = 0
         c_details = getattr(usage, "completion_tokens_details", None) or getattr(
             usage, "output_tokens_details", None

openhands-sdk/openhands/sdk/llm/utils/verified_models.py

Lines changed: 2 additions & 8 deletions
@@ -37,19 +37,13 @@
 VERIFIED_OPENHANDS_MODELS = [
     "claude-sonnet-4-5-20250929",
     "claude-haiku-4-5-20251001",
-    "gpt-5-codex",
     "gpt-5-2025-08-07",
+    "gpt-5-codex",
+    "kimi-k2-thinking",
     "gpt-5-mini-2025-08-07",
-    "claude-sonnet-4-20250514",
-    "claude-opus-4-20250514",
     "claude-opus-4-1-20250805",
     "devstral-small-2507",
     "devstral-medium-2507",
-    "o3",
-    "o4-mini",
-    "gemini-2.5-pro",
-    "kimi-k2-0711-preview",
-    "qwen3-coder-480b",
 ]

tests/sdk/context/condenser/test_llm_summarizing_condenser.py

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@ def create_completion_result(content: str) -> LLMResponse:
     mock_llm.base_url = None
     mock_llm.reasoning_effort = None
     mock_llm.litellm_extra_body = {}
+    mock_llm.temperature = 0.0

     # Explicitly set pricing attributes required by LLM -> Telemetry wiring
     mock_llm.input_cost_per_token = None

tests/sdk/llm/test_llm_no_response_retry.py

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ def base_llm() -> LLM:
         num_retries=2,
         retry_min_wait=1,
         retry_max_wait=2,
+        temperature=0.0,  # Explicitly set to test temperature bump behavior
     )

tests/sdk/llm/test_message.py

Lines changed: 103 additions & 0 deletions
@@ -267,3 +267,106 @@ def test_text_content_truncation_exact_limit():
     # Check that text was not truncated
     assert len(result) == 1
     assert result[0]["text"] == exact_text
+
+
+def test_message_with_reasoning_content_when_enabled():
+    """Test that reasoning_content is included when send_reasoning_content is True."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Let me think step by step...",
+        send_reasoning_content=True,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert result["reasoning_content"] == "Let me think step by step..."
+
+
+def test_message_with_reasoning_content_when_disabled():
+    """Test that reasoning_content is NOT included when send_reasoning_content is False."""  # noqa: E501
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Let me think step by step...",
+        send_reasoning_content=False,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_default_disabled():
+    """Test that reasoning_content is NOT included by default."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Let me think step by step...",
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_none():
+    """Test that reasoning_content is NOT included when it's None even if enabled."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content=None,
+        send_reasoning_content=True,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_empty_string():
+    """Test that reasoning_content is NOT included when it's an empty string."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="",
+        send_reasoning_content=True,
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert result["content"] == "Final answer"
+    assert "reasoning_content" not in result
+
+
+def test_message_with_reasoning_content_list_serializer():
+    """Test that reasoning_content works with list serializer."""
+    from openhands.sdk.llm.message import Message, TextContent
+
+    message = Message(
+        role="assistant",
+        content=[TextContent(text="Final answer")],
+        reasoning_content="Step by step reasoning",
+        send_reasoning_content=True,
+        function_calling_enabled=True,  # Forces list serializer
+    )
+
+    result = message.to_chat_dict()
+    assert result["role"] == "assistant"
+    assert isinstance(result["content"], list)
+    assert result["content"][0]["text"] == "Final answer"
+    assert result["reasoning_content"] == "Step by step reasoning"

tests/sdk/llm/test_model_features.py

Lines changed: 69 additions & 0 deletions
@@ -1,6 +1,7 @@
 import pytest

 from openhands.sdk.llm.utils.model_features import (
+    get_default_temperature,
     get_features,
     model_matches,
 )
@@ -240,3 +241,71 @@ def test_force_string_serializer_full_model_names():
     assert get_features("Kimi K2-Instruct-0905").force_string_serializer is False
     # Groq-prefixed Kimi should force string serializer
     assert get_features("groq/kimi-k2-instruct-0905").force_string_serializer is True
+
+
+@pytest.mark.parametrize(
+    "model,expected_send_reasoning",
+    [
+        ("kimi-k2-thinking", True),
+        ("kimi-k2-thinking-0905", True),
+        ("Kimi-K2-Thinking", True),  # Case insensitive
+        ("moonshot/kimi-k2-thinking", True),  # With provider prefix
+        ("kimi-k2-instruct", False),  # Different variant
+        ("gpt-4o", False),
+        ("claude-3-5-sonnet", False),
+        ("o1", False),
+        ("unknown-model", False),
+    ],
+)
+def test_send_reasoning_content_support(model, expected_send_reasoning):
+    """Test that models like kimi-k2-thinking require send_reasoning_content."""
+    features = get_features(model)
+    assert features.send_reasoning_content is expected_send_reasoning
+
+
+@pytest.mark.parametrize(
+    "model,expected_temperature",
+    [
+        # kimi-k2-thinking models should default to 1.0
+        ("kimi-k2-thinking", 1.0),
+        ("kimi-k2-thinking-0905", 1.0),
+        ("Kimi-K2-Thinking", 1.0),  # Case insensitive
+        ("moonshot/kimi-k2-thinking", 1.0),  # With provider prefix
+        ("litellm_proxy/kimi-k2-thinking", 1.0),  # With litellm proxy prefix
+        # All other models should default to 0.0
+        ("kimi-k2-instruct", 0.0),  # Different kimi variant
+        ("gpt-4", 0.0),
+        ("gpt-4o", 0.0),
+        ("gpt-4o-mini", 0.0),
+        ("claude-3-5-sonnet", 0.0),
+        ("claude-3-7-sonnet", 0.0),
+        ("gemini-1.5-pro", 0.0),
+        ("gemini-2.5-pro-experimental", 0.0),
+        ("o1", 0.0),
+        ("o1-mini", 0.0),
+        ("o3", 0.0),
+        ("deepseek-chat", 0.0),
+        ("llama-3.1-70b", 0.0),
+        ("azure/gpt-4", 0.0),
+        ("openai/gpt-4o", 0.0),
+        ("anthropic/claude-3-5-sonnet", 0.0),
+        ("unknown-model", 0.0),
+    ],
+)
+def test_get_default_temperature(model, expected_temperature):
+    """Test that get_default_temperature returns correct values for different models."""
+    assert get_default_temperature(model) == expected_temperature
+
+
+def test_get_default_temperature_fallback():
+    """Test that get_default_temperature returns 0.0 for unknown models."""
+    assert get_default_temperature("completely-unknown-model-12345") == 0.0
+    assert get_default_temperature("some-random-model") == 0.0
+
+
+def test_get_default_temperature_case_insensitive():
+    """Test that get_default_temperature is case insensitive."""
+    assert get_default_temperature("kimi-k2-thinking") == 1.0
+    assert get_default_temperature("KIMI-K2-THINKING") == 1.0
+    assert get_default_temperature("Kimi-K2-Thinking") == 1.0
+    assert get_default_temperature("KiMi-k2-ThInKiNg") == 1.0