From 746b72734fee3057595abcc383c1595ca2d96d00 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Mon, 3 Nov 2025 15:47:54 +0100 Subject: [PATCH 01/10] revert stuff --- openhands-sdk/openhands/sdk/llm/llm.py | 14 ++++++++++++++ .../openhands/sdk/llm/utils/model_features.py | 7 ++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index 6fb213cf9b..45410accfb 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -690,6 +690,20 @@ def _litellm_modify_params_ctx(self, flag: bool): # Capabilities, formatting, and info # ========================================================================= def _init_model_info_and_caps(self) -> None: + # Check if this model doesn't support native tool calling + from openhands.sdk.llm.utils.model_features import ( + NO_NATIVE_TOOL_CALLING_PATTERNS, + model_matches, + ) + + if model_matches(self.model, NO_NATIVE_TOOL_CALLING_PATTERNS): + if self.native_tool_calling: + logger.info( + f"Model {self.model} does not support native tool calling. " + f"Setting native_tool_calling=False." + ) + self.native_tool_calling = False + # Try to get model info via openrouter or litellm proxy first tried = False try: diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index 10a104b3de..e96e8b2630 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -91,10 +91,15 @@ class ModelFeatures: # and need plain strings instead FORCE_STRING_SERIALIZER_PATTERNS: list[str] = [ "deepseek", - "glm", "groq/kimi-k2-instruct", ] +# Models that don't support native tool calling +# These models return tool call arguments with arrays/objects as JSON strings +# instead of properly structured data (e.g., "[1, 100]" instead of [1, 100]) +# This causes Pydantic validation errors when parsing tool arguments +NO_NATIVE_TOOL_CALLING_PATTERNS: list[str] = [] + def get_features(model: str) -> ModelFeatures: """Get model features.""" From eecd17688905c6f063c50327e5dfc35e9e18a945 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 4 Nov 2025 10:27:00 +0100 Subject: [PATCH 02/10] add glm4.6 response normalizer --- openhands-sdk/openhands/sdk/llm/llm.py | 21 +- .../sdk/llm/mixins/response_normalizer.py | 143 +++++++++++ .../openhands/sdk/llm/utils/model_features.py | 14 +- tests/sdk/llm/test_response_normalizer.py | 229 ++++++++++++++++++ 4 files changed, 391 insertions(+), 16 deletions(-) create mode 100644 openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py create mode 100644 tests/sdk/llm/test_response_normalizer.py diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index 45410accfb..3703c0e441 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -70,6 +70,7 @@ Message, ) from openhands.sdk.llm.mixins.non_native_fc import NonNativeToolCallingMixin +from openhands.sdk.llm.mixins.response_normalizer import ResponseNormalizerMixin from openhands.sdk.llm.options.chat_options import select_chat_options from openhands.sdk.llm.options.responses_options import select_responses_options from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot @@ -100,7 +101,7 @@ ) -class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): +class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin, ResponseNormalizerMixin): """Refactored LLM: simple `completion()`, centralized Telemetry, tiny helpers.""" # ========================================================================= @@ -495,6 +496,9 @@ def _one_attempt(**retry_kwargs) -> ModelResponse: try: resp = _one_attempt() + # Normalize response for model-specific quirks (via mixin) + resp = self.normalize_chat_completion_response(resp) + # Convert the first choice to an OpenHands Message first_choice = resp["choices"][0] message = Message.from_llm_chat_message(first_choice["message"]) @@ -614,6 +618,9 @@ def _one_attempt(**retry_kwargs) -> ResponsesAPIResponse: try: resp: ResponsesAPIResponse = _one_attempt() + # Normalize response for model-specific quirks (via mixin) + resp = self.normalize_responses_api_response(resp) + # Parse output -> Message (typed) # Cast to a typed sequence # accepted by from_llm_responses_output @@ -691,18 +698,6 @@ def _litellm_modify_params_ctx(self, flag: bool): # ========================================================================= def _init_model_info_and_caps(self) -> None: # Check if this model doesn't support native tool calling - from openhands.sdk.llm.utils.model_features import ( - NO_NATIVE_TOOL_CALLING_PATTERNS, - model_matches, - ) - - if model_matches(self.model, NO_NATIVE_TOOL_CALLING_PATTERNS): - if self.native_tool_calling: - logger.info( - f"Model {self.model} does not support native tool calling. " - f"Setting native_tool_calling=False." - ) - self.native_tool_calling = False # Try to get model info via openrouter or litellm proxy first tried = False diff --git a/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py b/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py new file mode 100644 index 0000000000..c371088020 --- /dev/null +++ b/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py @@ -0,0 +1,143 @@ +"""Mixin for normalizing LLM responses with model-specific quirks. + +This handles models that support native function calling but return malformed +tool call arguments (e.g., arrays/objects encoded as JSON strings). +""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Any, Protocol + +from litellm.types.utils import Choices + + +if TYPE_CHECKING: + from litellm.types.llms.openai import ResponsesAPIResponse + from litellm.types.utils import ModelResponse + + +class _HostSupports(Protocol): + """Protocol defining what the host class must provide.""" + + model: str + + def _normalize_arguments_str(self, arguments_str: str) -> str: + """Normalize tool call arguments.""" + ... + + +class ResponseNormalizerMixin: + """Mixin for normalizing tool call arguments in LLM responses. + + Some LLMs return tool call arguments with nested data structures + (arrays, objects) encoded as JSON strings instead of proper structures. + For example: {"view_range": "[1, 100]"} instead of {"view_range": [1, 100]} + + This mixin provides methods to detect and fix these quirks based on model features. + + Host requirements: + - self.model: str - The model identifier + """ + + def normalize_chat_completion_response( + self: _HostSupports, response: ModelResponse + ) -> ModelResponse: + """Normalize tool call arguments in a Chat Completions API response. + + Modifies the response in-place if the model has known quirks. + + Args: + response: The ModelResponse from litellm.completion() + + Returns: + The same response object (modified in-place) + """ + # Normalize tool call arguments in each choice's message + for choice in response.choices: + # Type guard: only handle non-streaming Choices + if not isinstance(choice, Choices): + continue + if choice.message and choice.message.tool_calls: + for tool_call in choice.message.tool_calls: + if tool_call.function and tool_call.function.arguments: + tool_call.function.arguments = self._normalize_arguments_str( + tool_call.function.arguments + ) + + return response + + def normalize_responses_api_response( + self: _HostSupports, response: ResponsesAPIResponse + ) -> ResponsesAPIResponse: + """Normalize tool call arguments in a Responses API response. + + Modifies the response in-place if the model has known quirks. + + Args: + response: The ResponsesAPIResponse from litellm.responses() + + Returns: + The same response object (modified in-place) + """ + # Normalize tool call arguments in output items + if response.output: + for item in response.output: + # Check if it's a function_call item with arguments + if ( + hasattr(item, "type") + and getattr(item, "type", None) == "function_call" + and hasattr(item, "arguments") + ): + args = getattr(item, "arguments", None) + if args: + setattr(item, "arguments", self._normalize_arguments_str(args)) + + return response + + def _normalize_arguments_str(self: _HostSupports, arguments_str: str) -> str: + """Recursively parse nested JSON strings in tool call arguments. + + This fixes arguments where nested structures are encoded as JSON strings. + + Args: + arguments_str: The raw arguments JSON string from the LLM + + Returns: + Normalized arguments JSON string with proper data structures + """ + from openhands.sdk.llm.utils.model_features import get_features + + # Only normalize if model has this quirk + if not get_features(self.model).args_as_json_strings: + return arguments_str + + try: + # Parse the JSON string + args_dict = json.loads(arguments_str) + + # Recursively parse nested JSON strings + def _recursively_parse(obj: Any) -> Any: + if isinstance(obj, dict): + return {k: _recursively_parse(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [_recursively_parse(item) for item in obj] + elif isinstance(obj, str): + # Try to parse strings as JSON + try: + parsed = json.loads(obj) + return _recursively_parse(parsed) + except (json.JSONDecodeError, ValueError): + # Not JSON, return as-is + return obj + else: + # Numbers, booleans, None - return as-is + return obj + + normalized_dict = _recursively_parse(args_dict) + + # Re-encode to JSON string + return json.dumps(normalized_dict) + except (json.JSONDecodeError, ValueError): + # If parsing fails, return original + return arguments_str diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index e96e8b2630..bb3d3c2cb1 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -23,6 +23,7 @@ class ModelFeatures: supports_stop_words: bool supports_responses_api: bool force_string_serializer: bool + args_as_json_strings: bool # Pattern tables capturing current behavior. Keep patterns lowercase. @@ -92,13 +93,17 @@ class ModelFeatures: FORCE_STRING_SERIALIZER_PATTERNS: list[str] = [ "deepseek", "groq/kimi-k2-instruct", + "glm-4", ] -# Models that don't support native tool calling -# These models return tool call arguments with arrays/objects as JSON strings + +# Models that return tool call arguments with arrays/objects as JSON strings # instead of properly structured data (e.g., "[1, 100]" instead of [1, 100]) # This causes Pydantic validation errors when parsing tool arguments -NO_NATIVE_TOOL_CALLING_PATTERNS: list[str] = [] +# These models need recursive JSON string parsing in their arguments +FUNCTION_ARGS_AS_JSON_STRINGS_PATTERNS: list[str] = [ + "glm-4", +] def get_features(model: str) -> ModelFeatures: @@ -112,4 +117,7 @@ def get_features(model: str) -> ModelFeatures: ), supports_responses_api=model_matches(model, RESPONSES_API_PATTERNS), force_string_serializer=model_matches(model, FORCE_STRING_SERIALIZER_PATTERNS), + args_as_json_strings=model_matches( + model, FUNCTION_ARGS_AS_JSON_STRINGS_PATTERNS + ), ) diff --git a/tests/sdk/llm/test_response_normalizer.py b/tests/sdk/llm/test_response_normalizer.py new file mode 100644 index 0000000000..cfee35da35 --- /dev/null +++ b/tests/sdk/llm/test_response_normalizer.py @@ -0,0 +1,229 @@ +"""Tests for ResponseNormalizerMixin.""" + +import json +from unittest.mock import MagicMock + +from litellm import ChatCompletionMessageToolCall +from litellm.types.utils import ( + Choices, + Function, + Message as LiteLLMMessage, + ModelResponse, +) + +from openhands.sdk.llm.llm import LLM +from openhands.sdk.llm.mixins.response_normalizer import ResponseNormalizerMixin + + +class TestResponseNormalizerMixin: + """Test ResponseNormalizerMixin functionality.""" + + def test_llm_inherits_mixin(self): + """Verify LLM inherits from ResponseNormalizerMixin.""" + assert issubclass(LLM, ResponseNormalizerMixin) + + def test_mixin_methods_available(self): + """Verify mixin methods are available on LLM instances.""" + llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") + assert hasattr(llm, "normalize_chat_completion_response") + assert hasattr(llm, "normalize_responses_api_response") + assert hasattr(llm, "_normalize_arguments_str") + + def test_normalize_arguments_str_glm(self): + """Test normalization for GLM models.""" + llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") + + # Test array normalization + args_str = '{"view_range": "[1, 100]"}' + normalized = llm._normalize_arguments_str(args_str) + parsed = json.loads(normalized) + assert isinstance(parsed["view_range"], list) + assert parsed["view_range"] == [1, 100] + + # Test nested object normalization + args_str = '{"nested": "{\\"key\\": \\"value\\"}"}' + normalized = llm._normalize_arguments_str(args_str) + parsed = json.loads(normalized) + assert isinstance(parsed["nested"], dict) + assert parsed["nested"]["key"] == "value" + + # Test complex nested structure + args_str = '{"data": "{\\"items\\": \\"[1, 2, 3]\\"}"}' + normalized = llm._normalize_arguments_str(args_str) + parsed = json.loads(normalized) + assert isinstance(parsed["data"], dict) + assert isinstance(parsed["data"]["items"], list) + assert parsed["data"]["items"] == [1, 2, 3] + + def test_normalize_arguments_str_non_glm(self): + """Test that non-GLM models pass through unchanged.""" + llm = LLM(model="gpt-4") + + args_str = '{"view_range": "[1, 100]"}' + normalized = llm._normalize_arguments_str(args_str) + # Should not change for non-GLM models + assert normalized == args_str + + def test_normalize_arguments_str_invalid_json(self): + """Test handling of invalid JSON.""" + llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") + + # Invalid JSON should return original string + args_str = "not json" + normalized = llm._normalize_arguments_str(args_str) + assert normalized == args_str + + def test_normalize_chat_completion_response_glm(self): + """Test normalizing Chat Completions API response for GLM.""" + llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") + + # Create a mock response with malformed tool call arguments + tool_call = ChatCompletionMessageToolCall( + id="call_123", + type="function", + function=Function( + name="str_replace_editor", + arguments='{"path": "/test.py", "view_range": "[1, 100]"}', + ), + ) + + message = LiteLLMMessage(role="assistant", content=None, tool_calls=[tool_call]) + + choice = Choices(finish_reason="tool_calls", index=0, message=message) + + response = ModelResponse( + id="resp_123", + model="glm-4", + object="chat.completion", + created=1234567890, + choices=[choice], + ) + + # Normalize the response + normalized = llm.normalize_chat_completion_response(response) + + # Check that tool call arguments are normalized + first_choice = normalized.choices[0] + assert isinstance(first_choice, Choices) + assert first_choice.message.tool_calls is not None + normalized_args = json.loads( + first_choice.message.tool_calls[0].function.arguments + ) + assert isinstance(normalized_args["view_range"], list) + assert normalized_args["view_range"] == [1, 100] + + def test_normalize_chat_completion_response_non_glm(self): + """Test that non-GLM models are not normalized.""" + llm = LLM(model="gpt-4") + + # Create a mock response + tool_call = ChatCompletionMessageToolCall( + id="call_123", + type="function", + function=Function( + name="test_function", + arguments='{"view_range": "[1, 100]"}', + ), + ) + + message = LiteLLMMessage(role="assistant", content=None, tool_calls=[tool_call]) + + choice = Choices(finish_reason="tool_calls", index=0, message=message) + + response = ModelResponse( + id="resp_123", + model="gpt-4", + object="chat.completion", + created=1234567890, + choices=[choice], + ) + + # Normalize the response + normalized = llm.normalize_chat_completion_response(response) + + # Arguments should remain unchanged for non-GLM models + first_choice = normalized.choices[0] + assert isinstance(first_choice, Choices) + assert first_choice.message.tool_calls is not None + assert ( + first_choice.message.tool_calls[0].function.arguments + == '{"view_range": "[1, 100]"}' + ) + + def test_normalize_responses_api_response_glm(self): + """Test normalizing Responses API response for GLM.""" + llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") + + # Create a mock Responses API response + function_call_item = MagicMock() + function_call_item.type = "function_call" + function_call_item.arguments = '{"path": "/test.py", "view_range": "[1, 100]"}' + + response = MagicMock() + response.output = [function_call_item] + + # Normalize the response + normalized = llm.normalize_responses_api_response(response) + + # Check that arguments are normalized + assert normalized.output is not None + assert len(normalized.output) > 0 + first_item = normalized.output[0] + assert hasattr(first_item, "arguments") + arguments_str = getattr(first_item, "arguments") + assert arguments_str is not None + normalized_args = json.loads(arguments_str) + assert isinstance(normalized_args["view_range"], list) + assert normalized_args["view_range"] == [1, 100] + + def test_mixin_is_reusable(self): + """Test that the mixin can be used by other classes.""" + + class CustomClass(ResponseNormalizerMixin): + def __init__(self, model: str): + self.model = model + + # Create instance with GLM model + obj = CustomClass(model="litellm_proxy/openrouter/z-ai/glm-4.6") + args_str = '{"arr": "[1, 2, 3]"}' + normalized = obj._normalize_arguments_str(args_str) + parsed = json.loads(normalized) + assert isinstance(parsed["arr"], list) + assert parsed["arr"] == [1, 2, 3] + + # Create instance with non-GLM model + obj_gpt = CustomClass(model="gpt-4") + normalized_gpt = obj_gpt._normalize_arguments_str(args_str) + assert normalized_gpt == args_str + + def test_normalize_preserves_non_json_strings(self): + """Test that regular strings that aren't JSON are preserved.""" + llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") + + args_str = '{"message": "Hello, world!", "count": 42}' + normalized = llm._normalize_arguments_str(args_str) + parsed = json.loads(normalized) + assert parsed["message"] == "Hello, world!" + assert parsed["count"] == 42 + + def test_normalize_empty_tool_calls(self): + """Test normalizing response with no tool calls.""" + llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") + + message = LiteLLMMessage(role="assistant", content="Hello", tool_calls=None) + + choice = Choices(finish_reason="stop", index=0, message=message) + + response = ModelResponse( + id="resp_123", + model="glm-4", + object="chat.completion", + created=1234567890, + choices=[choice], + ) + + # Should not raise any errors + normalized = llm.normalize_chat_completion_response(response) + first_choice = normalized.choices[0] + assert isinstance(first_choice, Choices) + assert first_choice.message.content == "Hello" From 9f630fb7891fa3cdf60c83878513b5c775ba38fb Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 4 Nov 2025 10:32:52 +0100 Subject: [PATCH 03/10] remove comment --- openhands-sdk/openhands/sdk/llm/llm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index 3703c0e441..a63337d9d6 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -697,8 +697,6 @@ def _litellm_modify_params_ctx(self, flag: bool): # Capabilities, formatting, and info # ========================================================================= def _init_model_info_and_caps(self) -> None: - # Check if this model doesn't support native tool calling - # Try to get model info via openrouter or litellm proxy first tried = False try: From 33d21fb2f1d6b73049d8acd92d4fd73036ad8955 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 4 Nov 2025 11:31:51 +0100 Subject: [PATCH 04/10] add pydantic tool call validation --- openhands-sdk/openhands/sdk/llm/llm.py | 9 +- .../sdk/llm/mixins/response_normalizer.py | 143 ----------- .../openhands/sdk/llm/utils/model_features.py | 12 - openhands-sdk/openhands/sdk/tool/schema.py | 75 +++++- tests/sdk/llm/test_response_normalizer.py | 229 ------------------ 5 files changed, 74 insertions(+), 394 deletions(-) delete mode 100644 openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py delete mode 100644 tests/sdk/llm/test_response_normalizer.py diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py index d3f5585089..7fea0e1335 100644 --- a/openhands-sdk/openhands/sdk/llm/llm.py +++ b/openhands-sdk/openhands/sdk/llm/llm.py @@ -70,7 +70,6 @@ Message, ) from openhands.sdk.llm.mixins.non_native_fc import NonNativeToolCallingMixin -from openhands.sdk.llm.mixins.response_normalizer import ResponseNormalizerMixin from openhands.sdk.llm.options.chat_options import select_chat_options from openhands.sdk.llm.options.responses_options import select_responses_options from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot @@ -101,7 +100,7 @@ ) -class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin, ResponseNormalizerMixin): +class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin): """Language model interface for OpenHands agents. The LLM class provides a unified interface for interacting with various @@ -534,9 +533,6 @@ def _one_attempt(**retry_kwargs) -> ModelResponse: try: resp = _one_attempt() - # Normalize response for model-specific quirks (via mixin) - resp = self.normalize_chat_completion_response(resp) - # Convert the first choice to an OpenHands Message first_choice = resp["choices"][0] message = Message.from_llm_chat_message(first_choice["message"]) @@ -659,9 +655,6 @@ def _one_attempt(**retry_kwargs) -> ResponsesAPIResponse: try: resp: ResponsesAPIResponse = _one_attempt() - # Normalize response for model-specific quirks (via mixin) - resp = self.normalize_responses_api_response(resp) - # Parse output -> Message (typed) # Cast to a typed sequence # accepted by from_llm_responses_output diff --git a/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py b/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py deleted file mode 100644 index c371088020..0000000000 --- a/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py +++ /dev/null @@ -1,143 +0,0 @@ -"""Mixin for normalizing LLM responses with model-specific quirks. - -This handles models that support native function calling but return malformed -tool call arguments (e.g., arrays/objects encoded as JSON strings). -""" - -from __future__ import annotations - -import json -from typing import TYPE_CHECKING, Any, Protocol - -from litellm.types.utils import Choices - - -if TYPE_CHECKING: - from litellm.types.llms.openai import ResponsesAPIResponse - from litellm.types.utils import ModelResponse - - -class _HostSupports(Protocol): - """Protocol defining what the host class must provide.""" - - model: str - - def _normalize_arguments_str(self, arguments_str: str) -> str: - """Normalize tool call arguments.""" - ... - - -class ResponseNormalizerMixin: - """Mixin for normalizing tool call arguments in LLM responses. - - Some LLMs return tool call arguments with nested data structures - (arrays, objects) encoded as JSON strings instead of proper structures. - For example: {"view_range": "[1, 100]"} instead of {"view_range": [1, 100]} - - This mixin provides methods to detect and fix these quirks based on model features. - - Host requirements: - - self.model: str - The model identifier - """ - - def normalize_chat_completion_response( - self: _HostSupports, response: ModelResponse - ) -> ModelResponse: - """Normalize tool call arguments in a Chat Completions API response. - - Modifies the response in-place if the model has known quirks. - - Args: - response: The ModelResponse from litellm.completion() - - Returns: - The same response object (modified in-place) - """ - # Normalize tool call arguments in each choice's message - for choice in response.choices: - # Type guard: only handle non-streaming Choices - if not isinstance(choice, Choices): - continue - if choice.message and choice.message.tool_calls: - for tool_call in choice.message.tool_calls: - if tool_call.function and tool_call.function.arguments: - tool_call.function.arguments = self._normalize_arguments_str( - tool_call.function.arguments - ) - - return response - - def normalize_responses_api_response( - self: _HostSupports, response: ResponsesAPIResponse - ) -> ResponsesAPIResponse: - """Normalize tool call arguments in a Responses API response. - - Modifies the response in-place if the model has known quirks. - - Args: - response: The ResponsesAPIResponse from litellm.responses() - - Returns: - The same response object (modified in-place) - """ - # Normalize tool call arguments in output items - if response.output: - for item in response.output: - # Check if it's a function_call item with arguments - if ( - hasattr(item, "type") - and getattr(item, "type", None) == "function_call" - and hasattr(item, "arguments") - ): - args = getattr(item, "arguments", None) - if args: - setattr(item, "arguments", self._normalize_arguments_str(args)) - - return response - - def _normalize_arguments_str(self: _HostSupports, arguments_str: str) -> str: - """Recursively parse nested JSON strings in tool call arguments. - - This fixes arguments where nested structures are encoded as JSON strings. - - Args: - arguments_str: The raw arguments JSON string from the LLM - - Returns: - Normalized arguments JSON string with proper data structures - """ - from openhands.sdk.llm.utils.model_features import get_features - - # Only normalize if model has this quirk - if not get_features(self.model).args_as_json_strings: - return arguments_str - - try: - # Parse the JSON string - args_dict = json.loads(arguments_str) - - # Recursively parse nested JSON strings - def _recursively_parse(obj: Any) -> Any: - if isinstance(obj, dict): - return {k: _recursively_parse(v) for k, v in obj.items()} - elif isinstance(obj, list): - return [_recursively_parse(item) for item in obj] - elif isinstance(obj, str): - # Try to parse strings as JSON - try: - parsed = json.loads(obj) - return _recursively_parse(parsed) - except (json.JSONDecodeError, ValueError): - # Not JSON, return as-is - return obj - else: - # Numbers, booleans, None - return as-is - return obj - - normalized_dict = _recursively_parse(args_dict) - - # Re-encode to JSON string - return json.dumps(normalized_dict) - except (json.JSONDecodeError, ValueError): - # If parsing fails, return original - return arguments_str diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index b6e0fb7142..2ee16e37b2 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -23,7 +23,6 @@ class ModelFeatures: supports_stop_words: bool supports_responses_api: bool force_string_serializer: bool - args_as_json_strings: bool # Pattern tables capturing current behavior. Keep patterns lowercase. @@ -99,14 +98,6 @@ class ModelFeatures: "groq/kimi-k2-instruct", # explicit provider-prefixed IDs ] -# Models that return tool call arguments with arrays/objects as JSON strings -# instead of properly structured data (e.g., "[1, 100]" instead of [1, 100]) -# This causes Pydantic validation errors when parsing tool arguments -# These models need recursive JSON string parsing in their arguments -FUNCTION_ARGS_AS_JSON_STRINGS_PATTERNS: list[str] = [ - "glm-4", # e.g., GLM-4.5 / GLM-4.6 -] - def get_features(model: str) -> ModelFeatures: """Get model features.""" @@ -119,7 +110,4 @@ def get_features(model: str) -> ModelFeatures: ), supports_responses_api=model_matches(model, RESPONSES_API_PATTERNS), force_string_serializer=model_matches(model, FORCE_STRING_SERIALIZER_PATTERNS), - args_as_json_strings=model_matches( - model, FUNCTION_ARGS_AS_JSON_STRINGS_PATTERNS - ), ) diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py index 1c041aa659..09c2911825 100644 --- a/openhands-sdk/openhands/sdk/tool/schema.py +++ b/openhands-sdk/openhands/sdk/tool/schema.py @@ -1,8 +1,10 @@ +import json +import types from abc import ABC, abstractmethod from collections.abc import Sequence -from typing import Any, ClassVar, TypeVar +from typing import Any, ClassVar, TypeVar, Union, get_args, get_origin -from pydantic import ConfigDict, Field, create_model +from pydantic import ConfigDict, Field, create_model, model_validator from rich.text import Text from openhands.sdk.llm import ImageContent, TextContent @@ -100,6 +102,75 @@ class Schema(DiscriminatedUnionMixin): model_config: ClassVar[ConfigDict] = ConfigDict(extra="forbid", frozen=True) + @model_validator(mode="before") + @classmethod + def _decode_json_strings(cls, data: Any) -> Any: + """Pre-validator that automatically decodes JSON strings for list/dict fields. + + This validator runs before field validation and checks if any field that + expects a list or dict type has received a JSON string instead. If so, + it automatically decodes the string using json.loads(). + + This handles cases where LLMs (like GLM-4) return array/object values + as JSON strings instead of native JSON arrays/objects. + + Args: + data: The input data (usually a dict) before validation. + + Returns: + The data with JSON strings decoded where appropriate. + """ + if not isinstance(data, dict): + return data + + # Get the model's field annotations + annotations = getattr(cls, "__annotations__", {}) + + for field_name, value in list(data.items()): + # Skip if value is not a string + if not isinstance(value, str): + continue + + # Get the expected type for this field + if field_name not in annotations: + continue + + expected_type = annotations[field_name] + + # Check if the expected type is a list or dict + # Handle Union types (e.g., list | None, Optional[list]) + origin = get_origin(expected_type) + + # Check for Union types (handles both typing.Union and | syntax) + is_union = origin is Union + # Python 3.10+ has types.UnionType for | syntax + if hasattr(types, "UnionType") and origin is types.UnionType: + is_union = True + + if is_union: + # For Union types, check all args + type_args = get_args(expected_type) + expected_origins = [get_origin(arg) or arg for arg in type_args] + else: + expected_origins = [origin or expected_type] + + # Check if any of the expected types is list or dict + expects_list_or_dict = any(exp in (list, dict) for exp in expected_origins) + + if expects_list_or_dict: + # Try to parse the string as JSON + try: + parsed_value = json.loads(value) + # Only replace if we got a list or dict + if isinstance(parsed_value, (list, dict)): + data[field_name] = parsed_value + except (json.JSONDecodeError, ValueError): + # If parsing fails, leave the original value + # Pydantic will raise validation error if needed + pass + + return data + @classmethod def to_mcp_schema(cls) -> dict[str, Any]: """Convert to JSON schema format compatible with MCP.""" diff --git a/tests/sdk/llm/test_response_normalizer.py b/tests/sdk/llm/test_response_normalizer.py deleted file mode 100644 index cfee35da35..0000000000 --- a/tests/sdk/llm/test_response_normalizer.py +++ /dev/null @@ -1,229 +0,0 @@ -"""Tests for ResponseNormalizerMixin.""" - -import json -from unittest.mock import MagicMock - -from litellm import ChatCompletionMessageToolCall -from litellm.types.utils import ( - Choices, - Function, - Message as LiteLLMMessage, - ModelResponse, -) - -from openhands.sdk.llm.llm import LLM -from openhands.sdk.llm.mixins.response_normalizer import ResponseNormalizerMixin - - -class TestResponseNormalizerMixin: - """Test ResponseNormalizerMixin functionality.""" - - def test_llm_inherits_mixin(self): - """Verify LLM inherits from ResponseNormalizerMixin.""" - assert issubclass(LLM, ResponseNormalizerMixin) - - def test_mixin_methods_available(self): - """Verify mixin methods are available on LLM instances.""" - llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") - assert hasattr(llm, "normalize_chat_completion_response") - assert hasattr(llm, "normalize_responses_api_response") - assert hasattr(llm, "_normalize_arguments_str") - - def test_normalize_arguments_str_glm(self): - """Test normalization for GLM models.""" - llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") - - # Test array normalization - args_str = '{"view_range": "[1, 100]"}' - normalized = llm._normalize_arguments_str(args_str) - parsed = json.loads(normalized) - assert isinstance(parsed["view_range"], list) - assert parsed["view_range"] == [1, 100] - - # Test nested object normalization - args_str = '{"nested": "{\\"key\\": \\"value\\"}"}' - normalized = llm._normalize_arguments_str(args_str) - parsed = json.loads(normalized) - assert isinstance(parsed["nested"], dict) - assert parsed["nested"]["key"] == "value" - - # Test complex nested structure - args_str = '{"data": "{\\"items\\": \\"[1, 2, 3]\\"}"}' - normalized = llm._normalize_arguments_str(args_str) - parsed = json.loads(normalized) - assert isinstance(parsed["data"], dict) - assert isinstance(parsed["data"]["items"], list) - assert parsed["data"]["items"] == [1, 2, 3] - - def test_normalize_arguments_str_non_glm(self): - """Test that non-GLM models pass through unchanged.""" - llm = LLM(model="gpt-4") - - args_str = '{"view_range": "[1, 100]"}' - normalized = llm._normalize_arguments_str(args_str) - # Should not change for non-GLM models - assert normalized == args_str - - def test_normalize_arguments_str_invalid_json(self): - """Test handling of invalid JSON.""" - llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") - - # Invalid JSON should return original string - args_str = "not json" - normalized = llm._normalize_arguments_str(args_str) - assert normalized == args_str - - def test_normalize_chat_completion_response_glm(self): - """Test normalizing Chat Completions API response for GLM.""" - llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") - - # Create a mock response with malformed tool call arguments - tool_call = ChatCompletionMessageToolCall( - id="call_123", - type="function", - function=Function( - name="str_replace_editor", - arguments='{"path": "/test.py", "view_range": "[1, 100]"}', - ), - ) - - message = LiteLLMMessage(role="assistant", content=None, tool_calls=[tool_call]) - - choice = Choices(finish_reason="tool_calls", index=0, message=message) - - response = ModelResponse( - id="resp_123", - model="glm-4", - object="chat.completion", - created=1234567890, - choices=[choice], - ) - - # Normalize the response - normalized = llm.normalize_chat_completion_response(response) - - # Check that tool call arguments are normalized - first_choice = normalized.choices[0] - assert isinstance(first_choice, Choices) - assert first_choice.message.tool_calls is not None - normalized_args = json.loads( - first_choice.message.tool_calls[0].function.arguments - ) - assert isinstance(normalized_args["view_range"], list) - assert normalized_args["view_range"] == [1, 100] - - def test_normalize_chat_completion_response_non_glm(self): - """Test that non-GLM models are not normalized.""" - llm = LLM(model="gpt-4") - - # Create a mock response - tool_call = ChatCompletionMessageToolCall( - id="call_123", - type="function", - function=Function( - name="test_function", - arguments='{"view_range": "[1, 100]"}', - ), - ) - - message = LiteLLMMessage(role="assistant", content=None, tool_calls=[tool_call]) - - choice = Choices(finish_reason="tool_calls", index=0, message=message) - - response = ModelResponse( - id="resp_123", - model="gpt-4", - object="chat.completion", - created=1234567890, - choices=[choice], - ) - - # Normalize the response - normalized = llm.normalize_chat_completion_response(response) - - # Arguments should remain unchanged for non-GLM models - first_choice = normalized.choices[0] - assert isinstance(first_choice, Choices) - assert first_choice.message.tool_calls is not None - assert ( - first_choice.message.tool_calls[0].function.arguments - == '{"view_range": "[1, 100]"}' - ) - - def test_normalize_responses_api_response_glm(self): - """Test normalizing Responses API response for GLM.""" - llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") - - # Create a mock Responses API response - function_call_item = MagicMock() - function_call_item.type = "function_call" - function_call_item.arguments = '{"path": "/test.py", "view_range": "[1, 100]"}' - - response = MagicMock() - response.output = [function_call_item] - - # Normalize the response - normalized = llm.normalize_responses_api_response(response) - - # Check that arguments are normalized - assert normalized.output is not None - assert len(normalized.output) > 0 - first_item = normalized.output[0] - assert hasattr(first_item, "arguments") - arguments_str = getattr(first_item, "arguments") - assert arguments_str is not None - normalized_args = json.loads(arguments_str) - assert isinstance(normalized_args["view_range"], list) - assert normalized_args["view_range"] == [1, 100] - - def test_mixin_is_reusable(self): - """Test that the mixin can be used by other classes.""" - - class CustomClass(ResponseNormalizerMixin): - def __init__(self, model: str): - self.model = model - - # Create instance with GLM model - obj = CustomClass(model="litellm_proxy/openrouter/z-ai/glm-4.6") - args_str = '{"arr": "[1, 2, 3]"}' - normalized = obj._normalize_arguments_str(args_str) - parsed = json.loads(normalized) - assert isinstance(parsed["arr"], list) - assert parsed["arr"] == [1, 2, 3] - - # Create instance with non-GLM model - obj_gpt = CustomClass(model="gpt-4") - normalized_gpt = obj_gpt._normalize_arguments_str(args_str) - assert normalized_gpt == args_str - - def test_normalize_preserves_non_json_strings(self): - """Test that regular strings that aren't JSON are preserved.""" - llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") - - args_str = '{"message": "Hello, world!", "count": 42}' - normalized = llm._normalize_arguments_str(args_str) - parsed = json.loads(normalized) - assert parsed["message"] == "Hello, world!" - assert parsed["count"] == 42 - - def test_normalize_empty_tool_calls(self): - """Test normalizing response with no tool calls.""" - llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6") - - message = LiteLLMMessage(role="assistant", content="Hello", tool_calls=None) - - choice = Choices(finish_reason="stop", index=0, message=message) - - response = ModelResponse( - id="resp_123", - model="glm-4", - object="chat.completion", - created=1234567890, - choices=[choice], - ) - - # Should not raise any errors - normalized = llm.normalize_chat_completion_response(response) - first_choice = normalized.choices[0] - assert isinstance(first_choice, Choices) - assert first_choice.message.content == "Hello" From 42dafb10702a7684e920186d45050aca1b93b980 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 10:41:40 +0000 Subject: [PATCH 05/10] Refactor _decode_json_strings to eliminate special-case union handling - Remove fragile Union type detection that checked typing.Union and types.UnionType separately - Use get_args() uniformly for both union and simple types, eliminating branching logic - Consolidate early exit conditions into single check for better readability - Remove unnecessary list() copy when iterating over data.items() - Remove unused imports: types module and Union from typing - Improve code comments to clarify json.loads() return types - Reduce function from 46 lines to 34 lines (-26%) while maintaining functionality This addresses code review feedback about treating normal cases as special cases and eliminates brittleness around Python version differences in union handling. Co-authored-by: openhands --- openhands-sdk/openhands/sdk/tool/schema.py | 39 ++++++++-------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py index 09c2911825..c3e49b3733 100644 --- a/openhands-sdk/openhands/sdk/tool/schema.py +++ b/openhands-sdk/openhands/sdk/tool/schema.py @@ -1,8 +1,7 @@ import json -import types from abc import ABC, abstractmethod from collections.abc import Sequence -from typing import Any, ClassVar, TypeVar, Union, get_args, get_origin +from typing import Any, ClassVar, TypeVar, get_args, get_origin from pydantic import ConfigDict, Field, create_model, model_validator from rich.text import Text @@ -126,42 +125,30 @@ def _decode_json_strings(cls, data: Any) -> Any: # Get the model's field annotations annotations = getattr(cls, "__annotations__", {}) - for field_name, value in list(data.items()): - # Skip if value is not a string - if not isinstance(value, str): - continue - - # Get the expected type for this field - if field_name not in annotations: + for field_name, value in data.items(): + # Skip if value is not a string or field not in annotations + if not isinstance(value, str) or field_name not in annotations: continue expected_type = annotations[field_name] - # Check if the expected type is a list or dict - # Handle Union types (e.g., list | None, Optional[list]) - origin = get_origin(expected_type) - - # Check for Union types (handles both typing.Union and | syntax) - is_union = origin is Union - # Python 3.10+ has types.UnionType for | syntax - if hasattr(types, "UnionType") and origin is types.UnionType: - is_union = True - - if is_union: - # For Union types, check all args - type_args = get_args(expected_type) + # Get all possible types (handles unions and simple types uniformly) + type_args = get_args(expected_type) + if type_args: + # Union or generic type - extract origins from all args expected_origins = [get_origin(arg) or arg for arg in type_args] else: + # Simple type - use origin or the type itself + origin = get_origin(expected_type) expected_origins = [origin or expected_type] # Check if any of the expected types is list or dict - expects_list_or_dict = any(exp in (list, dict) for exp in expected_origins) - - if expects_list_or_dict: + if any(exp in (list, dict) for exp in expected_origins): # Try to parse the string as JSON try: parsed_value = json.loads(value) - # Only replace if we got a list or dict + # json.loads() returns dict, list, str, int, float, bool, or None + # Only use parsed value if it matches expected collection types if isinstance(parsed_value, (list, dict)): data[field_name] = parsed_value except (json.JSONDecodeError, ValueError): From 61a5ecb583d5493b9dfa3d1df8fb577b1c4c5d2d Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 11:01:49 +0000 Subject: [PATCH 06/10] Fix _decode_json_strings to handle Annotated types and field aliases correctly This commit addresses two critical issues identified in code review: 1. **Annotated type handling**: The validator now properly unwraps Annotated types before checking for list/dict, ensuring fields like `Annotated[list[str], Field(...)]` are correctly decoded from JSON strings. 2. **Field alias support**: Changed from using __annotations__ to model_fields, which properly handles field aliases and inherited fields. The validator now checks both field names and their aliases when looking up values in the input data. 3. **Union type detection fix**: Fixed a critical bug where we were checking the origin of type arguments instead of the type itself. For example, with `list[str]`, we were incorrectly extracting `str` instead of `list`. Changes: - Import Annotated, Union, and types module - Use cls.model_fields instead of cls.__annotations__ - Add logic to unwrap Annotated types before type checking - Fix Union type detection to check the type's origin, not its args - Properly handle both old-style Union and new-style (|) union syntax The validator now correctly handles: - `Annotated[list[str], Field(description='...')]` - Fields with aliases: `my_list: list[int] = Field(alias='myList')` - Union types: `list[str] | None` - Inherited fields from parent classes Co-authored-by: openhands --- openhands-sdk/openhands/sdk/tool/schema.py | 42 ++++++++++++++-------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py index c3e49b3733..6333e48f74 100644 --- a/openhands-sdk/openhands/sdk/tool/schema.py +++ b/openhands-sdk/openhands/sdk/tool/schema.py @@ -1,7 +1,8 @@ import json +import types from abc import ABC, abstractmethod from collections.abc import Sequence -from typing import Any, ClassVar, TypeVar, get_args, get_origin +from typing import Annotated, Any, ClassVar, TypeVar, Union, get_args, get_origin from pydantic import ConfigDict, Field, create_model, model_validator from rich.text import Text @@ -122,24 +123,37 @@ def _decode_json_strings(cls, data: Any) -> Any: if not isinstance(data, dict): return data - # Get the model's field annotations - annotations = getattr(cls, "__annotations__", {}) + # Use model_fields to properly handle aliases and inherited fields + for field_name, field_info in cls.model_fields.items(): + # Check both the field name and its alias (if any) + data_key = field_info.alias if field_info.alias else field_name + if data_key not in data: + continue - for field_name, value in data.items(): - # Skip if value is not a string or field not in annotations - if not isinstance(value, str) or field_name not in annotations: + value = data[data_key] + # Skip if value is not a string + if not isinstance(value, str): continue - expected_type = annotations[field_name] + expected_type = field_info.annotation + + # Unwrap Annotated types - only the first arg is the actual type + if get_origin(expected_type) is Annotated: + type_args = get_args(expected_type) + expected_type = type_args[0] if type_args else expected_type + + # Get the origin of the expected type (e.g., list from list[str]) + origin = get_origin(expected_type) - # Get all possible types (handles unions and simple types uniformly) - type_args = get_args(expected_type) - if type_args: - # Union or generic type - extract origins from all args + # For Union types, we need to check all union members + if origin is Union or ( + hasattr(types, "UnionType") and origin is types.UnionType + ): + # For Union types, check each union member + type_args = get_args(expected_type) expected_origins = [get_origin(arg) or arg for arg in type_args] else: - # Simple type - use origin or the type itself - origin = get_origin(expected_type) + # For non-Union types, just check the origin expected_origins = [origin or expected_type] # Check if any of the expected types is list or dict @@ -150,7 +164,7 @@ def _decode_json_strings(cls, data: Any) -> Any: # json.loads() returns dict, list, str, int, float, bool, or None # Only use parsed value if it matches expected collection types if isinstance(parsed_value, (list, dict)): - data[field_name] = parsed_value + data[data_key] = parsed_value except (json.JSONDecodeError, ValueError): # If parsing fails, leave the original value # Pydantic will raise validation error if needed From dd615968112376bcd93048a77d2455ecd932abca Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 4 Nov 2025 12:10:32 +0100 Subject: [PATCH 07/10] improve doc --- openhands-sdk/openhands/sdk/tool/schema.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py index 6333e48f74..c4122431bd 100644 --- a/openhands-sdk/openhands/sdk/tool/schema.py +++ b/openhands-sdk/openhands/sdk/tool/schema.py @@ -112,7 +112,9 @@ def _decode_json_strings(cls, data: Any) -> Any: it automatically decodes the string using json.loads(). This handles cases where LLMs (like GLM-4) return array/object values - as JSON strings instead of native JSON arrays/objects. + as JSON strings instead of native JSON arrays/objects i.e. + "[1, 100]" instead of + [1, 100]. Args: data: The input data (usually a dict) before validation. From a5feb9d3d347525447218823f1049cf58a5d6937 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 13:01:03 +0000 Subject: [PATCH 08/10] Add comprehensive unit tests for JSON string decoding in Schema validator This commit adds 16 unit tests to verify the _decode_json_strings validator works correctly for all edge cases: **Basic functionality:** - JSON strings decoded to native lists/dicts - Native lists/dicts pass through unchanged - Regular string fields not affected by JSON decoding **Advanced type handling:** - Annotated types (e.g., Annotated[list[str], Field(...)]) - Field aliases (e.g., Field(alias='myList')) - Optional/Union types (e.g., list[str] | None) - Nested structures (e.g., list[list[int]]) **Error handling:** - Invalid JSON strings rejected with ValidationError - JSON strings with wrong types rejected - Empty collections handled correctly **Edge cases:** - Unicode characters in JSON strings - Extra whitespace in JSON strings - Mixed native and JSON string values in same model All tests pass and validate that the validator correctly handles both JSON-encoded strings from models like GLM-4 and regular native objects from other models. Co-authored-by: openhands --- tests/sdk/tool/test_schema_json_decoding.py | 257 ++++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 tests/sdk/tool/test_schema_json_decoding.py diff --git a/tests/sdk/tool/test_schema_json_decoding.py b/tests/sdk/tool/test_schema_json_decoding.py new file mode 100644 index 0000000000..5c9a246ed2 --- /dev/null +++ b/tests/sdk/tool/test_schema_json_decoding.py @@ -0,0 +1,257 @@ +"""Tests for JSON string decoding in Schema validator. + +This module tests the _decode_json_strings validator that automatically +decodes JSON strings for list/dict fields. This handles cases where LLMs +(like GLM-4) return array/object values as JSON strings instead of native +JSON arrays/objects. +""" + +from typing import Annotated + +import pytest +from pydantic import Field, ValidationError + +from openhands.sdk.tool.schema import Action + + +class JsonDecodingTestAction(Action): + """Test action with list and dict fields.""" + + items: list[str] = Field(description="A list of items") + config: dict[str, int] = Field(description="Configuration dictionary") + name: str = Field(description="A regular string field") + + +class JsonDecodingAnnotatedAction(Action): + """Test action with Annotated types.""" + + items: Annotated[list[str], Field(description="A list of items")] + config: Annotated[dict[str, int], Field(description="Configuration dictionary")] + + +class JsonDecodingAliasAction(Action): + """Test action with field aliases.""" + + my_list: list[int] = Field(alias="myList", description="A list with alias") + my_dict: dict[str, str] = Field(alias="myDict", description="A dict with alias") + + +class JsonDecodingOptionalAction(Action): + """Test action with optional list/dict fields.""" + + items: list[str] | None = Field(default=None, description="Optional list") + config: dict[str, int] | None = Field(default=None, description="Optional dict") + + +def test_decode_json_string_list(): + """Test that JSON string lists are decoded to native lists.""" + data = { + "items": '["a", "b", "c"]', + "config": '{"x": 1, "y": 2}', + "name": "test", + } + action = JsonDecodingTestAction.model_validate(data) + + assert action.items == ["a", "b", "c"] + assert action.config == {"x": 1, "y": 2} + assert action.name == "test" + + +def test_decode_json_string_dict(): + """Test that JSON string dicts are decoded to native dicts.""" + data = { + "items": '["item1", "item2"]', + "config": '{"key1": 10, "key2": 20}', + "name": "dict_test", + } + action = JsonDecodingTestAction.model_validate(data) + + assert action.items == ["item1", "item2"] + assert action.config == {"key1": 10, "key2": 20} + assert action.name == "dict_test" + + +def test_native_list_dict_passthrough(): + """Test that native lists and dicts pass through unchanged.""" + data = { + "items": ["direct", "list"], + "config": {"direct": 42}, + "name": "native_test", + } + action = JsonDecodingTestAction.model_validate(data) + + assert action.items == ["direct", "list"] + assert action.config == {"direct": 42} + assert action.name == "native_test" + + +def test_regular_string_not_decoded(): + """Test that regular string fields are not affected by JSON decoding.""" + data = { + "items": "[]", + "config": "{}", + "name": "this is not json but a regular string", + } + action = JsonDecodingTestAction.model_validate(data) + + assert action.items == [] + assert action.config == {} + # Regular string field should NOT be decoded + assert action.name == "this is not json but a regular string" + + +def test_annotated_types(): + """Test that Annotated types are properly handled.""" + data = { + "items": '["x", "y", "z"]', + "config": '{"a": 1, "b": 2}', + } + action = JsonDecodingAnnotatedAction.model_validate(data) + + assert action.items == ["x", "y", "z"] + assert action.config == {"a": 1, "b": 2} + + +def test_field_aliases(): + """Test that field aliases are properly handled.""" + data = { + "myList": "[1, 2, 3]", + "myDict": '{"key": "value"}', + } + action = JsonDecodingAliasAction.model_validate(data) + + assert action.my_list == [1, 2, 3] + assert action.my_dict == {"key": "value"} + + +def test_optional_fields_with_json_strings(): + """Test that optional list/dict fields work with JSON strings.""" + data = { + "items": '["opt1", "opt2"]', + "config": '{"opt": 99}', + } + action = JsonDecodingOptionalAction.model_validate(data) + + assert action.items == ["opt1", "opt2"] + assert action.config == {"opt": 99} + + +def test_optional_fields_with_none(): + """Test that optional fields can be None.""" + data = {} + action = JsonDecodingOptionalAction.model_validate(data) + + assert action.items is None + assert action.config is None + + +def test_optional_fields_with_native_values(): + """Test that optional fields work with native values.""" + data = { + "items": ["native1", "native2"], + "config": {"native": 100}, + } + action = JsonDecodingOptionalAction.model_validate(data) + + assert action.items == ["native1", "native2"] + assert action.config == {"native": 100} + + +def test_invalid_json_string_rejected(): + """Test that invalid JSON strings are rejected with validation error.""" + data = { + "items": "not valid json", + "config": "{}", + "name": "test", + } + + with pytest.raises(ValidationError) as exc_info: + JsonDecodingTestAction.model_validate(data) + + # Should fail validation because "not valid json" can't be parsed as list + assert "items" in str(exc_info.value) + + +def test_json_string_with_wrong_type_rejected(): + """Test that JSON strings with wrong types are rejected.""" + # Field expects list but JSON string contains dict + data = { + "items": '{"not": "a list"}', + "config": "{}", + "name": "test", + } + + with pytest.raises(ValidationError) as exc_info: + JsonDecodingTestAction.model_validate(data) + + assert "items" in str(exc_info.value) + + +def test_nested_structures(): + """Test that nested lists and dicts in JSON strings work.""" + + class NestedAction(Action): + nested_list: list[list[int]] = Field(description="Nested list") + nested_dict: dict[str, dict[str, str]] = Field(description="Nested dict") + + data = { + "nested_list": "[[1, 2], [3, 4]]", + "nested_dict": '{"outer": {"inner": "value"}}', + } + action = NestedAction.model_validate(data) + + assert action.nested_list == [[1, 2], [3, 4]] + assert action.nested_dict == {"outer": {"inner": "value"}} + + +def test_empty_collections(): + """Test that empty lists and dicts work.""" + data = { + "items": "[]", + "config": "{}", + "name": "empty", + } + action = JsonDecodingTestAction.model_validate(data) + + assert action.items == [] + assert action.config == {} + + +def test_mixed_native_and_json_strings(): + """Test mixing native values and JSON strings in same model.""" + data = { + "items": ["native", "list"], # Native list + "config": '{"from": 1, "json": 2}', # JSON string + "name": "mixed", + } + action = JsonDecodingTestAction.model_validate(data) + + assert action.items == ["native", "list"] + assert action.config == {"from": 1, "json": 2} + assert action.name == "mixed" + + +def test_unicode_in_json_strings(): + """Test that unicode characters in JSON strings are handled correctly.""" + data = { + "items": '["hello", "世界", "🌍"]', + "config": '{"greeting": 1, "你好": 2}', + "name": "unicode", + } + action = JsonDecodingTestAction.model_validate(data) + + assert action.items == ["hello", "世界", "🌍"] + assert action.config == {"greeting": 1, "你好": 2} + + +def test_whitespace_in_json_strings(): + """Test that JSON strings with extra whitespace work.""" + data = { + "items": ' [ "a" , "b" , "c" ] ', + "config": ' { "x" : 1 , "y" : 2 } ', + "name": "whitespace", + } + action = JsonDecodingTestAction.model_validate(data) + + assert action.items == ["a", "b", "c"] + assert action.config == {"x": 1, "y": 2} From f934f1ec713de36964ba69d6d01d29a2b34b828c Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 4 Nov 2025 17:06:39 +0100 Subject: [PATCH 09/10] update doc --- openhands-sdk/openhands/sdk/tool/schema.py | 30 +++++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py index c4122431bd..20df1dda47 100644 --- a/openhands-sdk/openhands/sdk/tool/schema.py +++ b/openhands-sdk/openhands/sdk/tool/schema.py @@ -111,10 +111,32 @@ def _decode_json_strings(cls, data: Any) -> Any: expects a list or dict type has received a JSON string instead. If so, it automatically decodes the string using json.loads(). - This handles cases where LLMs (like GLM-4) return array/object values - as JSON strings instead of native JSON arrays/objects i.e. - "[1, 100]" instead of - [1, 100]. + This handles cases where certain LLMs (such as GLM 4.6) incorrectly encode + array/object parameters as JSON strings when using native function calling. + + Example raw LLM output from GLM 4.6: + { + "role": "assistant", + "content": "I'll view the file for you.", + "tool_calls": [{ + "id": "call_ef8e", + "type": "function", + "function": { + "name": "str_replace_editor", + "arguments": '{ + "command": "view", + "path": "/tmp/test.txt", + "view_range": "[1, 5]" + }' + } + }] + } + Expected output: `"view_range" : [1, 5]` + + Note: The arguments field is a JSON string. When decoded, view_range is + incorrectly a string "[1, 5]" instead of the proper array [1, 5]. + This validator automatically fixes this by detecting that view_range + expects a list type and decoding the JSON string to get the actual array. Args: data: The input data (usually a dict) before validation. From 512dfe8af1a70eff642a8e6b813230a9872bb1b9 Mon Sep 17 00:00:00 2001 From: simonrosenberg <157206163+simonrosenberg@users.noreply.github.com> Date: Wed, 5 Nov 2025 13:45:57 +0100 Subject: [PATCH 10/10] Apply suggestion from @simonrosenberg --- openhands-sdk/openhands/sdk/llm/utils/model_features.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py index 2ee16e37b2..e53d892085 100644 --- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py +++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py @@ -93,7 +93,7 @@ class ModelFeatures: # Keep these entries as bare substrings without wildcards. FORCE_STRING_SERIALIZER_PATTERNS: list[str] = [ "deepseek", # e.g., DeepSeek-V3.2-Exp - "glm-4", # e.g., GLM-4.5 / GLM-4.6 + "glm", # e.g., GLM-4.5 / GLM-4.6 # Kimi K2-Instruct requires string serialization only on Groq "groq/kimi-k2-instruct", # explicit provider-prefixed IDs ]