From 746b72734fee3057595abcc383c1595ca2d96d00 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Mon, 3 Nov 2025 15:47:54 +0100
Subject: [PATCH 01/10] revert stuff

---
 openhands-sdk/openhands/sdk/llm/llm.py             | 14 ++++++++++++++
 .../openhands/sdk/llm/utils/model_features.py      |  7 ++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py
index 6fb213cf9b..45410accfb 100644
--- a/openhands-sdk/openhands/sdk/llm/llm.py
+++ b/openhands-sdk/openhands/sdk/llm/llm.py
@@ -690,6 +690,20 @@ def _litellm_modify_params_ctx(self, flag: bool):
     # Capabilities, formatting, and info
     # =========================================================================
     def _init_model_info_and_caps(self) -> None:
+        # Check if this model doesn't support native tool calling
+        from openhands.sdk.llm.utils.model_features import (
+            NO_NATIVE_TOOL_CALLING_PATTERNS,
+            model_matches,
+        )
+
+        if model_matches(self.model, NO_NATIVE_TOOL_CALLING_PATTERNS):
+            if self.native_tool_calling:
+                logger.info(
+                    f"Model {self.model} does not support native tool calling. "
+                    f"Setting native_tool_calling=False."
+                )
+            self.native_tool_calling = False
+
         # Try to get model info via openrouter or litellm proxy first
         tried = False
         try:
diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
index 10a104b3de..e96e8b2630 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -91,10 +91,15 @@ class ModelFeatures:
 # and need plain strings instead
 FORCE_STRING_SERIALIZER_PATTERNS: list[str] = [
     "deepseek",
-    "glm",
     "groq/kimi-k2-instruct",
 ]
 
+# Models that don't support native tool calling
+# These models return tool call arguments with arrays/objects as JSON strings
+# instead of properly structured data (e.g., "[1, 100]" instead of [1, 100])
+# This causes Pydantic validation errors when parsing tool arguments
+NO_NATIVE_TOOL_CALLING_PATTERNS: list[str] = []
+
 
 def get_features(model: str) -> ModelFeatures:
     """Get model features."""

From eecd17688905c6f063c50327e5dfc35e9e18a945 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 4 Nov 2025 10:27:00 +0100
Subject: [PATCH 02/10] add glm4.6 response normalizer

---
 openhands-sdk/openhands/sdk/llm/llm.py        |  21 +-
 .../sdk/llm/mixins/response_normalizer.py     | 143 +++++++++++
 .../openhands/sdk/llm/utils/model_features.py |  14 +-
 tests/sdk/llm/test_response_normalizer.py     | 229 ++++++++++++++++++
 4 files changed, 391 insertions(+), 16 deletions(-)
 create mode 100644 openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py
 create mode 100644 tests/sdk/llm/test_response_normalizer.py

diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py
index 45410accfb..3703c0e441 100644
--- a/openhands-sdk/openhands/sdk/llm/llm.py
+++ b/openhands-sdk/openhands/sdk/llm/llm.py
@@ -70,6 +70,7 @@
     Message,
 )
 from openhands.sdk.llm.mixins.non_native_fc import NonNativeToolCallingMixin
+from openhands.sdk.llm.mixins.response_normalizer import ResponseNormalizerMixin
 from openhands.sdk.llm.options.chat_options import select_chat_options
 from openhands.sdk.llm.options.responses_options import select_responses_options
 from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot
@@ -100,7 +101,7 @@
 )
 
 
-class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
+class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin, ResponseNormalizerMixin):
     """Refactored LLM: simple `completion()`, centralized Telemetry, tiny helpers."""
 
     # =========================================================================
@@ -495,6 +496,9 @@ def _one_attempt(**retry_kwargs) -> ModelResponse:
         try:
             resp = _one_attempt()
 
+            # Normalize response for model-specific quirks (via mixin)
+            resp = self.normalize_chat_completion_response(resp)
+
             # Convert the first choice to an OpenHands Message
             first_choice = resp["choices"][0]
             message = Message.from_llm_chat_message(first_choice["message"])
@@ -614,6 +618,9 @@ def _one_attempt(**retry_kwargs) -> ResponsesAPIResponse:
         try:
             resp: ResponsesAPIResponse = _one_attempt()
 
+            # Normalize response for model-specific quirks (via mixin)
+            resp = self.normalize_responses_api_response(resp)
+
             # Parse output -> Message (typed)
             # Cast to a typed sequence
             # accepted by from_llm_responses_output
@@ -691,18 +698,6 @@ def _litellm_modify_params_ctx(self, flag: bool):
     # =========================================================================
     def _init_model_info_and_caps(self) -> None:
         # Check if this model doesn't support native tool calling
-        from openhands.sdk.llm.utils.model_features import (
-            NO_NATIVE_TOOL_CALLING_PATTERNS,
-            model_matches,
-        )
-
-        if model_matches(self.model, NO_NATIVE_TOOL_CALLING_PATTERNS):
-            if self.native_tool_calling:
-                logger.info(
-                    f"Model {self.model} does not support native tool calling. "
-                    f"Setting native_tool_calling=False."
-                )
-            self.native_tool_calling = False
 
         # Try to get model info via openrouter or litellm proxy first
         tried = False
diff --git a/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py b/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py
new file mode 100644
index 0000000000..c371088020
--- /dev/null
+++ b/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py
@@ -0,0 +1,143 @@
+"""Mixin for normalizing LLM responses with model-specific quirks.
+
+This handles models that support native function calling but return malformed
+tool call arguments (e.g., arrays/objects encoded as JSON strings).
+"""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Any, Protocol
+
+from litellm.types.utils import Choices
+
+
+if TYPE_CHECKING:
+    from litellm.types.llms.openai import ResponsesAPIResponse
+    from litellm.types.utils import ModelResponse
+
+
+class _HostSupports(Protocol):
+    """Protocol defining what the host class must provide."""
+
+    model: str
+
+    def _normalize_arguments_str(self, arguments_str: str) -> str:
+        """Normalize tool call arguments."""
+        ...
+
+
+class ResponseNormalizerMixin:
+    """Mixin for normalizing tool call arguments in LLM responses.
+
+    Some LLMs return tool call arguments with nested data structures
+    (arrays, objects) encoded as JSON strings instead of proper structures.
+    For example: {"view_range": "[1, 100]"} instead of {"view_range": [1, 100]}
+
+    This mixin provides methods to detect and fix these quirks based on model features.
+
+    Host requirements:
+    - self.model: str - The model identifier
+    """
+
+    def normalize_chat_completion_response(
+        self: _HostSupports, response: ModelResponse
+    ) -> ModelResponse:
+        """Normalize tool call arguments in a Chat Completions API response.
+
+        Modifies the response in-place if the model has known quirks.
+
+        Args:
+            response: The ModelResponse from litellm.completion()
+
+        Returns:
+            The same response object (modified in-place)
+        """
+        # Normalize tool call arguments in each choice's message
+        for choice in response.choices:
+            # Type guard: only handle non-streaming Choices
+            if not isinstance(choice, Choices):
+                continue
+            if choice.message and choice.message.tool_calls:
+                for tool_call in choice.message.tool_calls:
+                    if tool_call.function and tool_call.function.arguments:
+                        tool_call.function.arguments = self._normalize_arguments_str(
+                            tool_call.function.arguments
+                        )
+
+        return response
+
+    def normalize_responses_api_response(
+        self: _HostSupports, response: ResponsesAPIResponse
+    ) -> ResponsesAPIResponse:
+        """Normalize tool call arguments in a Responses API response.
+
+        Modifies the response in-place if the model has known quirks.
+
+        Args:
+            response: The ResponsesAPIResponse from litellm.responses()
+
+        Returns:
+            The same response object (modified in-place)
+        """
+        # Normalize tool call arguments in output items
+        if response.output:
+            for item in response.output:
+                # Check if it's a function_call item with arguments
+                if (
+                    hasattr(item, "type")
+                    and getattr(item, "type", None) == "function_call"
+                    and hasattr(item, "arguments")
+                ):
+                    args = getattr(item, "arguments", None)
+                    if args:
+                        setattr(item, "arguments", self._normalize_arguments_str(args))
+
+        return response
+
+    def _normalize_arguments_str(self: _HostSupports, arguments_str: str) -> str:
+        """Recursively parse nested JSON strings in tool call arguments.
+
+        This fixes arguments where nested structures are encoded as JSON strings.
+
+        Args:
+            arguments_str: The raw arguments JSON string from the LLM
+
+        Returns:
+            Normalized arguments JSON string with proper data structures
+        """
+        from openhands.sdk.llm.utils.model_features import get_features
+
+        # Only normalize if model has this quirk
+        if not get_features(self.model).args_as_json_strings:
+            return arguments_str
+
+        try:
+            # Parse the JSON string
+            args_dict = json.loads(arguments_str)
+
+            # Recursively parse nested JSON strings
+            def _recursively_parse(obj: Any) -> Any:
+                if isinstance(obj, dict):
+                    return {k: _recursively_parse(v) for k, v in obj.items()}
+                elif isinstance(obj, list):
+                    return [_recursively_parse(item) for item in obj]
+                elif isinstance(obj, str):
+                    # Try to parse strings as JSON
+                    try:
+                        parsed = json.loads(obj)
+                        return _recursively_parse(parsed)
+                    except (json.JSONDecodeError, ValueError):
+                        # Not JSON, return as-is
+                        return obj
+                else:
+                    # Numbers, booleans, None - return as-is
+                    return obj
+
+            normalized_dict = _recursively_parse(args_dict)
+
+            # Re-encode to JSON string
+            return json.dumps(normalized_dict)
+        except (json.JSONDecodeError, ValueError):
+            # If parsing fails, return original
+            return arguments_str
diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
index e96e8b2630..bb3d3c2cb1 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -23,6 +23,7 @@ class ModelFeatures:
     supports_stop_words: bool
     supports_responses_api: bool
     force_string_serializer: bool
+    args_as_json_strings: bool
 
 
 # Pattern tables capturing current behavior. Keep patterns lowercase.
@@ -92,13 +93,17 @@ class ModelFeatures:
 FORCE_STRING_SERIALIZER_PATTERNS: list[str] = [
     "deepseek",
     "groq/kimi-k2-instruct",
+    "glm-4",
 ]
 
-# Models that don't support native tool calling
-# These models return tool call arguments with arrays/objects as JSON strings
+
+# Models that return tool call arguments with arrays/objects as JSON strings
 # instead of properly structured data (e.g., "[1, 100]" instead of [1, 100])
 # This causes Pydantic validation errors when parsing tool arguments
-NO_NATIVE_TOOL_CALLING_PATTERNS: list[str] = []
+# These models need recursive JSON string parsing in their arguments
+FUNCTION_ARGS_AS_JSON_STRINGS_PATTERNS: list[str] = [
+    "glm-4",
+]
 
 
 def get_features(model: str) -> ModelFeatures:
@@ -112,4 +117,7 @@ def get_features(model: str) -> ModelFeatures:
         ),
         supports_responses_api=model_matches(model, RESPONSES_API_PATTERNS),
         force_string_serializer=model_matches(model, FORCE_STRING_SERIALIZER_PATTERNS),
+        args_as_json_strings=model_matches(
+            model, FUNCTION_ARGS_AS_JSON_STRINGS_PATTERNS
+        ),
     )
diff --git a/tests/sdk/llm/test_response_normalizer.py b/tests/sdk/llm/test_response_normalizer.py
new file mode 100644
index 0000000000..cfee35da35
--- /dev/null
+++ b/tests/sdk/llm/test_response_normalizer.py
@@ -0,0 +1,229 @@
+"""Tests for ResponseNormalizerMixin."""
+
+import json
+from unittest.mock import MagicMock
+
+from litellm import ChatCompletionMessageToolCall
+from litellm.types.utils import (
+    Choices,
+    Function,
+    Message as LiteLLMMessage,
+    ModelResponse,
+)
+
+from openhands.sdk.llm.llm import LLM
+from openhands.sdk.llm.mixins.response_normalizer import ResponseNormalizerMixin
+
+
+class TestResponseNormalizerMixin:
+    """Test ResponseNormalizerMixin functionality."""
+
+    def test_llm_inherits_mixin(self):
+        """Verify LLM inherits from ResponseNormalizerMixin."""
+        assert issubclass(LLM, ResponseNormalizerMixin)
+
+    def test_mixin_methods_available(self):
+        """Verify mixin methods are available on LLM instances."""
+        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
+        assert hasattr(llm, "normalize_chat_completion_response")
+        assert hasattr(llm, "normalize_responses_api_response")
+        assert hasattr(llm, "_normalize_arguments_str")
+
+    def test_normalize_arguments_str_glm(self):
+        """Test normalization for GLM models."""
+        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
+
+        # Test array normalization
+        args_str = '{"view_range": "[1, 100]"}'
+        normalized = llm._normalize_arguments_str(args_str)
+        parsed = json.loads(normalized)
+        assert isinstance(parsed["view_range"], list)
+        assert parsed["view_range"] == [1, 100]
+
+        # Test nested object normalization
+        args_str = '{"nested": "{\\"key\\": \\"value\\"}"}'
+        normalized = llm._normalize_arguments_str(args_str)
+        parsed = json.loads(normalized)
+        assert isinstance(parsed["nested"], dict)
+        assert parsed["nested"]["key"] == "value"
+
+        # Test complex nested structure
+        args_str = '{"data": "{\\"items\\": \\"[1, 2, 3]\\"}"}'
+        normalized = llm._normalize_arguments_str(args_str)
+        parsed = json.loads(normalized)
+        assert isinstance(parsed["data"], dict)
+        assert isinstance(parsed["data"]["items"], list)
+        assert parsed["data"]["items"] == [1, 2, 3]
+
+    def test_normalize_arguments_str_non_glm(self):
+        """Test that non-GLM models pass through unchanged."""
+        llm = LLM(model="gpt-4")
+
+        args_str = '{"view_range": "[1, 100]"}'
+        normalized = llm._normalize_arguments_str(args_str)
+        # Should not change for non-GLM models
+        assert normalized == args_str
+
+    def test_normalize_arguments_str_invalid_json(self):
+        """Test handling of invalid JSON."""
+        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
+
+        # Invalid JSON should return original string
+        args_str = "not json"
+        normalized = llm._normalize_arguments_str(args_str)
+        assert normalized == args_str
+
+    def test_normalize_chat_completion_response_glm(self):
+        """Test normalizing Chat Completions API response for GLM."""
+        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
+
+        # Create a mock response with malformed tool call arguments
+        tool_call = ChatCompletionMessageToolCall(
+            id="call_123",
+            type="function",
+            function=Function(
+                name="str_replace_editor",
+                arguments='{"path": "/test.py", "view_range": "[1, 100]"}',
+            ),
+        )
+
+        message = LiteLLMMessage(role="assistant", content=None, tool_calls=[tool_call])
+
+        choice = Choices(finish_reason="tool_calls", index=0, message=message)
+
+        response = ModelResponse(
+            id="resp_123",
+            model="glm-4",
+            object="chat.completion",
+            created=1234567890,
+            choices=[choice],
+        )
+
+        # Normalize the response
+        normalized = llm.normalize_chat_completion_response(response)
+
+        # Check that tool call arguments are normalized
+        first_choice = normalized.choices[0]
+        assert isinstance(first_choice, Choices)
+        assert first_choice.message.tool_calls is not None
+        normalized_args = json.loads(
+            first_choice.message.tool_calls[0].function.arguments
+        )
+        assert isinstance(normalized_args["view_range"], list)
+        assert normalized_args["view_range"] == [1, 100]
+
+    def test_normalize_chat_completion_response_non_glm(self):
+        """Test that non-GLM models are not normalized."""
+        llm = LLM(model="gpt-4")
+
+        # Create a mock response
+        tool_call = ChatCompletionMessageToolCall(
+            id="call_123",
+            type="function",
+            function=Function(
+                name="test_function",
+                arguments='{"view_range": "[1, 100]"}',
+            ),
+        )
+
+        message = LiteLLMMessage(role="assistant", content=None, tool_calls=[tool_call])
+
+        choice = Choices(finish_reason="tool_calls", index=0, message=message)
+
+        response = ModelResponse(
+            id="resp_123",
+            model="gpt-4",
+            object="chat.completion",
+            created=1234567890,
+            choices=[choice],
+        )
+
+        # Normalize the response
+        normalized = llm.normalize_chat_completion_response(response)
+
+        # Arguments should remain unchanged for non-GLM models
+        first_choice = normalized.choices[0]
+        assert isinstance(first_choice, Choices)
+        assert first_choice.message.tool_calls is not None
+        assert (
+            first_choice.message.tool_calls[0].function.arguments
+            == '{"view_range": "[1, 100]"}'
+        )
+
+    def test_normalize_responses_api_response_glm(self):
+        """Test normalizing Responses API response for GLM."""
+        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
+
+        # Create a mock Responses API response
+        function_call_item = MagicMock()
+        function_call_item.type = "function_call"
+        function_call_item.arguments = '{"path": "/test.py", "view_range": "[1, 100]"}'
+
+        response = MagicMock()
+        response.output = [function_call_item]
+
+        # Normalize the response
+        normalized = llm.normalize_responses_api_response(response)
+
+        # Check that arguments are normalized
+        assert normalized.output is not None
+        assert len(normalized.output) > 0
+        first_item = normalized.output[0]
+        assert hasattr(first_item, "arguments")
+        arguments_str = getattr(first_item, "arguments")
+        assert arguments_str is not None
+        normalized_args = json.loads(arguments_str)
+        assert isinstance(normalized_args["view_range"], list)
+        assert normalized_args["view_range"] == [1, 100]
+
+    def test_mixin_is_reusable(self):
+        """Test that the mixin can be used by other classes."""
+
+        class CustomClass(ResponseNormalizerMixin):
+            def __init__(self, model: str):
+                self.model = model
+
+        # Create instance with GLM model
+        obj = CustomClass(model="litellm_proxy/openrouter/z-ai/glm-4.6")
+        args_str = '{"arr": "[1, 2, 3]"}'
+        normalized = obj._normalize_arguments_str(args_str)
+        parsed = json.loads(normalized)
+        assert isinstance(parsed["arr"], list)
+        assert parsed["arr"] == [1, 2, 3]
+
+        # Create instance with non-GLM model
+        obj_gpt = CustomClass(model="gpt-4")
+        normalized_gpt = obj_gpt._normalize_arguments_str(args_str)
+        assert normalized_gpt == args_str
+
+    def test_normalize_preserves_non_json_strings(self):
+        """Test that regular strings that aren't JSON are preserved."""
+        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
+
+        args_str = '{"message": "Hello, world!", "count": 42}'
+        normalized = llm._normalize_arguments_str(args_str)
+        parsed = json.loads(normalized)
+        assert parsed["message"] == "Hello, world!"
+        assert parsed["count"] == 42
+
+    def test_normalize_empty_tool_calls(self):
+        """Test normalizing response with no tool calls."""
+        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
+
+        message = LiteLLMMessage(role="assistant", content="Hello", tool_calls=None)
+
+        choice = Choices(finish_reason="stop", index=0, message=message)
+
+        response = ModelResponse(
+            id="resp_123",
+            model="glm-4",
+            object="chat.completion",
+            created=1234567890,
+            choices=[choice],
+        )
+
+        # Should not raise any errors
+        normalized = llm.normalize_chat_completion_response(response)
+        first_choice = normalized.choices[0]
+        assert isinstance(first_choice, Choices)
+        assert first_choice.message.content == "Hello"

From 9f630fb7891fa3cdf60c83878513b5c775ba38fb Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 4 Nov 2025 10:32:52 +0100
Subject: [PATCH 03/10] remove comment

---
 openhands-sdk/openhands/sdk/llm/llm.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py
index 3703c0e441..a63337d9d6 100644
--- a/openhands-sdk/openhands/sdk/llm/llm.py
+++ b/openhands-sdk/openhands/sdk/llm/llm.py
@@ -697,8 +697,6 @@ def _litellm_modify_params_ctx(self, flag: bool):
     # Capabilities, formatting, and info
     # =========================================================================
     def _init_model_info_and_caps(self) -> None:
-        # Check if this model doesn't support native tool calling
-
         # Try to get model info via openrouter or litellm proxy first
         tried = False
         try:

From 33d21fb2f1d6b73049d8acd92d4fd73036ad8955 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 4 Nov 2025 11:31:51 +0100
Subject: [PATCH 04/10] add pydantic tool call validation

---
 openhands-sdk/openhands/sdk/llm/llm.py        |   9 +-
 .../sdk/llm/mixins/response_normalizer.py     | 143 -----------
 .../openhands/sdk/llm/utils/model_features.py |  12 -
 openhands-sdk/openhands/sdk/tool/schema.py    |  75 +++++-
 tests/sdk/llm/test_response_normalizer.py     | 229 ------------------
 5 files changed, 74 insertions(+), 394 deletions(-)
 delete mode 100644 openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py
 delete mode 100644 tests/sdk/llm/test_response_normalizer.py

diff --git a/openhands-sdk/openhands/sdk/llm/llm.py b/openhands-sdk/openhands/sdk/llm/llm.py
index d3f5585089..7fea0e1335 100644
--- a/openhands-sdk/openhands/sdk/llm/llm.py
+++ b/openhands-sdk/openhands/sdk/llm/llm.py
@@ -70,7 +70,6 @@
     Message,
 )
 from openhands.sdk.llm.mixins.non_native_fc import NonNativeToolCallingMixin
-from openhands.sdk.llm.mixins.response_normalizer import ResponseNormalizerMixin
 from openhands.sdk.llm.options.chat_options import select_chat_options
 from openhands.sdk.llm.options.responses_options import select_responses_options
 from openhands.sdk.llm.utils.metrics import Metrics, MetricsSnapshot
@@ -101,7 +100,7 @@
 )
 
 
-class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin, ResponseNormalizerMixin):
+class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
     """Language model interface for OpenHands agents.
 
     The LLM class provides a unified interface for interacting with various
@@ -534,9 +533,6 @@ def _one_attempt(**retry_kwargs) -> ModelResponse:
         try:
             resp = _one_attempt()
 
-            # Normalize response for model-specific quirks (via mixin)
-            resp = self.normalize_chat_completion_response(resp)
-
             # Convert the first choice to an OpenHands Message
             first_choice = resp["choices"][0]
             message = Message.from_llm_chat_message(first_choice["message"])
@@ -659,9 +655,6 @@ def _one_attempt(**retry_kwargs) -> ResponsesAPIResponse:
         try:
             resp: ResponsesAPIResponse = _one_attempt()
 
-            # Normalize response for model-specific quirks (via mixin)
-            resp = self.normalize_responses_api_response(resp)
-
             # Parse output -> Message (typed)
             # Cast to a typed sequence
             # accepted by from_llm_responses_output
diff --git a/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py b/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py
deleted file mode 100644
index c371088020..0000000000
--- a/openhands-sdk/openhands/sdk/llm/mixins/response_normalizer.py
+++ /dev/null
@@ -1,143 +0,0 @@
-"""Mixin for normalizing LLM responses with model-specific quirks.
-
-This handles models that support native function calling but return malformed
-tool call arguments (e.g., arrays/objects encoded as JSON strings).
-"""
-
-from __future__ import annotations
-
-import json
-from typing import TYPE_CHECKING, Any, Protocol
-
-from litellm.types.utils import Choices
-
-
-if TYPE_CHECKING:
-    from litellm.types.llms.openai import ResponsesAPIResponse
-    from litellm.types.utils import ModelResponse
-
-
-class _HostSupports(Protocol):
-    """Protocol defining what the host class must provide."""
-
-    model: str
-
-    def _normalize_arguments_str(self, arguments_str: str) -> str:
-        """Normalize tool call arguments."""
-        ...
-
-
-class ResponseNormalizerMixin:
-    """Mixin for normalizing tool call arguments in LLM responses.
-
-    Some LLMs return tool call arguments with nested data structures
-    (arrays, objects) encoded as JSON strings instead of proper structures.
-    For example: {"view_range": "[1, 100]"} instead of {"view_range": [1, 100]}
-
-    This mixin provides methods to detect and fix these quirks based on model features.
-
-    Host requirements:
-    - self.model: str - The model identifier
-    """
-
-    def normalize_chat_completion_response(
-        self: _HostSupports, response: ModelResponse
-    ) -> ModelResponse:
-        """Normalize tool call arguments in a Chat Completions API response.
-
-        Modifies the response in-place if the model has known quirks.
-
-        Args:
-            response: The ModelResponse from litellm.completion()
-
-        Returns:
-            The same response object (modified in-place)
-        """
-        # Normalize tool call arguments in each choice's message
-        for choice in response.choices:
-            # Type guard: only handle non-streaming Choices
-            if not isinstance(choice, Choices):
-                continue
-            if choice.message and choice.message.tool_calls:
-                for tool_call in choice.message.tool_calls:
-                    if tool_call.function and tool_call.function.arguments:
-                        tool_call.function.arguments = self._normalize_arguments_str(
-                            tool_call.function.arguments
-                        )
-
-        return response
-
-    def normalize_responses_api_response(
-        self: _HostSupports, response: ResponsesAPIResponse
-    ) -> ResponsesAPIResponse:
-        """Normalize tool call arguments in a Responses API response.
-
-        Modifies the response in-place if the model has known quirks.
-
-        Args:
-            response: The ResponsesAPIResponse from litellm.responses()
-
-        Returns:
-            The same response object (modified in-place)
-        """
-        # Normalize tool call arguments in output items
-        if response.output:
-            for item in response.output:
-                # Check if it's a function_call item with arguments
-                if (
-                    hasattr(item, "type")
-                    and getattr(item, "type", None) == "function_call"
-                    and hasattr(item, "arguments")
-                ):
-                    args = getattr(item, "arguments", None)
-                    if args:
-                        setattr(item, "arguments", self._normalize_arguments_str(args))
-
-        return response
-
-    def _normalize_arguments_str(self: _HostSupports, arguments_str: str) -> str:
-        """Recursively parse nested JSON strings in tool call arguments.
-
-        This fixes arguments where nested structures are encoded as JSON strings.
-
-        Args:
-            arguments_str: The raw arguments JSON string from the LLM
-
-        Returns:
-            Normalized arguments JSON string with proper data structures
-        """
-        from openhands.sdk.llm.utils.model_features import get_features
-
-        # Only normalize if model has this quirk
-        if not get_features(self.model).args_as_json_strings:
-            return arguments_str
-
-        try:
-            # Parse the JSON string
-            args_dict = json.loads(arguments_str)
-
-            # Recursively parse nested JSON strings
-            def _recursively_parse(obj: Any) -> Any:
-                if isinstance(obj, dict):
-                    return {k: _recursively_parse(v) for k, v in obj.items()}
-                elif isinstance(obj, list):
-                    return [_recursively_parse(item) for item in obj]
-                elif isinstance(obj, str):
-                    # Try to parse strings as JSON
-                    try:
-                        parsed = json.loads(obj)
-                        return _recursively_parse(parsed)
-                    except (json.JSONDecodeError, ValueError):
-                        # Not JSON, return as-is
-                        return obj
-                else:
-                    # Numbers, booleans, None - return as-is
-                    return obj
-
-            normalized_dict = _recursively_parse(args_dict)
-
-            # Re-encode to JSON string
-            return json.dumps(normalized_dict)
-        except (json.JSONDecodeError, ValueError):
-            # If parsing fails, return original
-            return arguments_str
diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
index b6e0fb7142..2ee16e37b2 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -23,7 +23,6 @@ class ModelFeatures:
     supports_stop_words: bool
     supports_responses_api: bool
     force_string_serializer: bool
-    args_as_json_strings: bool
 
 
 # Pattern tables capturing current behavior. Keep patterns lowercase.
@@ -99,14 +98,6 @@ class ModelFeatures:
     "groq/kimi-k2-instruct",  # explicit provider-prefixed IDs
 ]
 
-# Models that return tool call arguments with arrays/objects as JSON strings
-# instead of properly structured data (e.g., "[1, 100]" instead of [1, 100])
-# This causes Pydantic validation errors when parsing tool arguments
-# These models need recursive JSON string parsing in their arguments
-FUNCTION_ARGS_AS_JSON_STRINGS_PATTERNS: list[str] = [
-    "glm-4",  # e.g., GLM-4.5 / GLM-4.6
-]
-
 
 def get_features(model: str) -> ModelFeatures:
     """Get model features."""
@@ -119,7 +110,4 @@ def get_features(model: str) -> ModelFeatures:
         ),
         supports_responses_api=model_matches(model, RESPONSES_API_PATTERNS),
         force_string_serializer=model_matches(model, FORCE_STRING_SERIALIZER_PATTERNS),
-        args_as_json_strings=model_matches(
-            model, FUNCTION_ARGS_AS_JSON_STRINGS_PATTERNS
-        ),
     )
diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 1c041aa659..09c2911825 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -1,8 +1,10 @@
+import json
+import types
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
-from typing import Any, ClassVar, TypeVar
+from typing import Any, ClassVar, TypeVar, Union, get_args, get_origin
 
-from pydantic import ConfigDict, Field, create_model
+from pydantic import ConfigDict, Field, create_model, model_validator
 from rich.text import Text
 
 from openhands.sdk.llm import ImageContent, TextContent
@@ -100,6 +102,75 @@ class Schema(DiscriminatedUnionMixin):
 
     model_config: ClassVar[ConfigDict] = ConfigDict(extra="forbid", frozen=True)
 
+    @model_validator(mode="before")
+    @classmethod
+    def _decode_json_strings(cls, data: Any) -> Any:
+        """Pre-validator that automatically decodes JSON strings for list/dict fields.
+
+        This validator runs before field validation and checks if any field that
+        expects a list or dict type has received a JSON string instead. If so,
+        it automatically decodes the string using json.loads().
+
+        This handles cases where LLMs (like GLM-4) return array/object values
+        as JSON strings instead of native JSON arrays/objects.
+
+        Args:
+            data: The input data (usually a dict) before validation.
+
+        Returns:
+            The data with JSON strings decoded where appropriate.
+        """
+        if not isinstance(data, dict):
+            return data
+
+        # Get the model's field annotations
+        annotations = getattr(cls, "__annotations__", {})
+
+        for field_name, value in list(data.items()):
+            # Skip if value is not a string
+            if not isinstance(value, str):
+                continue
+
+            # Get the expected type for this field
+            if field_name not in annotations:
+                continue
+
+            expected_type = annotations[field_name]
+
+            # Check if the expected type is a list or dict
+            # Handle Union types (e.g., list | None, Optional[list])
+            origin = get_origin(expected_type)
+
+            # Check for Union types (handles both typing.Union and | syntax)
+            is_union = origin is Union
+            # Python 3.10+ has types.UnionType for | syntax
+            if hasattr(types, "UnionType") and origin is types.UnionType:
+                is_union = True
+
+            if is_union:
+                # For Union types, check all args
+                type_args = get_args(expected_type)
+                expected_origins = [get_origin(arg) or arg for arg in type_args]
+            else:
+                expected_origins = [origin or expected_type]
+
+            # Check if any of the expected types is list or dict
+            expects_list_or_dict = any(exp in (list, dict) for exp in expected_origins)
+
+            if expects_list_or_dict:
+                # Try to parse the string as JSON
+                try:
+                    parsed_value = json.loads(value)
+                    # Only replace if we got a list or dict
+                    if isinstance(parsed_value, (list, dict)):
+                        data[field_name] = parsed_value
+                except (json.JSONDecodeError, ValueError):
+                    # If parsing fails, leave the original value
+                    # Pydantic will raise validation error if needed
+                    pass
+
+        return data
+
     @classmethod
     def to_mcp_schema(cls) -> dict[str, Any]:
         """Convert to JSON schema format compatible with MCP."""
diff --git a/tests/sdk/llm/test_response_normalizer.py b/tests/sdk/llm/test_response_normalizer.py
deleted file mode 100644
index cfee35da35..0000000000
--- a/tests/sdk/llm/test_response_normalizer.py
+++ /dev/null
@@ -1,229 +0,0 @@
-"""Tests for ResponseNormalizerMixin."""
-
-import json
-from unittest.mock import MagicMock
-
-from litellm import ChatCompletionMessageToolCall
-from litellm.types.utils import (
-    Choices,
-    Function,
-    Message as LiteLLMMessage,
-    ModelResponse,
-)
-
-from openhands.sdk.llm.llm import LLM
-from openhands.sdk.llm.mixins.response_normalizer import ResponseNormalizerMixin
-
-
-class TestResponseNormalizerMixin:
-    """Test ResponseNormalizerMixin functionality."""
-
-    def test_llm_inherits_mixin(self):
-        """Verify LLM inherits from ResponseNormalizerMixin."""
-        assert issubclass(LLM, ResponseNormalizerMixin)
-
-    def test_mixin_methods_available(self):
-        """Verify mixin methods are available on LLM instances."""
-        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
-        assert hasattr(llm, "normalize_chat_completion_response")
-        assert hasattr(llm, "normalize_responses_api_response")
-        assert hasattr(llm, "_normalize_arguments_str")
-
-    def test_normalize_arguments_str_glm(self):
-        """Test normalization for GLM models."""
-        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
-
-        # Test array normalization
-        args_str = '{"view_range": "[1, 100]"}'
-        normalized = llm._normalize_arguments_str(args_str)
-        parsed = json.loads(normalized)
-        assert isinstance(parsed["view_range"], list)
-        assert parsed["view_range"] == [1, 100]
-
-        # Test nested object normalization
-        args_str = '{"nested": "{\\"key\\": \\"value\\"}"}'
-        normalized = llm._normalize_arguments_str(args_str)
-        parsed = json.loads(normalized)
-        assert isinstance(parsed["nested"], dict)
-        assert parsed["nested"]["key"] == "value"
-
-        # Test complex nested structure
-        args_str = '{"data": "{\\"items\\": \\"[1, 2, 3]\\"}"}'
-        normalized = llm._normalize_arguments_str(args_str)
-        parsed = json.loads(normalized)
-        assert isinstance(parsed["data"], dict)
-        assert isinstance(parsed["data"]["items"], list)
-        assert parsed["data"]["items"] == [1, 2, 3]
-
-    def test_normalize_arguments_str_non_glm(self):
-        """Test that non-GLM models pass through unchanged."""
-        llm = LLM(model="gpt-4")
-
-        args_str = '{"view_range": "[1, 100]"}'
-        normalized = llm._normalize_arguments_str(args_str)
-        # Should not change for non-GLM models
-        assert normalized == args_str
-
-    def test_normalize_arguments_str_invalid_json(self):
-        """Test handling of invalid JSON."""
-        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
-
-        # Invalid JSON should return original string
-        args_str = "not json"
-        normalized = llm._normalize_arguments_str(args_str)
-        assert normalized == args_str
-
-    def test_normalize_chat_completion_response_glm(self):
-        """Test normalizing Chat Completions API response for GLM."""
-        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
-
-        # Create a mock response with malformed tool call arguments
-        tool_call = ChatCompletionMessageToolCall(
-            id="call_123",
-            type="function",
-            function=Function(
-                name="str_replace_editor",
-                arguments='{"path": "/test.py", "view_range": "[1, 100]"}',
-            ),
-        )
-
-        message = LiteLLMMessage(role="assistant", content=None, tool_calls=[tool_call])
-
-        choice = Choices(finish_reason="tool_calls", index=0, message=message)
-
-        response = ModelResponse(
-            id="resp_123",
-            model="glm-4",
-            object="chat.completion",
-            created=1234567890,
-            choices=[choice],
-        )
-
-        # Normalize the response
-        normalized = llm.normalize_chat_completion_response(response)
-
-        # Check that tool call arguments are normalized
-        first_choice = normalized.choices[0]
-        assert isinstance(first_choice, Choices)
-        assert first_choice.message.tool_calls is not None
-        normalized_args = json.loads(
-            first_choice.message.tool_calls[0].function.arguments
-        )
-        assert isinstance(normalized_args["view_range"], list)
-        assert normalized_args["view_range"] == [1, 100]
-
-    def test_normalize_chat_completion_response_non_glm(self):
-        """Test that non-GLM models are not normalized."""
-        llm = LLM(model="gpt-4")
-
-        # Create a mock response
-        tool_call = ChatCompletionMessageToolCall(
-            id="call_123",
-            type="function",
-            function=Function(
-                name="test_function",
-                arguments='{"view_range": "[1, 100]"}',
-            ),
-        )
-
-        message = LiteLLMMessage(role="assistant", content=None, tool_calls=[tool_call])
-
-        choice = Choices(finish_reason="tool_calls", index=0, message=message)
-
-        response = ModelResponse(
-            id="resp_123",
-            model="gpt-4",
-            object="chat.completion",
-            created=1234567890,
-            choices=[choice],
-        )
-
-        # Normalize the response
-        normalized = llm.normalize_chat_completion_response(response)
-
-        # Arguments should remain unchanged for non-GLM models
-        first_choice = normalized.choices[0]
-        assert isinstance(first_choice, Choices)
-        assert first_choice.message.tool_calls is not None
-        assert (
-            first_choice.message.tool_calls[0].function.arguments
-            == '{"view_range": "[1, 100]"}'
-        )
-
-    def test_normalize_responses_api_response_glm(self):
-        """Test normalizing Responses API response for GLM."""
-        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
-
-        # Create a mock Responses API response
-        function_call_item = MagicMock()
-        function_call_item.type = "function_call"
-        function_call_item.arguments = '{"path": "/test.py", "view_range": "[1, 100]"}'
-
-        response = MagicMock()
-        response.output = [function_call_item]
-
-        # Normalize the response
-        normalized = llm.normalize_responses_api_response(response)
-
-        # Check that arguments are normalized
-        assert normalized.output is not None
-        assert len(normalized.output) > 0
-        first_item = normalized.output[0]
-        assert hasattr(first_item, "arguments")
-        arguments_str = getattr(first_item, "arguments")
-        assert arguments_str is not None
-        normalized_args = json.loads(arguments_str)
-        assert isinstance(normalized_args["view_range"], list)
-        assert normalized_args["view_range"] == [1, 100]
-
-    def test_mixin_is_reusable(self):
-        """Test that the mixin can be used by other classes."""
-
-        class CustomClass(ResponseNormalizerMixin):
-            def __init__(self, model: str):
-                self.model = model
-
-        # Create instance with GLM model
-        obj = CustomClass(model="litellm_proxy/openrouter/z-ai/glm-4.6")
-        args_str = '{"arr": "[1, 2, 3]"}'
-        normalized = obj._normalize_arguments_str(args_str)
-        parsed = json.loads(normalized)
-        assert isinstance(parsed["arr"], list)
-        assert parsed["arr"] == [1, 2, 3]
-
-        # Create instance with non-GLM model
-        obj_gpt = CustomClass(model="gpt-4")
-        normalized_gpt = obj_gpt._normalize_arguments_str(args_str)
-        assert normalized_gpt == args_str
-
-    def test_normalize_preserves_non_json_strings(self):
-        """Test that regular strings that aren't JSON are preserved."""
-        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
-
-        args_str = '{"message": "Hello, world!", "count": 42}'
-        normalized = llm._normalize_arguments_str(args_str)
-        parsed = json.loads(normalized)
-        assert parsed["message"] == "Hello, world!"
-        assert parsed["count"] == 42
-
-    def test_normalize_empty_tool_calls(self):
-        """Test normalizing response with no tool calls."""
-        llm = LLM(model="litellm_proxy/openrouter/z-ai/glm-4.6")
-
-        message = LiteLLMMessage(role="assistant", content="Hello", tool_calls=None)
-
-        choice = Choices(finish_reason="stop", index=0, message=message)
-
-        response = ModelResponse(
-            id="resp_123",
-            model="glm-4",
-            object="chat.completion",
-            created=1234567890,
-            choices=[choice],
-        )
-
-        # Should not raise any errors
-        normalized = llm.normalize_chat_completion_response(response)
-        first_choice = normalized.choices[0]
-        assert isinstance(first_choice, Choices)
-        assert first_choice.message.content == "Hello"

From 42dafb10702a7684e920186d45050aca1b93b980 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 10:41:40 +0000
Subject: [PATCH 05/10] Refactor _decode_json_strings to eliminate special-case
 union handling

- Remove fragile Union type detection that checked typing.Union and types.UnionType separately
- Use get_args() uniformly for both union and simple types, eliminating branching logic
- Consolidate early exit conditions into single check for better readability
- Remove unnecessary list() copy when iterating over data.items()
- Remove unused imports: types module and Union from typing
- Improve code comments to clarify json.loads() return types
- Reduce function from 46 lines to 34 lines (-26%) while maintaining functionality

This addresses code review feedback about treating normal cases as special cases
and eliminates brittleness around Python version differences in union handling.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/schema.py | 39 ++++++++--------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 09c2911825..c3e49b3733 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -1,8 +1,7 @@
 import json
-import types
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
-from typing import Any, ClassVar, TypeVar, Union, get_args, get_origin
+from typing import Any, ClassVar, TypeVar, get_args, get_origin
 
 from pydantic import ConfigDict, Field, create_model, model_validator
 from rich.text import Text
@@ -126,42 +125,30 @@ def _decode_json_strings(cls, data: Any) -> Any:
         # Get the model's field annotations
         annotations = getattr(cls, "__annotations__", {})
 
-        for field_name, value in list(data.items()):
-            # Skip if value is not a string
-            if not isinstance(value, str):
-                continue
-
-            # Get the expected type for this field
-            if field_name not in annotations:
+        for field_name, value in data.items():
+            # Skip if value is not a string or field not in annotations
+            if not isinstance(value, str) or field_name not in annotations:
                 continue
 
             expected_type = annotations[field_name]
 
-            # Check if the expected type is a list or dict
-            # Handle Union types (e.g., list | None, Optional[list])
-            origin = get_origin(expected_type)
-
-            # Check for Union types (handles both typing.Union and | syntax)
-            is_union = origin is Union
-            # Python 3.10+ has types.UnionType for | syntax
-            if hasattr(types, "UnionType") and origin is types.UnionType:
-                is_union = True
-
-            if is_union:
-                # For Union types, check all args
-                type_args = get_args(expected_type)
+            # Get all possible types (handles unions and simple types uniformly)
+            type_args = get_args(expected_type)
+            if type_args:
+                # Union or generic type - extract origins from all args
                 expected_origins = [get_origin(arg) or arg for arg in type_args]
             else:
+                # Simple type - use origin or the type itself
+                origin = get_origin(expected_type)
                 expected_origins = [origin or expected_type]
 
             # Check if any of the expected types is list or dict
-            expects_list_or_dict = any(exp in (list, dict) for exp in expected_origins)
-
-            if expects_list_or_dict:
+            if any(exp in (list, dict) for exp in expected_origins):
                 # Try to parse the string as JSON
                 try:
                     parsed_value = json.loads(value)
-                    # Only replace if we got a list or dict
+                    # json.loads() returns dict, list, str, int, float, bool, or None
+                    # Only use parsed value if it matches expected collection types
                     if isinstance(parsed_value, (list, dict)):
                         data[field_name] = parsed_value
                 except (json.JSONDecodeError, ValueError):

From 61a5ecb583d5493b9dfa3d1df8fb577b1c4c5d2d Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 11:01:49 +0000
Subject: [PATCH 06/10] Fix _decode_json_strings to handle Annotated types and
 field aliases correctly

This commit addresses two critical issues identified in code review:

1. **Annotated type handling**: The validator now properly unwraps Annotated types
   before checking for list/dict, ensuring fields like `Annotated[list[str], Field(...)]`
   are correctly decoded from JSON strings.

2. **Field alias support**: Changed from using __annotations__ to model_fields,
   which properly handles field aliases and inherited fields. The validator now
   checks both field names and their aliases when looking up values in the input data.

3. **Union type detection fix**: Fixed a critical bug where we were checking the
   origin of type arguments instead of the type itself. For example, with `list[str]`,
   we were incorrectly extracting `str` instead of `list`.

Changes:
- Import Annotated, Union, and types module
- Use cls.model_fields instead of cls.__annotations__
- Add logic to unwrap Annotated types before type checking
- Fix Union type detection to check the type's origin, not its args
- Properly handle both old-style Union and new-style (|) union syntax

The validator now correctly handles:
- `Annotated[list[str], Field(description='...')]`
- Fields with aliases: `my_list: list[int] = Field(alias='myList')`
- Union types: `list[str] | None`
- Inherited fields from parent classes

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/schema.py | 42 ++++++++++++++--------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index c3e49b3733..6333e48f74 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -1,7 +1,8 @@
 import json
+import types
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
-from typing import Any, ClassVar, TypeVar, get_args, get_origin
+from typing import Annotated, Any, ClassVar, TypeVar, Union, get_args, get_origin
 
 from pydantic import ConfigDict, Field, create_model, model_validator
 from rich.text import Text
@@ -122,24 +123,37 @@ def _decode_json_strings(cls, data: Any) -> Any:
         if not isinstance(data, dict):
             return data
 
-        # Get the model's field annotations
-        annotations = getattr(cls, "__annotations__", {})
+        # Use model_fields to properly handle aliases and inherited fields
+        for field_name, field_info in cls.model_fields.items():
+            # Check both the field name and its alias (if any)
+            data_key = field_info.alias if field_info.alias else field_name
+            if data_key not in data:
+                continue
 
-        for field_name, value in data.items():
-            # Skip if value is not a string or field not in annotations
-            if not isinstance(value, str) or field_name not in annotations:
+            value = data[data_key]
+            # Skip if value is not a string
+            if not isinstance(value, str):
                 continue
 
-            expected_type = annotations[field_name]
+            expected_type = field_info.annotation
+
+            # Unwrap Annotated types - only the first arg is the actual type
+            if get_origin(expected_type) is Annotated:
+                type_args = get_args(expected_type)
+                expected_type = type_args[0] if type_args else expected_type
+
+            # Get the origin of the expected type (e.g., list from list[str])
+            origin = get_origin(expected_type)
 
-            # Get all possible types (handles unions and simple types uniformly)
-            type_args = get_args(expected_type)
-            if type_args:
-                # Union or generic type - extract origins from all args
+            # For Union types, we need to check all union members
+            if origin is Union or (
+                hasattr(types, "UnionType") and origin is types.UnionType
+            ):
+                # For Union types, check each union member
+                type_args = get_args(expected_type)
                 expected_origins = [get_origin(arg) or arg for arg in type_args]
             else:
-                # Simple type - use origin or the type itself
-                origin = get_origin(expected_type)
+                # For non-Union types, just check the origin
                 expected_origins = [origin or expected_type]
 
             # Check if any of the expected types is list or dict
@@ -150,7 +164,7 @@ def _decode_json_strings(cls, data: Any) -> Any:
                     # json.loads() returns dict, list, str, int, float, bool, or None
                     # Only use parsed value if it matches expected collection types
                     if isinstance(parsed_value, (list, dict)):
-                        data[field_name] = parsed_value
+                        data[data_key] = parsed_value
                 except (json.JSONDecodeError, ValueError):
                     # If parsing fails, leave the original value
                     # Pydantic will raise validation error if needed

From dd615968112376bcd93048a77d2455ecd932abca Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 4 Nov 2025 12:10:32 +0100
Subject: [PATCH 07/10] improve doc

---
 openhands-sdk/openhands/sdk/tool/schema.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 6333e48f74..c4122431bd 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -112,7 +112,9 @@ def _decode_json_strings(cls, data: Any) -> Any:
         it automatically decodes the string using json.loads().
 
         This handles cases where LLMs (like GLM-4) return array/object values
-        as JSON strings instead of native JSON arrays/objects.
+        as JSON strings instead of native JSON arrays/objects i.e.
+        <parameter=view_range>"[1, 100]"</parameter> instead of
+        <parameter=view_range>[1, 100]</parameter>.
 
         Args:
             data: The input data (usually a dict) before validation.

From a5feb9d3d347525447218823f1049cf58a5d6937 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 13:01:03 +0000
Subject: [PATCH 08/10] Add comprehensive unit tests for JSON string decoding
 in Schema validator

This commit adds 16 unit tests to verify the _decode_json_strings validator
works correctly for all edge cases:

**Basic functionality:**
- JSON strings decoded to native lists/dicts
- Native lists/dicts pass through unchanged
- Regular string fields not affected by JSON decoding

**Advanced type handling:**
- Annotated types (e.g., Annotated[list[str], Field(...)])
- Field aliases (e.g., Field(alias='myList'))
- Optional/Union types (e.g., list[str] | None)
- Nested structures (e.g., list[list[int]])

**Error handling:**
- Invalid JSON strings rejected with ValidationError
- JSON strings with wrong types rejected
- Empty collections handled correctly

**Edge cases:**
- Unicode characters in JSON strings
- Extra whitespace in JSON strings
- Mixed native and JSON string values in same model

All tests pass and validate that the validator correctly handles both
JSON-encoded strings from models like GLM-4 and regular native objects
from other models.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/sdk/tool/test_schema_json_decoding.py | 257 ++++++++++++++++++++
 1 file changed, 257 insertions(+)
 create mode 100644 tests/sdk/tool/test_schema_json_decoding.py

diff --git a/tests/sdk/tool/test_schema_json_decoding.py b/tests/sdk/tool/test_schema_json_decoding.py
new file mode 100644
index 0000000000..5c9a246ed2
--- /dev/null
+++ b/tests/sdk/tool/test_schema_json_decoding.py
@@ -0,0 +1,257 @@
+"""Tests for JSON string decoding in Schema validator.
+
+This module tests the _decode_json_strings validator that automatically
+decodes JSON strings for list/dict fields. This handles cases where LLMs
+(like GLM-4) return array/object values as JSON strings instead of native
+JSON arrays/objects.
+"""
+
+from typing import Annotated
+
+import pytest
+from pydantic import Field, ValidationError
+
+from openhands.sdk.tool.schema import Action
+
+
+class JsonDecodingTestAction(Action):
+    """Test action with list and dict fields."""
+
+    items: list[str] = Field(description="A list of items")
+    config: dict[str, int] = Field(description="Configuration dictionary")
+    name: str = Field(description="A regular string field")
+
+
+class JsonDecodingAnnotatedAction(Action):
+    """Test action with Annotated types."""
+
+    items: Annotated[list[str], Field(description="A list of items")]
+    config: Annotated[dict[str, int], Field(description="Configuration dictionary")]
+
+
+class JsonDecodingAliasAction(Action):
+    """Test action with field aliases."""
+
+    my_list: list[int] = Field(alias="myList", description="A list with alias")
+    my_dict: dict[str, str] = Field(alias="myDict", description="A dict with alias")
+
+
+class JsonDecodingOptionalAction(Action):
+    """Test action with optional list/dict fields."""
+
+    items: list[str] | None = Field(default=None, description="Optional list")
+    config: dict[str, int] | None = Field(default=None, description="Optional dict")
+
+
+def test_decode_json_string_list():
+    """Test that JSON string lists are decoded to native lists."""
+    data = {
+        "items": '["a", "b", "c"]',
+        "config": '{"x": 1, "y": 2}',
+        "name": "test",
+    }
+    action = JsonDecodingTestAction.model_validate(data)
+
+    assert action.items == ["a", "b", "c"]
+    assert action.config == {"x": 1, "y": 2}
+    assert action.name == "test"
+
+
+def test_decode_json_string_dict():
+    """Test that JSON string dicts are decoded to native dicts."""
+    data = {
+        "items": '["item1", "item2"]',
+        "config": '{"key1": 10, "key2": 20}',
+        "name": "dict_test",
+    }
+    action = JsonDecodingTestAction.model_validate(data)
+
+    assert action.items == ["item1", "item2"]
+    assert action.config == {"key1": 10, "key2": 20}
+    assert action.name == "dict_test"
+
+
+def test_native_list_dict_passthrough():
+    """Test that native lists and dicts pass through unchanged."""
+    data = {
+        "items": ["direct", "list"],
+        "config": {"direct": 42},
+        "name": "native_test",
+    }
+    action = JsonDecodingTestAction.model_validate(data)
+
+    assert action.items == ["direct", "list"]
+    assert action.config == {"direct": 42}
+    assert action.name == "native_test"
+
+
+def test_regular_string_not_decoded():
+    """Test that regular string fields are not affected by JSON decoding."""
+    data = {
+        "items": "[]",
+        "config": "{}",
+        "name": "this is not json but a regular string",
+    }
+    action = JsonDecodingTestAction.model_validate(data)
+
+    assert action.items == []
+    assert action.config == {}
+    # Regular string field should NOT be decoded
+    assert action.name == "this is not json but a regular string"
+
+
+def test_annotated_types():
+    """Test that Annotated types are properly handled."""
+    data = {
+        "items": '["x", "y", "z"]',
+        "config": '{"a": 1, "b": 2}',
+    }
+    action = JsonDecodingAnnotatedAction.model_validate(data)
+
+    assert action.items == ["x", "y", "z"]
+    assert action.config == {"a": 1, "b": 2}
+
+
+def test_field_aliases():
+    """Test that field aliases are properly handled."""
+    data = {
+        "myList": "[1, 2, 3]",
+        "myDict": '{"key": "value"}',
+    }
+    action = JsonDecodingAliasAction.model_validate(data)
+
+    assert action.my_list == [1, 2, 3]
+    assert action.my_dict == {"key": "value"}
+
+
+def test_optional_fields_with_json_strings():
+    """Test that optional list/dict fields work with JSON strings."""
+    data = {
+        "items": '["opt1", "opt2"]',
+        "config": '{"opt": 99}',
+    }
+    action = JsonDecodingOptionalAction.model_validate(data)
+
+    assert action.items == ["opt1", "opt2"]
+    assert action.config == {"opt": 99}
+
+
+def test_optional_fields_with_none():
+    """Test that optional fields can be None."""
+    data = {}
+    action = JsonDecodingOptionalAction.model_validate(data)
+
+    assert action.items is None
+    assert action.config is None
+
+
+def test_optional_fields_with_native_values():
+    """Test that optional fields work with native values."""
+    data = {
+        "items": ["native1", "native2"],
+        "config": {"native": 100},
+    }
+    action = JsonDecodingOptionalAction.model_validate(data)
+
+    assert action.items == ["native1", "native2"]
+    assert action.config == {"native": 100}
+
+
+def test_invalid_json_string_rejected():
+    """Test that invalid JSON strings are rejected with validation error."""
+    data = {
+        "items": "not valid json",
+        "config": "{}",
+        "name": "test",
+    }
+
+    with pytest.raises(ValidationError) as exc_info:
+        JsonDecodingTestAction.model_validate(data)
+
+    # Should fail validation because "not valid json" can't be parsed as list
+    assert "items" in str(exc_info.value)
+
+
+def test_json_string_with_wrong_type_rejected():
+    """Test that JSON strings with wrong types are rejected."""
+    # Field expects list but JSON string contains dict
+    data = {
+        "items": '{"not": "a list"}',
+        "config": "{}",
+        "name": "test",
+    }
+
+    with pytest.raises(ValidationError) as exc_info:
+        JsonDecodingTestAction.model_validate(data)
+
+    assert "items" in str(exc_info.value)
+
+
+def test_nested_structures():
+    """Test that nested lists and dicts in JSON strings work."""
+
+    class NestedAction(Action):
+        nested_list: list[list[int]] = Field(description="Nested list")
+        nested_dict: dict[str, dict[str, str]] = Field(description="Nested dict")
+
+    data = {
+        "nested_list": "[[1, 2], [3, 4]]",
+        "nested_dict": '{"outer": {"inner": "value"}}',
+    }
+    action = NestedAction.model_validate(data)
+
+    assert action.nested_list == [[1, 2], [3, 4]]
+    assert action.nested_dict == {"outer": {"inner": "value"}}
+
+
+def test_empty_collections():
+    """Test that empty lists and dicts work."""
+    data = {
+        "items": "[]",
+        "config": "{}",
+        "name": "empty",
+    }
+    action = JsonDecodingTestAction.model_validate(data)
+
+    assert action.items == []
+    assert action.config == {}
+
+
+def test_mixed_native_and_json_strings():
+    """Test mixing native values and JSON strings in same model."""
+    data = {
+        "items": ["native", "list"],  # Native list
+        "config": '{"from": 1, "json": 2}',  # JSON string
+        "name": "mixed",
+    }
+    action = JsonDecodingTestAction.model_validate(data)
+
+    assert action.items == ["native", "list"]
+    assert action.config == {"from": 1, "json": 2}
+    assert action.name == "mixed"
+
+
+def test_unicode_in_json_strings():
+    """Test that unicode characters in JSON strings are handled correctly."""
+    data = {
+        "items": '["hello", "世界", "🌍"]',
+        "config": '{"greeting": 1, "你好": 2}',
+        "name": "unicode",
+    }
+    action = JsonDecodingTestAction.model_validate(data)
+
+    assert action.items == ["hello", "世界", "🌍"]
+    assert action.config == {"greeting": 1, "你好": 2}
+
+
+def test_whitespace_in_json_strings():
+    """Test that JSON strings with extra whitespace work."""
+    data = {
+        "items": '  [ "a" , "b" , "c" ]  ',
+        "config": '  { "x" : 1 , "y" : 2 }  ',
+        "name": "whitespace",
+    }
+    action = JsonDecodingTestAction.model_validate(data)
+
+    assert action.items == ["a", "b", "c"]
+    assert action.config == {"x": 1, "y": 2}

From f934f1ec713de36964ba69d6d01d29a2b34b828c Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 4 Nov 2025 17:06:39 +0100
Subject: [PATCH 09/10] update doc

---
 openhands-sdk/openhands/sdk/tool/schema.py | 30 +++++++++++++++++++---
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index c4122431bd..20df1dda47 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -111,10 +111,32 @@ def _decode_json_strings(cls, data: Any) -> Any:
         expects a list or dict type has received a JSON string instead. If so,
         it automatically decodes the string using json.loads().
 
-        This handles cases where LLMs (like GLM-4) return array/object values
-        as JSON strings instead of native JSON arrays/objects i.e.
-        <parameter=view_range>"[1, 100]"</parameter> instead of
-        <parameter=view_range>[1, 100]</parameter>.
+        This handles cases where certain LLMs (such as GLM 4.6) incorrectly encode
+        array/object parameters as JSON strings when using native function calling.
+
+        Example raw LLM output from GLM 4.6:
+        {
+            "role": "assistant",
+            "content": "I'll view the file for you.",
+            "tool_calls": [{
+            "id": "call_ef8e",
+            "type": "function",
+            "function": {
+                "name": "str_replace_editor",
+                "arguments": '{
+                    "command": "view",
+                    "path": "/tmp/test.txt",
+                    "view_range": "[1, 5]"
+                }'
+            }
+            }]
+        }
+        Expected output: `"view_range" : [1, 5]`
+
+        Note: The arguments field is a JSON string. When decoded, view_range is
+        incorrectly a string "[1, 5]" instead of the proper array [1, 5].
+        This validator automatically fixes this by detecting that view_range
+        expects a list type and decoding the JSON string to get the actual array.
 
         Args:
             data: The input data (usually a dict) before validation.

From 512dfe8af1a70eff642a8e6b813230a9872bb1b9 Mon Sep 17 00:00:00 2001
From: simonrosenberg <157206163+simonrosenberg@users.noreply.github.com>
Date: Wed, 5 Nov 2025 13:45:57 +0100
Subject: [PATCH 10/10] Apply suggestion from @simonrosenberg

---
 openhands-sdk/openhands/sdk/llm/utils/model_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openhands-sdk/openhands/sdk/llm/utils/model_features.py b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
index 2ee16e37b2..e53d892085 100644
--- a/openhands-sdk/openhands/sdk/llm/utils/model_features.py
+++ b/openhands-sdk/openhands/sdk/llm/utils/model_features.py
@@ -93,7 +93,7 @@ class ModelFeatures:
 #       Keep these entries as bare substrings without wildcards.
 FORCE_STRING_SERIALIZER_PATTERNS: list[str] = [
     "deepseek",  # e.g., DeepSeek-V3.2-Exp
-    "glm-4",  # e.g., GLM-4.5 / GLM-4.6
+    "glm",  # e.g., GLM-4.5 / GLM-4.6
     # Kimi K2-Instruct requires string serialization only on Groq
     "groq/kimi-k2-instruct",  # explicit provider-prefixed IDs
 ]