Commit 2bb7c89

Fix: Make reasoning_effort optional (#1004)
Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 41d8d80 commit 2bb7c89
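
In short: reasoning_effort keeps its "high" default but drops the per-model exclusion list, an explicitly passed None now survives validation, and a new opt-in reasoning_summary field is sent only when set. A minimal sketch of the resulting behavior (assuming LLM is importable from openhands.sdk.llm, per the file layout below; the usage_id value is illustrative):

from openhands.sdk.llm import LLM

# The field default now applies to every model; no per-model exclusion list.
llm = LLM(model="gpt-5-mini", usage_id="demo")
assert llm.reasoning_effort == "high"

# An explicit None is preserved instead of being coerced back to "high".
quiet = LLM(model="gpt-5-mini", usage_id="demo", reasoning_effort=None)
assert quiet.reasoning_effort is None

# New opt-in field; included in requests only when set (verified OpenAI orgs).
verbose = LLM(model="gpt-5-mini", usage_id="demo", reasoning_summary="detailed")
assert verbose.reasoning_summary == "detailed"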

4 files changed, 38 insertions(+), 22 deletions(-)

openhands-sdk/openhands/sdk/llm/llm.py

Lines changed: 7 additions & 9 deletions

@@ -213,11 +213,17 @@ class LLM(BaseModel, RetryMixin, NonNativeToolCallingMixin):
         description="Whether to use native tool calling.",
     )
     reasoning_effort: Literal["low", "medium", "high", "none"] | None = Field(
-        default=None,
+        default="high",
         description="The effort to put into reasoning. "
         "This is a string that can be one of 'low', 'medium', 'high', or 'none'. "
         "Can apply to all reasoning models.",
     )
+    reasoning_summary: Literal["auto", "concise", "detailed"] | None = Field(
+        default=None,
+        description="The level of detail for reasoning summaries. "
+        "This is a string that can be one of 'auto', 'concise', or 'detailed'. "
+        "Requires verified OpenAI organization. Only sent when explicitly set.",
+    )
     enable_encrypted_reasoning: bool = Field(
         default=False,
         description="If True, ask for ['reasoning.encrypted_content'] "

@@ -312,14 +318,6 @@ def _coerce_inputs(cls, data):
         if not model_val:
             raise ValueError("model must be specified in LLM")

-        # default reasoning_effort unless Gemini 2.5
-        # (we keep consistent with old behavior)
-        excluded_models = ["gemini-2.5-pro", "claude-sonnet-4-5", "claude-haiku-4-5"]
-        if d.get("reasoning_effort") is None and not any(
-            model in model_val for model in excluded_models
-        ):
-            d["reasoning_effort"] = "high"
-
         # Azure default version
         if model_val.startswith("azure") and not d.get("api_version"):
             d["api_version"] = "2024-12-01-preview"

openhands-sdk/openhands/sdk/llm/options/responses_options.py

Lines changed: 6 additions & 3 deletions

@@ -43,9 +43,12 @@ def select_responses_options(
     if include_list:
         out["include"] = include_list

-    # Request plaintext reasoning summary
-    effort = llm.reasoning_effort or "high"
-    out["reasoning"] = {"effort": effort, "summary": "detailed"}
+    # Include reasoning effort only if explicitly set
+    if llm.reasoning_effort:
+        out["reasoning"] = {"effort": llm.reasoning_effort}
+        # Optionally include summary if explicitly set (requires verified org)
+        if llm.reasoning_summary:
+            out["reasoning"]["summary"] = llm.reasoning_summary

     # Pass through litellm_extra_body if provided
     if llm.litellm_extra_body:
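
The resulting request shape, sketched under the assumption that no other code path writes out["reasoning"] and that the import paths follow the file layout above:

from openhands.sdk.llm import LLM
from openhands.sdk.llm.options.responses_options import select_responses_options

llm = LLM(model="gpt-5-mini", reasoning_effort="high")
out = select_responses_options(
    llm, {"temperature": 0.3}, include=["text.output_text"], store=None
)
assert out["reasoning"] == {"effort": "high"}  # no "summary" unless opted in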

tests/sdk/config/test_llm_config.py

Lines changed: 9 additions & 7 deletions

@@ -37,7 +37,7 @@ def test_llm_config_defaults():
     assert config.log_completions is False
     assert config.custom_tokenizer is None
     assert config.native_tool_calling is True
-    assert config.reasoning_effort == "high"  # Default for non-Gemini models
+    assert config.reasoning_effort == "high"
     assert config.seed is None
     assert config.safety_settings is None

@@ -170,13 +170,17 @@ def test_llm_config_post_init_openrouter_env_vars():


 def test_llm_config_post_init_reasoning_effort_default():
-    """Test that reasoning_effort is set to 'high' by default for non-Gemini models."""
+    """Test reasoning_effort defaults to high."""
     config = LLM(model="gpt-4", usage_id="test-llm")
     assert config.reasoning_effort == "high"

-    # Test that Gemini models don't get default reasoning_effort
+    # Test that Gemini models also default to high
     config = LLM(model="gemini-2.5-pro-experimental", usage_id="test-llm")
-    assert config.reasoning_effort is None
+    assert config.reasoning_effort == "high"
+
+    # Test that explicit reasoning_effort is preserved
+    config = LLM(model="gpt-4", reasoning_effort="low", usage_id="test-llm")
+    assert config.reasoning_effort == "low"


 def test_llm_config_post_init_azure_api_version():

@@ -363,8 +367,6 @@ def test_llm_config_optional_fields():
     assert config.disable_vision is None
     assert config.disable_stop_word is None
     assert config.custom_tokenizer is None
-    assert (
-        config.reasoning_effort == "high"
-    )  # Even when set to None, post_init sets it to "high" for non-Gemini models
+    assert config.reasoning_effort is None  # Explicitly set to None overrides default
     assert config.seed is None
     assert config.safety_settings is None

tests/sdk/llm/test_responses_parsing_and_kwargs.py

Lines changed: 16 additions & 3 deletions

@@ -58,7 +58,7 @@ def test_from_llm_responses_output_parsing():


 def test_normalize_responses_kwargs_policy():
-    llm = LLM(model="gpt-5-mini")
+    llm = LLM(model="gpt-5-mini", reasoning_effort="high")
     # Use a model that is explicitly Responses-capable per model_features

     # enable encrypted reasoning and set max_output_tokens to test passthrough

@@ -75,14 +75,27 @@ def test_normalize_responses_kwargs_policy():
     assert set(out["include"]) >= {"text.output_text", "reasoning.encrypted_content"}
     # store default to False when None passed
     assert out["store"] is False
-    # reasoning config defaulted
+    # reasoning config with effort only (no summary for unverified orgs)
     r = out["reasoning"]
     assert r["effort"] in {"low", "medium", "high", "none"}
-    assert r["summary"] == "detailed"
+    assert "summary" not in r  # Summary not included to support unverified orgs
     # max_output_tokens preserved
     assert out["max_output_tokens"] == 128


+def test_normalize_responses_kwargs_with_summary():
+    """Test reasoning_summary is included when set (verified orgs)."""
+    llm = LLM(model="gpt-5-mini", reasoning_effort="high", reasoning_summary="detailed")
+
+    out = select_responses_options(
+        llm, {"temperature": 0.3}, include=["text.output_text"], store=None
+    )
+    # Verify reasoning includes both effort and summary when summary is set
+    r = out["reasoning"]
+    assert r["effort"] == "high"
+    assert r["summary"] == "detailed"
+
+
 @patch("openhands.sdk.llm.llm.litellm_responses")
 def test_llm_responses_end_to_end(mock_responses_call):
     # Configure LLM
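
One path the tests above leave implicit: with reasoning_effort=None, no reasoning block is sent at all, so the provider's own default applies. A hypothetical extra test in the same module (not part of this commit), assuming the module's existing LLM and select_responses_options imports:

def test_normalize_responses_kwargs_without_effort():
    # Hypothetical check: effort unset means no reasoning config is emitted.
    llm = LLM(model="gpt-5-mini", reasoning_effort=None)
    out = select_responses_options(
        llm, {"temperature": 0.3}, include=["text.output_text"], store=None
    )
    assert "reasoning" not in out  # nothing reasoning-related is sent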
