Make image viewing instruction conditional on LLM vision support

openhands-agent · openhands-agent · commit dac93578f37e · 2025-11-10T20:51:08.000Z
The FileEditor tool now conditionally includes the image viewing
instruction in its description based on whether the LLM supports
vision capabilities. This prevents confusion when using OSS models
that don't support image inputs.

- Remove image viewing line from base TOOL_DESCRIPTION
- Add conditional logic in FileEditorTool.create() to include the
  image viewing line only when conv_state.agent.llm.vision_is_active()
  returns True
- Add tests to verify the behavior with vision-enabled and
  vision-disabled LLMs

Co-authored-by: openhands <openhands@all-hands.dev>
diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py
@@ -158,7 +158,6 @@ def _has_meaningful_diff(self) -> bool:
 TOOL_DESCRIPTION = """Custom editing tool for viewing, creating and editing files in plain-text format
 * State is persistent across command calls and discussions with the user
 * If `path` is a text file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
-* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, .bmp), `view` displays the image content
 * The `create` command cannot be used if the specified `path` already exists as a file
 * If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
 * The `undo_edit` command will revert the last edit made to the file at `path`
@@ -210,11 +209,28 @@ def create(
         # Initialize the executor
         executor = FileEditorExecutor(workspace_root=conv_state.workspace.working_dir)
 
+        # Build the tool description with conditional image viewing support
+        # Split TOOL_DESCRIPTION to insert image viewing line after the second bullet
+        description_lines = TOOL_DESCRIPTION.split("\n")
+        base_description = "\n".join(description_lines[:2])  # First two lines
+        remaining_description = "\n".join(description_lines[2:])  # Rest of description
+
+        # Add image viewing line if LLM supports vision
+        if conv_state.agent.llm.vision_is_active():
+            tool_description = (
+                f"{base_description}\n"
+                "* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, "
+                ".bmp), `view` displays the image content\n"
+                f"{remaining_description}"
+            )
+        else:
+            tool_description = TOOL_DESCRIPTION
+
         # Add working directory information to the tool description
         # to guide the agent to use the correct directory instead of root
         working_dir = conv_state.workspace.working_dir
         enhanced_description = (
-            f"{TOOL_DESCRIPTION}\n\n"
+            f"{tool_description}\n\n"
             f"Your current working directory is: {working_dir}\n"
             f"When exploring project structure, start with this directory "
             f"instead of the root filesystem."
diff --git a/tests/tools/file_editor/test_file_editor_tool.py b/tests/tools/file_editor/test_file_editor_tool.py
@@ -222,3 +222,50 @@ def test_file_editor_tool_openai_format_includes_working_directory():
             "When exploring project structure, start with this directory "
             "instead of the root filesystem."
         ) in description
+
+
+def test_file_editor_tool_image_viewing_line_with_vision_enabled():
+    """Test that image viewing line is included when LLM supports vision."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Create LLM with vision support (gpt-4o-mini supports vision)
+        llm = LLM(
+            model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm"
+        )
+        agent = Agent(llm=llm, tools=[])
+        conv_state = ConversationState.create(
+            id=uuid4(),
+            agent=agent,
+            workspace=LocalWorkspace(working_dir=temp_dir),
+        )
+
+        tools = FileEditorTool.create(conv_state)
+        tool = tools[0]
+
+        # Check that the image viewing line is included in description
+        assert (
+            "If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, .bmp)"
+            in tool.description
+        )
+        assert "view` displays the image content" in tool.description
+
+
+def test_file_editor_tool_image_viewing_line_with_vision_disabled():
+    """Test that image viewing line is excluded when LLM doesn't support vision."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Create LLM without vision support (gpt-3.5-turbo doesn't support vision)
+        llm = LLM(
+            model="gpt-3.5-turbo", api_key=SecretStr("test-key"), usage_id="test-llm"
+        )
+        agent = Agent(llm=llm, tools=[])
+        conv_state = ConversationState.create(
+            id=uuid4(),
+            agent=agent,
+            workspace=LocalWorkspace(working_dir=temp_dir),
+        )
+
+        tools = FileEditorTool.create(conv_state)
+        tool = tools[0]
+
+        # Check that the image viewing line is NOT included in description
+        assert "is an image file" not in tool.description
+        assert "displays the image content" not in tool.description