From bc9e673fd76caa718b8482ff4f44ba8706fd3570 Mon Sep 17 00:00:00 2001 From: openhands Date: Tue, 4 Nov 2025 15:21:19 +0530 Subject: [PATCH 01/13] Add image file viewing support to FileEditor FileEditor now supports viewing image files (.png, .jpg, .jpeg, .gif, .webp, .bmp) by returning base64-encoded image data and displaying it as ImageContent. Updated FileEditorObservation and tool description to handle image data, and added tests to verify image handling and backward compatibility with text files. --- .../openhands/tools/file_editor/definition.py | 28 ++++++++++++++++- .../openhands/tools/file_editor/editor.py | 31 +++++++++++++++++-- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py index f3d96f9152..d010983469 100644 --- a/openhands-tools/openhands/tools/file_editor/definition.py +++ b/openhands-tools/openhands/tools/file_editor/definition.py @@ -85,6 +85,9 @@ class FileEditorObservation(Observation): default=None, description="The content of the file after the edit." ) error: str | None = Field(default=None, description="Error message if any.") + image_data: str | None = Field( + default=None, description="Base64-encoded image data if viewing an image file." + ) _diff_cache: Text | None = PrivateAttr(default=None) @@ -92,7 +95,29 @@ class FileEditorObservation(Observation): def to_llm_content(self) -> Sequence[TextContent | ImageContent]: if self.error: return [TextContent(text=self.error)] - return [TextContent(text=self.output)] + + content: list[TextContent | ImageContent] = [TextContent(text=self.output)] + + # If image_data is present, add it as ImageContent + if self.image_data: + # Detect MIME type from the data prefix + mime_type = "image/png" # default + if self.image_data.startswith("/9j/"): + mime_type = "image/jpeg" + elif self.image_data.startswith("iVBORw0KGgo"): + mime_type = "image/png" + elif self.image_data.startswith("R0lGODlh"): + mime_type = "image/gif" + elif self.image_data.startswith("UklGR"): + mime_type = "image/webp" + elif self.image_data.startswith("Qk"): + mime_type = "image/bmp" + + # Convert base64 to data URL format for ImageContent + data_url = f"data:{mime_type};base64,{self.image_data}" + content.append(ImageContent(image_urls=[data_url])) + + return content @property def visualize(self) -> Text: @@ -157,6 +182,7 @@ def _has_meaningful_diff(self) -> bool: TOOL_DESCRIPTION = """Custom editing tool for viewing, creating and editing files in plain-text format * State is persistent across command calls and discussions with the user * If `path` is a text file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep +* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, .bmp), `view` displays the image content * The `create` command cannot be used if the specified `path` already exists as a file * If a `command` generates a long output, it will be truncated and marked with `` * The `undo_edit` command will revert the last edit made to the file at `path` diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py index 509bba6323..212b568a5b 100644 --- a/openhands-tools/openhands/tools/file_editor/editor.py +++ b/openhands-tools/openhands/tools/file_editor/editor.py @@ -1,3 +1,4 @@ +import base64 import os import re import shutil @@ -36,6 +37,9 @@ logger = get_logger(__name__) +# Supported image extensions for viewing as base64-encoded content +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"} + class FileEditor: """ @@ -321,6 +325,28 @@ def view( prev_exist=True, ) + # Check if the file is an image + file_extension = path.suffix.lower() + if file_extension in IMAGE_EXTENSIONS: + # Read image file as base64 + try: + with open(path, "rb") as f: + image_bytes = f.read() + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + + output_msg = f"Image file {path} read successfully. Displaying image content." + return FileEditorObservation( + command="view", + output=output_msg, + path=str(path), + prev_exist=True, + image_data=image_base64, + ) + except Exception as e: + raise ToolError( + f"Failed to read image file {path}: {e}" + ) from None + # Validate file and count lines self.validate_file(path) num_lines = self._count_lines(path) @@ -603,8 +629,9 @@ def validate_file(self, path: Path) -> None: ), ) - # Check file type - if is_binary(str(path)): + # Check file type - allow image files + file_extension = path.suffix.lower() + if is_binary(str(path)) and file_extension not in IMAGE_EXTENSIONS: raise FileValidationError( path=str(path), reason=( From 97355164524dafe0642a1cfda83e9887e20f6ff4 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 09:55:27 +0530 Subject: [PATCH 02/13] ruff --- .../openhands/tools/file_editor/definition.py | 1 + openhands-tools/openhands/tools/file_editor/editor.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py index 6063b34d7c..64b8580e22 100644 --- a/openhands-tools/openhands/tools/file_editor/definition.py +++ b/openhands-tools/openhands/tools/file_editor/definition.py @@ -85,6 +85,7 @@ class FileEditorObservation(Observation): new_content: str | None = Field( default=None, description="The content of the file after the edit." ) + _diff_cache: Text | None = PrivateAttr(default=None) @property diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py index 2d1f2b2a50..2daff7de31 100644 --- a/openhands-tools/openhands/tools/file_editor/editor.py +++ b/openhands-tools/openhands/tools/file_editor/editor.py @@ -352,10 +352,17 @@ def view( mime_type = "image/webp" elif image_base64.startswith("Qk"): mime_type = "image/bmp" - output_msg = f"Image file {path} read successfully. Displaying image content." + output_msg = ( + f"Image file {path} read successfully. " + "Displaying image content." + ) + image_url = f"data:{mime_type};base64,{image_base64}" return FileEditorObservation( command="view", - content=[TextContent(text=output_msg), ImageContent(image_urls=[f"data:{mime_type};base64,{image_base64}"])], + content=[ + TextContent(text=output_msg), + ImageContent(image_urls=[image_url]), + ], path=str(path), prev_exist=True, ) From 2f0e725f5d64415d712677367e97465fa4c2d235 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 10:13:36 +0530 Subject: [PATCH 03/13] ruff --- openhands-tools/openhands/tools/file_editor/editor.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py index 2daff7de31..197e6e1e78 100644 --- a/openhands-tools/openhands/tools/file_editor/editor.py +++ b/openhands-tools/openhands/tools/file_editor/editor.py @@ -340,7 +340,7 @@ def view( with open(path, "rb") as f: image_bytes = f.read() image_base64 = base64.b64encode(image_bytes).decode("utf-8") - + mime_type = "image/png" # default if image_base64.startswith("/9j/"): mime_type = "image/jpeg" @@ -353,8 +353,7 @@ def view( elif image_base64.startswith("Qk"): mime_type = "image/bmp" output_msg = ( - f"Image file {path} read successfully. " - "Displaying image content." + f"Image file {path} read successfully. Displaying image content." ) image_url = f"data:{mime_type};base64,{image_base64}" return FileEditorObservation( @@ -367,9 +366,7 @@ def view( prev_exist=True, ) except Exception as e: - raise ToolError( - f"Failed to read image file {path}: {e}" - ) from None + raise ToolError(f"Failed to read image file {path}: {e}") from None # Validate file and count lines self.validate_file(path) From a883f91c0b4af530be6e315519f57e1b21794baa Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 10:13:57 +0530 Subject: [PATCH 04/13] Update image file validation test assertion --- tests/tools/file_editor/test_file_validation.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/tools/file_editor/test_file_validation.py b/tests/tools/file_editor/test_file_validation.py index 7f41fe54e3..d507aaf8d6 100644 --- a/tests/tools/file_editor/test_file_validation.py +++ b/tests/tools/file_editor/test_file_validation.py @@ -96,8 +96,5 @@ def test_validate_image_file(tmp_path): assert is_binary(str(image_file)) - # Images are not supported and should be detected as binary - with pytest.raises(FileValidationError) as exc_info: - editor.validate_file(image_file) - - assert "file appears to be binary" in str(exc_info.value).lower() + # Images are not supported, so no exception should be raised + editor.validate_file(image_file) \ No newline at end of file From d1f4666d54cd44436bc7f0e8b8038e324338502c Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 6 Nov 2025 10:31:00 +0530 Subject: [PATCH 05/13] format --- tests/tools/file_editor/test_file_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tools/file_editor/test_file_validation.py b/tests/tools/file_editor/test_file_validation.py index d507aaf8d6..989bd28ada 100644 --- a/tests/tools/file_editor/test_file_validation.py +++ b/tests/tools/file_editor/test_file_validation.py @@ -97,4 +97,4 @@ def test_validate_image_file(tmp_path): assert is_binary(str(image_file)) # Images are not supported, so no exception should be raised - editor.validate_file(image_file) \ No newline at end of file + editor.validate_file(image_file) From 5fa71b81fb1236c432de1c69f576a02b88849f81 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 02:55:34 +0530 Subject: [PATCH 06/13] Add test for viewing image files in FileEditor Introduces a test to verify that viewing a PNG image file with FileEditor returns an ImageContent object containing a base64-encoded image URL. This ensures image files are handled correctly by the view command. --- .../tools/file_editor/test_file_validation.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/tools/file_editor/test_file_validation.py b/tests/tools/file_editor/test_file_validation.py index 989bd28ada..f7c01961a3 100644 --- a/tests/tools/file_editor/test_file_validation.py +++ b/tests/tools/file_editor/test_file_validation.py @@ -3,6 +3,7 @@ import pytest from binaryornot.check import is_binary +from openhands.sdk import ImageContent from openhands.tools.file_editor.editor import FileEditor from openhands.tools.file_editor.exceptions import ( FileValidationError, @@ -98,3 +99,37 @@ def test_validate_image_file(tmp_path): # Images are not supported, so no exception should be raised editor.validate_file(image_file) + + +def test_view_image_file_returns_image_content(tmp_path): + """Test that viewing an image file returns ImageContent without error.""" + editor = FileEditor() + image_file = tmp_path / "test.png" + + # Create a minimal valid 1x1 PNG image (red pixel) + # This is a complete, valid PNG file + png_data = ( + b'\x89PNG\r\n\x1a\n' # PNG signature + b'\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01' # IHDR chunk (1x1) + b'\x08\x02\x00\x00\x00\x90wS\xde' # IHDR data + CRC + b'\x00\x00\x00\x0cIDATx\x9cc\xf8\xcf\xc0\x00\x00\x00\x03\x00\x01' # IDAT chunk + b'\x00\x18\xdd\x8d\xb4' # IDAT CRC + b'\x00\x00\x00\x00IEND\xaeB`\x82' # IEND chunk + ) + + with open(image_file, "wb") as f: + f.write(png_data) + + # View the image file - should return ImageContent + result = editor(command="view", path=str(image_file)) + + # Verify result contains ImageContent + assert result is not None + assert hasattr(result, 'content') + assert len(result.content) == 2 # TextContent with message + ImageContent + assert any(isinstance(c, ImageContent) for c in result.content) + + # Get the ImageContent and verify it has image_urls + image_content = [c for c in result.content if isinstance(c, ImageContent)][0] + assert len(image_content.image_urls) == 1 + assert image_content.image_urls[0].startswith("data:image/png;base64,") From d7e27e8f6e1b014487dfbdd1765aa8ffe5b87f68 Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 7 Nov 2025 03:04:55 +0530 Subject: [PATCH 07/13] Use double quotes for byte strings in PNG test Replaces single quotes with double quotes for byte string literals in the PNG image data and updates an attribute check to use double quotes for consistency in test_view_image_file_returns_image_content. --- tests/tools/file_editor/test_file_validation.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/tools/file_editor/test_file_validation.py b/tests/tools/file_editor/test_file_validation.py index f7c01961a3..c86f696e9a 100644 --- a/tests/tools/file_editor/test_file_validation.py +++ b/tests/tools/file_editor/test_file_validation.py @@ -109,12 +109,12 @@ def test_view_image_file_returns_image_content(tmp_path): # Create a minimal valid 1x1 PNG image (red pixel) # This is a complete, valid PNG file png_data = ( - b'\x89PNG\r\n\x1a\n' # PNG signature - b'\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01' # IHDR chunk (1x1) - b'\x08\x02\x00\x00\x00\x90wS\xde' # IHDR data + CRC - b'\x00\x00\x00\x0cIDATx\x9cc\xf8\xcf\xc0\x00\x00\x00\x03\x00\x01' # IDAT chunk - b'\x00\x18\xdd\x8d\xb4' # IDAT CRC - b'\x00\x00\x00\x00IEND\xaeB`\x82' # IEND chunk + b"\x89PNG\r\n\x1a\n" # PNG signature + b"\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01" # IHDR chunk (1x1) + b"\x08\x02\x00\x00\x00\x90wS\xde" # IHDR data + CRC + b"\x00\x00\x00\x0cIDATx\x9cc\xf8\xcf\xc0\x00\x00\x00\x03\x00\x01" # IDAT chunk + b"\x00\x18\xdd\x8d\xb4" # IDAT CRC + b"\x00\x00\x00\x00IEND\xaeB`\x82" # IEND chunk ) with open(image_file, "wb") as f: @@ -125,7 +125,7 @@ def test_view_image_file_returns_image_content(tmp_path): # Verify result contains ImageContent assert result is not None - assert hasattr(result, 'content') + assert hasattr(result, "content") assert len(result.content) == 2 # TextContent with message + ImageContent assert any(isinstance(c, ImageContent) for c in result.content) From 81377cb506f22d12c64d7c54fd0208e8e90005db Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 8 Nov 2025 16:55:01 +0530 Subject: [PATCH 08/13] Improve image MIME type detection in file editor Replaced base64 prefix checks with mimetypes.guess_type for determining image MIME type when reading image files. Falls back to 'image/png' if MIME type is not detected or not an image. This simplifies and improves accuracy of MIME type detection. --- .../openhands/tools/file_editor/editor.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py index 197e6e1e78..db70a064aa 100644 --- a/openhands-tools/openhands/tools/file_editor/editor.py +++ b/openhands-tools/openhands/tools/file_editor/editor.py @@ -1,4 +1,5 @@ import base64 +import mimetypes import os import re import shutil @@ -341,17 +342,9 @@ def view( image_bytes = f.read() image_base64 = base64.b64encode(image_bytes).decode("utf-8") - mime_type = "image/png" # default - if image_base64.startswith("/9j/"): - mime_type = "image/jpeg" - elif image_base64.startswith("iVBORw0KGgo"): - mime_type = "image/png" - elif image_base64.startswith("R0lGODlh"): - mime_type = "image/gif" - elif image_base64.startswith("UklGR"): - mime_type = "image/webp" - elif image_base64.startswith("Qk"): - mime_type = "image/bmp" + mime_type, _ = mimetypes.guess_type(str(path)) + if not mime_type or not mime_type.startswith('image/'): + mime_type = 'image/png' output_msg = ( f"Image file {path} read successfully. Displaying image content." ) From a1801b8ab79587ffb935ad2ff5003c7735cc52a5 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 8 Nov 2025 16:58:22 +0530 Subject: [PATCH 09/13] Use double quotes for image MIME type check Replaces single quotes with double quotes in MIME type string comparison and assignment for consistency in the file editor. --- openhands-tools/openhands/tools/file_editor/editor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py index db70a064aa..e4b455014d 100644 --- a/openhands-tools/openhands/tools/file_editor/editor.py +++ b/openhands-tools/openhands/tools/file_editor/editor.py @@ -343,8 +343,8 @@ def view( image_base64 = base64.b64encode(image_bytes).decode("utf-8") mime_type, _ = mimetypes.guess_type(str(path)) - if not mime_type or not mime_type.startswith('image/'): - mime_type = 'image/png' + if not mime_type or not mime_type.startswith("image/"): + mime_type = "image/png" output_msg = ( f"Image file {path} read successfully. Displaying image content." ) From f783fcfca327f89b82427e1439a4bd1e46635467 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 8 Nov 2025 19:05:44 +0000 Subject: [PATCH 10/13] Add integration test for image file viewing with FileEditor This test verifies that the FileEditor can view and display image files by having an agent analyze the OpenHands logo and identify its colors. The test: 1. Downloads the OpenHands logo to the workspace 2. Asks the agent to view the logo.png file and identify its colors 3. Verifies that the agent correctly identifies yellow as one of the colors This test requires a vision-capable LLM model. Co-authored-by: openhands --- .../tests/t08_image_file_viewing.py | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 tests/integration/tests/t08_image_file_viewing.py diff --git a/tests/integration/tests/t08_image_file_viewing.py b/tests/integration/tests/t08_image_file_viewing.py new file mode 100644 index 0000000000..df8f1db365 --- /dev/null +++ b/tests/integration/tests/t08_image_file_viewing.py @@ -0,0 +1,92 @@ +"""Test that an agent can view and analyze image files using FileEditor.""" + +import os +import urllib.request + +from openhands.sdk import TextContent, get_logger +from openhands.sdk.event.llm_convertible import MessageEvent +from openhands.sdk.tool import Tool, register_tool +from openhands.tools.file_editor import FileEditorTool +from openhands.tools.terminal import TerminalTool +from tests.integration.base import BaseIntegrationTest, TestResult + + +INSTRUCTION = ( + "Please view the logo.png file in the current directory and tell me what " + "colors you see in it. Is the logo blue, yellow, or green? Please analyze " + "the image and provide your answer." +) + +IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png" + +logger = get_logger(__name__) + + +class ImageFileViewingTest(BaseIntegrationTest): + """Test that an agent can view and analyze image files.""" + + INSTRUCTION: str = INSTRUCTION + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.logo_path: str = os.path.join(self.workspace, "logo.png") + + # Verify that the LLM supports vision + if not self.llm.vision_is_active(): + raise ValueError( + "This test requires a vision-capable LLM model. " + "Please use a model that supports image input." + ) + + @property + def tools(self) -> list[Tool]: + """List of tools available to the agent.""" + register_tool("TerminalTool", TerminalTool) + register_tool("FileEditorTool", FileEditorTool) + return [ + Tool(name="TerminalTool"), + Tool(name="FileEditorTool"), + ] + + def setup(self) -> None: + """Download the OpenHands logo for the agent to analyze.""" + try: + urllib.request.urlretrieve(IMAGE_URL, self.logo_path) + logger.info(f"Downloaded test logo to: {self.logo_path}") + except Exception as e: + logger.error(f"Failed to download logo: {e}") + raise + + def verify_result(self) -> TestResult: + """Verify that the agent identified yellow as one of the logo colors.""" + if not os.path.exists(self.logo_path): + return TestResult( + success=False, reason="Logo file not found after agent execution" + ) + + # Check the agent's responses in collected events + # Look for messages mentioning yellow color + agent_responses = [] + for event in self.collected_events: + if isinstance(event, MessageEvent): + message = event.llm_message + if message.role == "assistant": + for content_item in message.content: + if isinstance(content_item, TextContent): + agent_responses.append(content_item.text.lower()) + + combined_response = " ".join(agent_responses) + + if "yellow" in combined_response: + return TestResult( + success=True, + reason="Agent successfully identified yellow color in the logo", + ) + else: + return TestResult( + success=False, + reason=( + f"Agent did not identify yellow color in the logo. " + f"Response: {combined_response[:500]}" + ), + ) From e7666de65cddf60664515f90305bef2893434c27 Mon Sep 17 00:00:00 2001 From: openhands Date: Sun, 9 Nov 2025 19:36:11 +0000 Subject: [PATCH 11/13] Refactor browser_use MIME type detection Replace if-elif chain with dictionary-based lookup for cleaner, more maintainable code. Extract detection logic into reusable detect_image_mime_type() helper function. Add comprehensive test coverage for JPEG, PNG, GIF, WebP, and unknown formats. Co-authored-by: openhands --- .../openhands/tools/browser_use/definition.py | 33 +++++++++++----- .../browser_use/test_browser_observation.py | 38 +++++++++++++++++++ 2 files changed, 62 insertions(+), 9 deletions(-) diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py index 6a02026b2b..b3463af2bd 100644 --- a/openhands-tools/openhands/tools/browser_use/definition.py +++ b/openhands-tools/openhands/tools/browser_use/definition.py @@ -24,6 +24,29 @@ # Maximum output size for browser observations MAX_BROWSER_OUTPUT_SIZE = 50000 +# Mapping of base64 prefixes to MIME types for image detection +BASE64_IMAGE_PREFIXES = { + "/9j/": "image/jpeg", + "iVBORw0KGgo": "image/png", + "R0lGODlh": "image/gif", + "UklGR": "image/webp", +} + + +def detect_image_mime_type(base64_data: str) -> str: + """Detect MIME type from base64-encoded image data. + + Args: + base64_data: Base64-encoded image data + + Returns: + Detected MIME type, defaults to "image/png" if not detected + """ + for prefix, mime_type in BASE64_IMAGE_PREFIXES.items(): + if base64_data.startswith(prefix): + return mime_type + return "image/png" + class BrowserObservation(Observation): """Base observation for browser operations.""" @@ -48,15 +71,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: ) if self.screenshot_data: - mime_type = "image/png" - if self.screenshot_data.startswith("/9j/"): - mime_type = "image/jpeg" - elif self.screenshot_data.startswith("iVBORw0KGgo"): - mime_type = "image/png" - elif self.screenshot_data.startswith("R0lGODlh"): - mime_type = "image/gif" - elif self.screenshot_data.startswith("UklGR"): - mime_type = "image/webp" + mime_type = detect_image_mime_type(self.screenshot_data) # Convert base64 to data URL format for ImageContent data_url = f"data:{mime_type};base64,{self.screenshot_data}" llm_content.append(ImageContent(image_urls=[data_url])) diff --git a/tests/tools/browser_use/test_browser_observation.py b/tests/tools/browser_use/test_browser_observation.py index 09992781ee..612ad5bb50 100644 --- a/tests/tools/browser_use/test_browser_observation.py +++ b/tests/tools/browser_use/test_browser_observation.py @@ -112,3 +112,41 @@ def test_browser_observation_empty_screenshot_handling(): observation = BrowserObservation.from_text(text="Test", screenshot_data=None) agent_obs = observation.to_llm_content assert len(agent_obs) == 1 # Only text content, no image + + +def test_browser_observation_mime_type_detection(): + """Test MIME type detection for different image formats.""" + test_cases = [ + ( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==", # noqa: E501 + "image/png", + ), + ( + "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwA/", # noqa: E501 + "image/jpeg", + ), + ( + "R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", + "image/gif", + ), + ( + "UklGRiQAAABXRUJQVlA4IBgAAAAwAQCdASoBAAEAAQAcJaQAA3AA/v3AgAA=", + "image/webp", + ), + ( + "AAAABBBBCCCC", # Unknown format + "image/png", # Falls back to PNG + ), + ] + + for screenshot_data, expected_mime_type in test_cases: + observation = BrowserObservation.from_text( + text="Test", screenshot_data=screenshot_data + ) + agent_obs = observation.to_llm_content + + assert len(agent_obs) == 2 + assert isinstance(agent_obs[1], ImageContent) + assert ( + agent_obs[1].image_urls[0].startswith(f"data:{expected_mime_type};base64,") + ) From 262bd60900a47481600b10681f705a54d1f2c1cd Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 10 Nov 2025 15:27:05 +0000 Subject: [PATCH 12/13] Implement SkipTest exception to gracefully skip integration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Added SkipTest exception in base.py to signal when tests should be skipped - Extended TestResult and TestResultData schemas with 'skipped' field - Updated ModelTestResults to track skipped_tests count - Modified run_infer.py to catch SkipTest and handle skipped tests appropriately - Updated t08_image_file_viewing.py to raise SkipTest instead of ValueError when vision not supported - Enhanced console output to display skipped tests with ⊘ symbol - Updated markdown report generator to show skipped tests in summary and detailed results This allows integration tests to gracefully skip when LLMs lack required capabilities (e.g., vision support) rather than failing. Co-authored-by: openhands --- tests/integration/base.py | 12 +++++++ tests/integration/run_infer.py | 24 +++++++++++-- tests/integration/schemas.py | 5 +++ .../tests/t08_image_file_viewing.py | 4 +-- .../utils/generate_markdown_report.py | 35 ++++++++++++++++--- 5 files changed, 71 insertions(+), 9 deletions(-) diff --git a/tests/integration/base.py b/tests/integration/base.py index 7bbc0841ae..4c22a65a2f 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -26,11 +26,23 @@ from openhands.sdk.tool import Tool +class SkipTest(Exception): + """ + Exception raised to indicate that a test should be skipped. + + This is useful for tests that require specific capabilities (e.g., vision) + that may not be available in all LLMs. + """ + + pass + + class TestResult(BaseModel): """Result of an integration test.""" success: bool reason: str | None = None + skipped: bool = False class BaseIntegrationTest(ABC): diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py index 9107ac35cd..e202409845 100755 --- a/tests/integration/run_infer.py +++ b/tests/integration/run_infer.py @@ -17,7 +17,7 @@ from pydantic import BaseModel, ConfigDict from openhands.sdk.logger import get_logger -from tests.integration.base import BaseIntegrationTest, TestResult +from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult from tests.integration.schemas import ModelTestResults from tests.integration.utils.format_costs import format_cost @@ -171,6 +171,20 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval log_file_path=log_file_path, ) + except SkipTest as e: + # Test should be skipped (e.g., LLM doesn't support required capabilities) + logger.info("Test %s skipped: %s", instance.instance_id, str(e)) + return EvalOutput( + instance_id=instance.instance_id, + test_result=TestResult( + success=False, + reason=str(e), + skipped=True, + ), + llm_model=llm_config.get("model", "unknown"), + cost=0.0, + ) + except Exception as e: logger.error("Error running test %s: %s", instance.instance_id, e) return EvalOutput( @@ -274,11 +288,17 @@ def generate_structured_results( # Print summary for console output success_rate = structured_results.success_rate successful = structured_results.successful_tests + skipped = structured_results.skipped_tests total = structured_results.total_tests logger.info("Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, total) + if skipped > 0: + logger.info("Skipped tests: %d", skipped) logger.info("Evaluation Results:") for instance in structured_results.test_instances: - status = "✓" if instance.test_result.success else "✗" + if instance.test_result.skipped: + status = "⊘" # Skipped symbol + else: + status = "✓" if instance.test_result.success else "✗" reason = instance.test_result.reason or "N/A" logger.info("%s: %s - %s", instance.instance_id, status, reason) logger.info("Total cost: %s", format_cost(structured_results.total_cost)) diff --git a/tests/integration/schemas.py b/tests/integration/schemas.py index c557d939b5..dd20c354ab 100644 --- a/tests/integration/schemas.py +++ b/tests/integration/schemas.py @@ -20,6 +20,7 @@ class TestResultData(BaseModel): success: bool reason: str | None = None + skipped: bool = False class TestInstanceResult(BaseModel): @@ -46,6 +47,7 @@ class ModelTestResults(BaseModel): # Summary statistics total_tests: int successful_tests: int + skipped_tests: int success_rate: float total_cost: float @@ -75,6 +77,7 @@ def from_eval_outputs( test_result=TestResultData( success=output.test_result.success, reason=output.test_result.reason, + skipped=output.test_result.skipped, ), cost=output.cost, error_message=output.error_message, @@ -84,6 +87,7 @@ def from_eval_outputs( # Calculate summary statistics total_tests = len(test_instances) successful_tests = sum(1 for t in test_instances if t.test_result.success) + skipped_tests = sum(1 for t in test_instances if t.test_result.skipped) success_rate = successful_tests / total_tests if total_tests > 0 else 0.0 total_cost = sum(t.cost for t in test_instances) @@ -94,6 +98,7 @@ def from_eval_outputs( test_instances=test_instances, total_tests=total_tests, successful_tests=successful_tests, + skipped_tests=skipped_tests, success_rate=success_rate, total_cost=total_cost, eval_note=eval_note, diff --git a/tests/integration/tests/t08_image_file_viewing.py b/tests/integration/tests/t08_image_file_viewing.py index df8f1db365..d48a04ed37 100644 --- a/tests/integration/tests/t08_image_file_viewing.py +++ b/tests/integration/tests/t08_image_file_viewing.py @@ -8,7 +8,7 @@ from openhands.sdk.tool import Tool, register_tool from openhands.tools.file_editor import FileEditorTool from openhands.tools.terminal import TerminalTool -from tests.integration.base import BaseIntegrationTest, TestResult +from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult INSTRUCTION = ( @@ -33,7 +33,7 @@ def __init__(self, *args, **kwargs): # Verify that the LLM supports vision if not self.llm.vision_is_active(): - raise ValueError( + raise SkipTest( "This test requires a vision-capable LLM model. " "Please use a model that supports image input." ) diff --git a/tests/integration/utils/generate_markdown_report.py b/tests/integration/utils/generate_markdown_report.py index 0cfaf59e28..4747199f2a 100644 --- a/tests/integration/utils/generate_markdown_report.py +++ b/tests/integration/utils/generate_markdown_report.py @@ -18,20 +18,21 @@ def generate_model_summary_table(model_results: list[ModelTestResults]) -> str: """Generate a summary table for all models.""" table_lines = [ - "| Model | Success Rate | Tests Passed | Total Tests | Cost |", - "|-------|--------------|--------------|-------------|------|", + "| Model | Success Rate | Tests Passed | Skipped | Total Tests | Cost |", + "|-------|--------------|--------------|---------|-------------|------|", ] for result in model_results: success_rate = f"{result.success_rate:.1%}" tests_passed = f"{result.successful_tests}/{result.total_tests}" + skipped = f"{result.skipped_tests}" cost = format_cost(result.total_cost) model_name = result.model_name total_tests = result.total_tests row = ( f"| {model_name} | {success_rate} | {tests_passed} | " - f"{total_tests} | {cost} |" + f"{skipped} | {total_tests} | {cost} |" ) table_lines.append(row) @@ -51,11 +52,35 @@ def generate_detailed_results(model_results: list[ModelTestResults]) -> str: f"({result.successful_tests}/{result.total_tests})", f"- **Total Cost**: {format_cost(result.total_cost)}", f"- **Run Suffix**: `{result.run_suffix}`", - "", ] + if result.skipped_tests > 0: + section_lines.append(f"- **Skipped Tests**: {result.skipped_tests}") + + section_lines.append("") + + # Add skipped tests if any + skipped_tests = [t for t in result.test_instances if t.test_result.skipped] + if skipped_tests: + section_lines.extend( + [ + "**Skipped Tests:**", + "", + ] + ) + + for test in skipped_tests: + reason = test.test_result.reason or "No reason provided" + section_lines.append(f"- `{test.instance_id}`: {reason}") + + section_lines.append("") + # Add failed tests if any - failed_tests = [t for t in result.test_instances if not t.test_result.success] + failed_tests = [ + t + for t in result.test_instances + if not t.test_result.success and not t.test_result.skipped + ] if failed_tests: section_lines.extend( [ From dac93578f37e2b2daf067a68db687dedb2c02551 Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 10 Nov 2025 20:51:08 +0000 Subject: [PATCH 13/13] Make image viewing instruction conditional on LLM vision support The FileEditor tool now conditionally includes the image viewing instruction in its description based on whether the LLM supports vision capabilities. This prevents confusion when using OSS models that don't support image inputs. - Remove image viewing line from base TOOL_DESCRIPTION - Add conditional logic in FileEditorTool.create() to include the image viewing line only when conv_state.agent.llm.vision_is_active() returns True - Add tests to verify the behavior with vision-enabled and vision-disabled LLMs Co-authored-by: openhands --- .../openhands/tools/file_editor/definition.py | 20 +++++++- .../file_editor/test_file_editor_tool.py | 47 +++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py index 64b8580e22..02743da6ab 100644 --- a/openhands-tools/openhands/tools/file_editor/definition.py +++ b/openhands-tools/openhands/tools/file_editor/definition.py @@ -158,7 +158,6 @@ def _has_meaningful_diff(self) -> bool: TOOL_DESCRIPTION = """Custom editing tool for viewing, creating and editing files in plain-text format * State is persistent across command calls and discussions with the user * If `path` is a text file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep -* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, .bmp), `view` displays the image content * The `create` command cannot be used if the specified `path` already exists as a file * If a `command` generates a long output, it will be truncated and marked with `` * The `undo_edit` command will revert the last edit made to the file at `path` @@ -210,11 +209,28 @@ def create( # Initialize the executor executor = FileEditorExecutor(workspace_root=conv_state.workspace.working_dir) + # Build the tool description with conditional image viewing support + # Split TOOL_DESCRIPTION to insert image viewing line after the second bullet + description_lines = TOOL_DESCRIPTION.split("\n") + base_description = "\n".join(description_lines[:2]) # First two lines + remaining_description = "\n".join(description_lines[2:]) # Rest of description + + # Add image viewing line if LLM supports vision + if conv_state.agent.llm.vision_is_active(): + tool_description = ( + f"{base_description}\n" + "* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, " + ".bmp), `view` displays the image content\n" + f"{remaining_description}" + ) + else: + tool_description = TOOL_DESCRIPTION + # Add working directory information to the tool description # to guide the agent to use the correct directory instead of root working_dir = conv_state.workspace.working_dir enhanced_description = ( - f"{TOOL_DESCRIPTION}\n\n" + f"{tool_description}\n\n" f"Your current working directory is: {working_dir}\n" f"When exploring project structure, start with this directory " f"instead of the root filesystem." diff --git a/tests/tools/file_editor/test_file_editor_tool.py b/tests/tools/file_editor/test_file_editor_tool.py index f5963d34b7..e0e6d1186c 100644 --- a/tests/tools/file_editor/test_file_editor_tool.py +++ b/tests/tools/file_editor/test_file_editor_tool.py @@ -222,3 +222,50 @@ def test_file_editor_tool_openai_format_includes_working_directory(): "When exploring project structure, start with this directory " "instead of the root filesystem." ) in description + + +def test_file_editor_tool_image_viewing_line_with_vision_enabled(): + """Test that image viewing line is included when LLM supports vision.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create LLM with vision support (gpt-4o-mini supports vision) + llm = LLM( + model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" + ) + agent = Agent(llm=llm, tools=[]) + conv_state = ConversationState.create( + id=uuid4(), + agent=agent, + workspace=LocalWorkspace(working_dir=temp_dir), + ) + + tools = FileEditorTool.create(conv_state) + tool = tools[0] + + # Check that the image viewing line is included in description + assert ( + "If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, .bmp)" + in tool.description + ) + assert "view` displays the image content" in tool.description + + +def test_file_editor_tool_image_viewing_line_with_vision_disabled(): + """Test that image viewing line is excluded when LLM doesn't support vision.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create LLM without vision support (gpt-3.5-turbo doesn't support vision) + llm = LLM( + model="gpt-3.5-turbo", api_key=SecretStr("test-key"), usage_id="test-llm" + ) + agent = Agent(llm=llm, tools=[]) + conv_state = ConversationState.create( + id=uuid4(), + agent=agent, + workspace=LocalWorkspace(working_dir=temp_dir), + ) + + tools = FileEditorTool.create(conv_state) + tool = tools[0] + + # Check that the image viewing line is NOT included in description + assert "is an image file" not in tool.description + assert "displays the image content" not in tool.description