Skip to content

Commit dac9357

Browse files
Make image viewing instruction conditional on LLM vision support
The FileEditor tool now conditionally includes the image viewing instruction in its description based on whether the LLM supports vision capabilities. This prevents confusion when using OSS models that don't support image inputs.

- Remove the image viewing line from the base TOOL_DESCRIPTION
- Add conditional logic in FileEditorTool.create() to include the image viewing line only when conv_state.agent.llm.vision_is_active() returns True
- Add tests to verify the behavior with vision-enabled and vision-disabled LLMs

Co-authored-by: openhands <openhands@all-hands.dev>
1 parent 262bd60 commit dac9357

File tree

2 files changed

+65
-2
lines changed

2 files changed

+65
-2
lines changed

openhands-tools/openhands/tools/file_editor/definition.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,6 @@ def _has_meaningful_diff(self) -> bool:
158158
TOOL_DESCRIPTION = """Custom editing tool for viewing, creating and editing files in plain-text format
159159
* State is persistent across command calls and discussions with the user
160160
* If `path` is a text file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
161-
* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, .bmp), `view` displays the image content
162161
* The `create` command cannot be used if the specified `path` already exists as a file
163162
* If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
164163
* The `undo_edit` command will revert the last edit made to the file at `path`
@@ -210,11 +209,28 @@ def create(
210209
# Initialize the executor
211210
executor = FileEditorExecutor(workspace_root=conv_state.workspace.working_dir)
212211

212+
# Build the tool description with conditional image viewing support
213+
# Split TOOL_DESCRIPTION so the image viewing line can be inserted after its first two lines (the title line and the first bullet)
214+
description_lines = TOOL_DESCRIPTION.split("\n")
215+
base_description = "\n".join(description_lines[:2]) # First two lines
216+
remaining_description = "\n".join(description_lines[2:]) # Rest of description
217+
218+
# Add image viewing line if LLM supports vision
219+
if conv_state.agent.llm.vision_is_active():
220+
tool_description = (
221+
f"{base_description}\n"
222+
"* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, "
223+
".bmp), `view` displays the image content\n"
224+
f"{remaining_description}"
225+
)
226+
else:
227+
tool_description = TOOL_DESCRIPTION
228+
213229
# Add working directory information to the tool description
214230
# to guide the agent to use the correct directory instead of root
215231
working_dir = conv_state.workspace.working_dir
216232
enhanced_description = (
217-
f"{TOOL_DESCRIPTION}\n\n"
233+
f"{tool_description}\n\n"
218234
f"Your current working directory is: {working_dir}\n"
219235
f"When exploring project structure, start with this directory "
220236
f"instead of the root filesystem."

tests/tools/file_editor/test_file_editor_tool.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,3 +222,50 @@ def test_file_editor_tool_openai_format_includes_working_directory():
222222
"When exploring project structure, start with this directory "
223223
"instead of the root filesystem."
224224
) in description
225+
226+
227+
def test_file_editor_tool_image_viewing_line_with_vision_enabled():
228+
"""Test that image viewing line is included when LLM supports vision."""
229+
with tempfile.TemporaryDirectory() as temp_dir:
230+
# Create LLM with vision support (gpt-4o-mini supports vision)
231+
llm = LLM(
232+
model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm"
233+
)
234+
agent = Agent(llm=llm, tools=[])
235+
conv_state = ConversationState.create(
236+
id=uuid4(),
237+
agent=agent,
238+
workspace=LocalWorkspace(working_dir=temp_dir),
239+
)
240+
241+
tools = FileEditorTool.create(conv_state)
242+
tool = tools[0]
243+
244+
# Check that the image viewing line is included in description
245+
assert (
246+
"If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, .bmp)"
247+
in tool.description
248+
)
249+
assert "view` displays the image content" in tool.description
250+
251+
252+
def test_file_editor_tool_image_viewing_line_with_vision_disabled():
253+
"""Test that image viewing line is excluded when LLM doesn't support vision."""
254+
with tempfile.TemporaryDirectory() as temp_dir:
255+
# Create LLM without vision support (gpt-3.5-turbo doesn't support vision)
256+
llm = LLM(
257+
model="gpt-3.5-turbo", api_key=SecretStr("test-key"), usage_id="test-llm"
258+
)
259+
agent = Agent(llm=llm, tools=[])
260+
conv_state = ConversationState.create(
261+
id=uuid4(),
262+
agent=agent,
263+
workspace=LocalWorkspace(working_dir=temp_dir),
264+
)
265+
266+
tools = FileEditorTool.create(conv_state)
267+
tool = tools[0]
268+
269+
# Check that the image viewing line is NOT included in description
270+
assert "is an image file" not in tool.description
271+
assert "displays the image content" not in tool.description

0 commit comments

Comments
 (0)