@@ -205,6 +205,15 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        if eval_input.get("query") is None:
+            raise EvaluationException(
+                message="Query is a required input to the Tool Call Accuracy evaluator.",
+                internal_message="Query is a required input to the Tool Call Accuracy evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+            )
+
         # Single LLM call for all tool calls
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", {})
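For reviewers who want the new behavior in isolation: below is a minimal, self-contained sketch of the fail-fast guard this PR adds to all three evaluators. The `EvaluationException`, `ErrorBlame`, and `ErrorCategory` classes here are simplified stand-ins for the SDK's real error types, not the actual implementations.

```python
from enum import Enum
from typing import Any, Dict


# Simplified stand-ins for the SDK's error types (assumption: the real ones
# carry more metadata; these mirror only the fields the new guard uses).
class ErrorBlame(Enum):
    USER_ERROR = "user_error"


class ErrorCategory(Enum):
    INVALID_VALUE = "invalid_value"


class EvaluationException(Exception):
    def __init__(self, message: str, **details: Any) -> None:
        super().__init__(message)
        self.details = details


def require_query(eval_input: Dict[str, Any], evaluator_name: str) -> None:
    # .get("query") is None covers both a missing key and an explicit None value.
    if eval_input.get("query") is None:
        raise EvaluationException(
            message=f"Query is a required input to the {evaluator_name} evaluator.",
            blame=ErrorBlame.USER_ERROR,
            category=ErrorCategory.INVALID_VALUE,
        )


require_query({"query": "What is the weather in Paris?"}, "Tool Call Accuracy")  # ok
# require_query({}, "Tool Call Accuracy")               # raises EvaluationException
# require_query({"query": None}, "Tool Call Accuracy")  # raises as well
```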
@@ -153,12 +153,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         :return: A dictionary containing the result of the evaluation.
         :rtype: Dict[str, Union[str, float]]
         """
-        # Format conversation history for cleaner evaluation
-        if "query" in eval_input:
-            eval_input["query"] = reformat_conversation_history(
-                eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
-            )
+        if eval_input.get("query") is None:
+            raise EvaluationException(
+                message="Query is a required input to the Tool Input Accuracy evaluator.",
+                internal_message="Query is a required input to the Tool Input Accuracy evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
+            )
+
+        # Format conversation history for cleaner evaluation
+        eval_input["query"] = reformat_conversation_history(
+            eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
+        )
 
         # Call the LLM to evaluate
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", {})
@@ -175,12 +175,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         :return: A dictionary containing the result of the evaluation.
         :rtype: Dict[str, Union[str, float]]
         """
-        # Format conversation history for cleaner evaluation
-        if "query" in eval_input:
-            eval_input["query"] = reformat_conversation_history(
-                eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
-            )
+        if eval_input.get("query") is None:
+            raise EvaluationException(
+                message="Query is a required input to the Tool Selection evaluator.",
+                internal_message="Query is a required input to the Tool Selection evaluator.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
+            )
+
+        # Format conversation history for cleaner evaluation
+        eval_input["query"] = reformat_conversation_history(
+            eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
+        )
 
         # Call the LLM to evaluate
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", {})
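Worth flagging in review: for the Tool Input Accuracy and Tool Selection evaluators this is a behavior change, not just an added check. The old `if "query" in eval_input:` guard silently skipped conversation-history reformatting when the query was absent and still invoked the LLM; the new guard raises before any LLM call. A toy sketch of the difference (here `ValueError` stands in for `EvaluationException`):

```python
from typing import Any, Dict


def old_guard(eval_input: Dict[str, Any]) -> str:
    # Pre-change: a missing query was tolerated; evaluation still ran.
    if "query" in eval_input:
        return "query reformatted, then evaluated"
    return "evaluated with no query at all"


def new_guard(eval_input: Dict[str, Any]) -> str:
    # Post-change: a missing key and an explicit None both fail fast,
    # before the LLM call is ever made.
    if eval_input.get("query") is None:
        raise ValueError("Query is a required input to the evaluator.")
    return "query reformatted, then evaluated"


print(old_guard({}))               # "evaluated with no query at all" (old, silent)
print(new_guard({"query": "hi"}))  # "query reformatted, then evaluated"
# new_guard({})                    # raises ValueError
# new_guard({"query": None})       # raises too; the old `in` check let this through
```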
@@ -688,3 +688,41 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config):
         assert result is not None
         assert result[key] == 5.0
         assert result[f"{key}_result"] == "pass"
+
+    def test_evaluate_missing_query(self, mock_model_config):
+        """Test that evaluator raises exception when query is None or missing."""
+        evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=flow_side_effect)
+
+        tool_calls = [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_good",
+                "name": "get_weather",
+                "arguments": {"location": "Paris"},
+            }
+        ]
+        tool_definitions = [
+            {
+                "name": "get_weather",
+                "type": "function",
+                "description": "Get weather information",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string", "description": "The location"}},
+                    "required": ["location"],
+                },
+            }
+        ]
+
+        # Test with query=None
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
+
+        # Test with query not provided at all
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
@@ -652,3 +652,46 @@ def test_evaluate_with_single_tool_definition(self, mock_model_config):
         assert result is not None
         assert result[key] == 1
         assert result[f"{key}_result"] == "pass"
+
+    def test_evaluate_missing_query(self, mock_model_config):
+        """Test that evaluator raises exception when query is None or missing."""
+        evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=flow_side_effect)
+
+        response = [
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "tool_call",
+                        "tool_call_id": "call_123",
+                        "name": "get_weather",
+                        "arguments": {"location": "Paris"},
+                    }
+                ],
+            }
+        ]
+        tool_definitions = [
+            {
+                "name": "get_weather",
+                "type": "function",
+                "description": "Get weather information for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string", "description": "The location to get weather for"}},
+                    "required": ["location"],
+                },
+            }
+        ]
+
+        # Test with query=None
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(query=None, response=response, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
+
+        # Test with query not provided at all
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(response=response, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
@@ -284,3 +284,37 @@ def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config):
             evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
 
         assert "Invalid score value" in str(exc_info.value)
+
+    def test_evaluate_tool_selection_missing_query(self, mock_model_config):
+        """Test that evaluator raises exception when query is None or missing."""
+        evaluator = _ToolSelectionEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=tool_selection_flow_side_effect)
+
+        tool_calls = [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_weather",
+                "name": "get_weather",
+                "arguments": {"location": "current"},
+            }
+        ]
+        tool_definitions = [
+            {
+                "name": "get_weather",
+                "type": "function",
+                "description": "Get weather information",
+                "parameters": {"type": "object", "properties": {"location": {"type": "string"}}},
+            }
+        ]
+
+        # Test with query=None
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
+
+        # Test with query not provided at all
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)