diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
index 80c470f02eba..cb1b608dcdb6 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py
@@ -205,6 +205,15 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:  # t
         :return: The evaluation result.
         :rtype: Dict
         """
+        if eval_input.get("query") is None:
+            raise EvaluationException(
+                message=("Query is a required input to the Tool Call Accuracy evaluator."),
+                internal_message=("Query is a required input to the Tool Call Accuracy evaluator."),
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR,
+            )
+
         # Single LLM call for all tool calls
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", {})
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
index 006c877e0db2..0a3d00ddeb12 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py
@@ -153,12 +153,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         :return: A dictionary containing the result of the evaluation.
         :rtype: Dict[str, Union[str, float]]
         """
-        # Format conversation history for cleaner evaluation
-        if "query" in eval_input:
-            eval_input["query"] = reformat_conversation_history(
-                eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
+        if eval_input.get("query") is None:
+            raise EvaluationException(
+                message=("Query is a required input to " "the Tool Input Accuracy evaluator."),
+                internal_message=("Query is a required input " "to the Tool Input Accuracy evaluator."),
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR,
             )
 
+        # Format conversation history for cleaner evaluation
+        eval_input["query"] = reformat_conversation_history(
+            eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
+        )
+
         # Call the LLM to evaluate
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", {})
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
index 616be11b9ee8..48963fa00d58 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py
@@ -175,12 +175,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]:
         :return: A dictionary containing the result of the evaluation.
         :rtype: Dict[str, Union[str, float]]
         """
-        # Format conversation history for cleaner evaluation
-        if "query" in eval_input:
-            eval_input["query"] = reformat_conversation_history(
-                eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
+        if eval_input.get("query") is None:
+            raise EvaluationException(
+                message=("Query is a required input to the Tool Selection evaluator."),
+                internal_message=("Query is a required input to the Tool Selection evaluator."),
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=ErrorTarget.TOOL_SELECTION_EVALUATOR,
             )
 
+        # Format conversation history for cleaner evaluation
+        eval_input["query"] = reformat_conversation_history(
+            eval_input["query"], logger, include_system_messages=True, include_tool_messages=True
+        )
+
         # Call the LLM to evaluate
         prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
         llm_output = prompty_output_dict.get("llm_output", {})
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py
index 7b82c1beb8c3..f84ad64b53b8 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py
@@ -688,3 +688,41 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config):
         assert result is not None
         assert result[key] == 5.0
         assert result[f"{key}_result"] == "pass"
+
+    def test_evaluate_missing_query(self, mock_model_config):
+        """Test that evaluator raises exception when query is None or missing."""
+        evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=flow_side_effect)
+
+        tool_calls = [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_good",
+                "name": "get_weather",
+                "arguments": {"location": "Paris"},
+            }
+        ]
+        tool_definitions = [
+            {
+                "name": "get_weather",
+                "type": "function",
+                "description": "Get weather information",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string", "description": "The location"}},
+                    "required": ["location"],
+                },
+            }
+        ]
+
+        # Test with query=None
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
+
+        # Test with query not provided at all
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py
index c8da3c223b9a..c41193c489ca 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py
@@ -652,3 +652,46 @@ def test_evaluate_with_single_tool_definition(self, mock_model_config):
         assert result is not None
         assert result[key] == 1
         assert result[f"{key}_result"] == "pass"
+
+    def test_evaluate_missing_query(self, mock_model_config):
+        """Test that evaluator raises exception when query is None or missing."""
+        evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=flow_side_effect)
+
+        response = [
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "type": "tool_call",
+                        "tool_call_id": "call_123",
+                        "name": "get_weather",
+                        "arguments": {"location": "Paris"},
+                    }
+                ],
+            }
+        ]
+        tool_definitions = [
+            {
+                "name": "get_weather",
+                "type": "function",
+                "description": "Get weather information for a location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {"location": {"type": "string", "description": "The location to get weather for"}},
+                    "required": ["location"],
+                },
+            }
+        ]
+
+        # Test with query=None
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(query=None, response=response, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
+
+        # Test with query not provided at all
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(response=response, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py
index 8390e30c3e4c..bf23c45f5d43 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py
@@ -284,3 +284,37 @@ def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config
             evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions)
 
         assert "Invalid score value" in str(exc_info.value)
+
+    def test_evaluate_tool_selection_missing_query(self, mock_model_config):
+        """Test that evaluator raises exception when query is None or missing."""
+        evaluator = _ToolSelectionEvaluator(model_config=mock_model_config)
+        evaluator._flow = MagicMock(side_effect=tool_selection_flow_side_effect)
+
+        tool_calls = [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_weather",
+                "name": "get_weather",
+                "arguments": {"location": "current"},
+            }
+        ]
+        tool_definitions = [
+            {
+                "name": "get_weather",
+                "type": "function",
+                "description": "Get weather information",
+                "parameters": {"type": "object", "properties": {"location": {"type": "string"}}},
+            }
+        ]
+
+        # Test with query=None
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
+
+        # Test with query not provided at all
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
+
+        assert "Query is a required input" in str(exc_info.value)
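For reviewers, here is a minimal sketch of the new fail-fast behavior from the caller's side. The `model_config` values are placeholders, and importing `EvaluationException` from the private `azure.ai.evaluation._exceptions` module mirrors what the unit tests above do; this is an illustration of the guard added in this diff, not part of the change itself:

```python
from azure.ai.evaluation import ToolCallAccuracyEvaluator
from azure.ai.evaluation._exceptions import EvaluationException

# Placeholder model configuration; substitute a real endpoint, deployment, and key.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "azure_deployment": "<your-deployment>",
    "api_key": "<your-api-key>",
}

evaluator = ToolCallAccuracyEvaluator(model_config=model_config)

tool_calls = [
    {
        "type": "tool_call",
        "tool_call_id": "call_good",
        "name": "get_weather",
        "arguments": {"location": "Paris"},
    }
]
tool_definitions = [
    {
        "name": "get_weather",
        "type": "function",
        "description": "Get weather information",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    }
]

try:
    # With this change the evaluator raises before any LLM call is made,
    # instead of forwarding an incomplete eval_input to the prompty flow.
    evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions)
except EvaluationException as exc:
    print(exc)  # Query is a required input to the Tool Call Accuracy evaluator.
```

The same guard applies to `_ToolInputAccuracyEvaluator` and `_ToolSelectionEvaluator`; only the message text and the `ErrorTarget` differ per evaluator.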