From 690d697bf3180160af8cd98e6fde0afb71a85322 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Mon, 10 Nov 2025 14:53:56 +0200 Subject: [PATCH 1/4] Ensure query exists for tool-based evaluators --- .../_tool_call_accuracy.py | 9 ++++ .../_tool_input_accuracy.py | 16 +++++-- .../_tool_selection/_tool_selection.py | 16 +++++-- .../test_tool_call_accuracy_evaluator.py | 42 ++++++++++++++++++ .../test_tool_input_accuracy_evaluator.py | 43 +++++++++++++++++++ .../test_tool_selection_evaluator.py | 34 +++++++++++++++ 6 files changed, 152 insertions(+), 8 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index 80c470f02eba..af827d9fbae2 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -205,6 +205,15 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ + if "query" not in eval_input or eval_input.get("query") is None: + raise EvaluationException( + message=("Query is a required input to the Tool Call Accuracy evaluator."), + internal_message=("Query is a required input to the Tool Call Accuracy evaluator."), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_CALL_ACCURACY_EVALUATOR, + ) + # Single LLM call for all tool calls prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) llm_output = prompty_output_dict.get("llm_output", {}) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 006c877e0db2..1802730663dd 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -153,12 +153,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float]] """ - # Format conversation history for cleaner evaluation - if "query" in eval_input: - eval_input["query"] = reformat_conversation_history( - eval_input["query"], logger, include_system_messages=True, include_tool_messages=True + if "query" not in eval_input or eval_input.get("query") is None: + raise EvaluationException( + message=("Query is a required input to " "the Tool Input Accuracy evaluator."), + internal_message=("Query is a required input " "to the Tool Input Accuracy evaluator."), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_INPUT_ACCURACY_EVALUATOR, ) + # Format conversation history for cleaner evaluation + eval_input["query"] = reformat_conversation_history( + eval_input["query"], logger, include_system_messages=True, include_tool_messages=True + ) + # Call the LLM to evaluate prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) llm_output = prompty_output_dict.get("llm_output", {}) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 616be11b9ee8..b1008a8f707c 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -175,12 +175,20 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float]] """ - # Format conversation history for cleaner evaluation - if "query" in eval_input: - eval_input["query"] = reformat_conversation_history( - eval_input["query"], logger, include_system_messages=True, include_tool_messages=True + if "query" not in eval_input or eval_input.get("query") is None: + raise EvaluationException( + message=("Query is a required input to the Tool Selection evaluator."), + internal_message=("Query is a required inputto the Tool Selection evaluator."), + blame=ErrorBlame.USER_ERROR, + category=ErrorCategory.INVALID_VALUE, + target=ErrorTarget.TOOL_SELECTION_EVALUATOR, ) + # Format conversation history for cleaner evaluation + eval_input["query"] = reformat_conversation_history( + eval_input["query"], logger, include_system_messages=True, include_tool_messages=True + ) + # Call the LLM to evaluate prompty_output_dict = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input) llm_output = prompty_output_dict.get("llm_output", {}) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 7b82c1beb8c3..54ea80496540 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -11,6 +11,10 @@ async def flow_side_effect(timeout, **kwargs): tool_calls = kwargs.get("tool_calls", []) query = kwargs.get("query", "") + # Handle None query case + if query is None: + query = "" + # Handle built-in tool calls first - count them as relevant builtin_calls = 0 custom_function_calls = [] @@ -688,3 +692,41 @@ def test_evaluate_open_api_with_tool_definition(self, mock_model_config): assert result is not None assert result[key] == 5.0 assert result[f"{key}_result"] == "pass" + + def test_evaluate_missing_query(self, mock_model_config): + """Test that evaluator raises exception when query is None or missing.""" + evaluator = ToolCallAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + tool_calls = [ + { + "type": "tool_call", + "tool_call_id": "call_good", + "name": "get_weather", + "arguments": {"location": "Paris"}, + } + ] + tool_definitions = [ + { + "name": "get_weather", + "type": "function", + "description": "Get weather information", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string", "description": "The location"}}, + "required": ["location"], + }, + } + ] + + # Test with query=None + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "Query is a required input" in str(exc_info.value) + + # Test with query not provided at all + with pytest.raises(EvaluationException) as exc_info: + evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "Query is a required input" in str(exc_info.value) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py index c8da3c223b9a..c41193c489ca 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_input_accuracy_evaluator.py @@ -652,3 +652,46 @@ def test_evaluate_with_single_tool_definition(self, mock_model_config): assert result is not None assert result[key] == 1 assert result[f"{key}_result"] == "pass" + + def test_evaluate_missing_query(self, mock_model_config): + """Test that evaluator raises exception when query is None or missing.""" + evaluator = _ToolInputAccuracyEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=flow_side_effect) + + response = [ + { + "role": "assistant", + "content": [ + { + "type": "tool_call", + "tool_call_id": "call_123", + "name": "get_weather", + "arguments": {"location": "Paris"}, + } + ], + } + ] + tool_definitions = [ + { + "name": "get_weather", + "type": "function", + "description": "Get weather information for a location", + "parameters": { + "type": "object", + "properties": {"location": {"type": "string", "description": "The location to get weather for"}}, + "required": ["location"], + }, + } + ] + + # Test with query=None + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=None, response=response, tool_definitions=tool_definitions) + + assert "Query is a required input" in str(exc_info.value) + + # Test with query not provided at all + with pytest.raises(EvaluationException) as exc_info: + evaluator(response=response, tool_definitions=tool_definitions) + + assert "Query is a required input" in str(exc_info.value) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py index 8390e30c3e4c..bf23c45f5d43 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_selection_evaluator.py @@ -284,3 +284,37 @@ def test_evaluate_tool_selection_exception_invalid_score(self, mock_model_config evaluator(query=query, tool_calls=tool_calls, tool_definitions=tool_definitions) assert "Invalid score value" in str(exc_info.value) + + def test_evaluate_tool_selection_missing_query(self, mock_model_config): + """Test that evaluator raises exception when query is None or missing.""" + evaluator = _ToolSelectionEvaluator(model_config=mock_model_config) + evaluator._flow = MagicMock(side_effect=tool_selection_flow_side_effect) + + tool_calls = [ + { + "type": "tool_call", + "tool_call_id": "call_weather", + "name": "get_weather", + "arguments": {"location": "current"}, + } + ] + tool_definitions = [ + { + "name": "get_weather", + "type": "function", + "description": "Get weather information", + "parameters": {"type": "object", "properties": {"location": {"type": "string"}}}, + } + ] + + # Test with query=None + with pytest.raises(EvaluationException) as exc_info: + evaluator(query=None, tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "Query is a required input" in str(exc_info.value) + + # Test with query not provided at all + with pytest.raises(EvaluationException) as exc_info: + evaluator(tool_calls=tool_calls, tool_definitions=tool_definitions) + + assert "Query is a required input" in str(exc_info.value) From 72c53748ddec6983915b50451335e215d7ba065e Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Mon, 10 Nov 2025 14:57:02 +0200 Subject: [PATCH 2/4] Remove unnecessary condition in test --- .../tests/unittests/test_tool_call_accuracy_evaluator.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py index 54ea80496540..f84ad64b53b8 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py @@ -11,10 +11,6 @@ async def flow_side_effect(timeout, **kwargs): tool_calls = kwargs.get("tool_calls", []) query = kwargs.get("query", "") - # Handle None query case - if query is None: - query = "" - # Handle built-in tool calls first - count them as relevant builtin_calls = 0 custom_function_calls = [] From 9322d83152af710ada9378bec8733d51e8904013 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Mon, 10 Nov 2025 15:13:52 +0200 Subject: [PATCH 3/4] Fix condition --- .../_evaluators/_tool_call_accuracy/_tool_call_accuracy.py | 2 +- .../_evaluators/_tool_input_accuracy/_tool_input_accuracy.py | 2 +- .../evaluation/_evaluators/_tool_selection/_tool_selection.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py index af827d9fbae2..cb1b608dcdb6 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py @@ -205,7 +205,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t :return: The evaluation result. :rtype: Dict """ - if "query" not in eval_input or eval_input.get("query") is None: + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Call Accuracy evaluator."), internal_message=("Query is a required input to the Tool Call Accuracy evaluator."), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py index 1802730663dd..0a3d00ddeb12 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_input_accuracy/_tool_input_accuracy.py @@ -153,7 +153,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float]] """ - if "query" not in eval_input or eval_input.get("query") is None: + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to " "the Tool Input Accuracy evaluator."), internal_message=("Query is a required input " "to the Tool Input Accuracy evaluator."), diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index b1008a8f707c..41f868e8185e 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -175,7 +175,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: :return: A dictionary containing the result of the evaluation. :rtype: Dict[str, Union[str, float]] """ - if "query" not in eval_input or eval_input.get("query") is None: + if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Selection evaluator."), internal_message=("Query is a required inputto the Tool Selection evaluator."), From 9e4fdb86b2deca331e11a801a511e3f4e1d06b95 Mon Sep 17 00:00:00 2001 From: Salma Elshafey Date: Mon, 10 Nov 2025 16:03:20 +0200 Subject: [PATCH 4/4] Fix spelling mistake --- .../evaluation/_evaluators/_tool_selection/_tool_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py index 41f868e8185e..48963fa00d58 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_selection/_tool_selection.py @@ -178,7 +178,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: if eval_input.get("query") is None: raise EvaluationException( message=("Query is a required input to the Tool Selection evaluator."), - internal_message=("Query is a required inputto the Tool Selection evaluator."), + internal_message=("Query is a required input to the Tool Selection evaluator."), blame=ErrorBlame.USER_ERROR, category=ErrorCategory.INVALID_VALUE, target=ErrorTarget.TOOL_SELECTION_EVALUATOR,