Commit d0c3eea

Rename Tool Success to Tool Call Success (#43810)
* rename
* update import
* updates
* updates
* black

1 parent bb35e87
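For context, a minimal before/after sketch of the import change this rename implies, based only on the module paths visible in the diffs below (the class is private and also re-exported from the package root per the __init__.py diff):

# Before this commit (old private module and class name):
# from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator

# After this commit the renamed class lives in the renamed module and,
# per the __init__.py diff below, is importable from the package root:
from azure.ai.evaluation import _ToolCallSuccessEvaluator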

10 files changed: +328 −95 lines

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py
Lines changed: 2 additions & 0 deletions

@@ -33,6 +33,7 @@
 from ._evaluators._tool_call_accuracy import ToolCallAccuracyEvaluator
 from ._evaluators._document_retrieval import DocumentRetrievalEvaluator
 from ._evaluators._tool_output_utilization import _ToolOutputUtilizationEvaluator
+from ._evaluators._tool_call_success import _ToolCallSuccessEvaluator
 from ._model_configurations import (
     AzureAIProject,
     AzureOpenAIModelConfiguration,
@@ -133,6 +134,7 @@ def lazy_import():
     "UngroundedAttributesEvaluator",
     "ToolCallAccuracyEvaluator",
     "_ToolOutputUtilizationEvaluator",
+    "_ToolCallSuccessEvaluator",
     "AzureOpenAIGrader",
     "AzureOpenAILabelGrader",
     "AzureOpenAIStringCheckGrader",

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/constants.py
Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@
     "tool_output_utilization",
     "task_completion",
     "tool_input_accuracy",
-    "tool_success",
+    "tool_call_success",
     "tool_call_accuracy",
 ]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py
Lines changed: 0 additions & 1 deletion

@@ -143,7 +143,6 @@ class _EvaluatorMetricMapping:
         "hate_unfairness": ["hate_unfairness"],
         "tool_input_accuracy": ["tool_input_accuracy"],
         "task_completion": ["task_completion"],
-        "tool_success": ["tool_success"],
         "tool_call_success": ["tool_call_success"],
         "tool_selection": ["tool_selection"],
         "tool_output_utilization": ["tool_output_utilization"],

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_eval_mapping.py
Lines changed: 8 additions & 4 deletions

@@ -12,10 +12,14 @@
 # Import all evals
 from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator
 from azure.ai.evaluation._evaluators._task_completion import _TaskCompletionEvaluator
-from azure.ai.evaluation._evaluators._tool_input_accuracy import _ToolInputAccuracyEvaluator
+from azure.ai.evaluation._evaluators._tool_input_accuracy import (
+    _ToolInputAccuracyEvaluator,
+)
 from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator
-from azure.ai.evaluation._evaluators._tool_success import _ToolSuccessEvaluator
-from azure.ai.evaluation._evaluators._task_navigation_efficiency import _TaskNavigationEfficiencyEvaluator
+from azure.ai.evaluation._evaluators._tool_call_success import _ToolCallSuccessEvaluator
+from azure.ai.evaluation._evaluators._task_navigation_efficiency import (
+    _TaskNavigationEfficiencyEvaluator,
+)
 from azure.ai.evaluation import (
     BleuScoreEvaluator,
     CodeVulnerabilityEvaluator,
@@ -77,7 +81,7 @@
     ToolCallAccuracyEvaluator: "tool_call_accuracy",
     _ToolInputAccuracyEvaluator: "tool_input_accuracy",
     _ToolSelectionEvaluator: "tool_selection",
-    _ToolSuccessEvaluator: "tool_success",
+    _ToolCallSuccessEvaluator: "tool_call_success",
     UngroundedAttributesEvaluator: "ungrounded_attributes",
     ViolenceEvaluator: "violence",
 }
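This table is what routes an evaluator class to its cloud metric name. A minimal lookup sketch restating the entries visible in the diff (the dict's actual variable name in _eval_mapping.py is not shown above, so EVAL_CLASS_MAP is an assumed stand-in):

# Hedged sketch; EVAL_CLASS_MAP is an assumed name, entries restate the diff above.
from azure.ai.evaluation._evaluators._tool_call_success import _ToolCallSuccessEvaluator
from azure.ai.evaluation._evaluators._tool_selection import _ToolSelectionEvaluator

EVAL_CLASS_MAP = {
    _ToolSelectionEvaluator: "tool_selection",
    _ToolCallSuccessEvaluator: "tool_call_success",
}

# After this commit, the renamed class resolves to the renamed metric.
assert EVAL_CLASS_MAP[_ToolCallSuccessEvaluator] == "tool_call_success"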

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_success/__init__.py renamed to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/__init__.py
Lines changed: 2 additions & 2 deletions

@@ -2,6 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
 
-from ._tool_success import _ToolSuccessEvaluator
+from ._tool_call_success import _ToolCallSuccessEvaluator
 
-__all__ = ["_ToolSuccessEvaluator"]
+__all__ = ["_ToolCallSuccessEvaluator"]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_success/_tool_success.py renamed to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/_tool_call_success.py
Lines changed: 27 additions & 22 deletions

@@ -6,7 +6,12 @@
 import logging
 from typing import Dict, Union, List, Optional
 from typing_extensions import overload, override
-from azure.ai.evaluation._exceptions import EvaluationException, ErrorBlame, ErrorCategory, ErrorTarget
+from azure.ai.evaluation._exceptions import (
+    EvaluationException,
+    ErrorBlame,
+    ErrorCategory,
+    ErrorTarget,
+)
 from azure.ai.evaluation._evaluators._common import PromptyEvaluatorBase
 from azure.ai.evaluation._common._experimental import experimental
 
@@ -15,8 +20,8 @@
 
 
 @experimental
-class _ToolSuccessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
-    """The Tool Success evaluator determines whether tool calls done by an AI agent includes failures or not.
+class _ToolCallSuccessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
+    """The Tool Call Success evaluator determines whether tool calls done by an AI agent includes failures or not.
 
     This evaluator focuses solely on tool call results and tool definitions, disregarding user's query to
     the agent, conversation history and agent's final response. Although tool definitions is optional,
@@ -36,34 +41,34 @@ class _ToolSuccessEvaluator(PromptyEvaluatorBase[Union[str, float]]):
 
     .. admonition:: Example:
         .. literalinclude:: ../samples/evaluation_samples_evaluate.py
-            :start-after: [START tool_success_evaluator]
-            :end-before: [END tool_success_evaluator]
+            :start-after: [START tool_call_success_evaluator]
+            :end-before: [END tool_call_success_evaluator]
             :language: python
            :dedent: 8
-            :caption: Initialize and call a _ToolSuccessEvaluator with a tool definitions and response.
+            :caption: Initialize and call a _ToolCallSuccessEvaluator with a tool definitions and response.
 
     .. admonition:: Example using Azure AI Project URL:
 
         .. literalinclude:: ../samples/evaluation_samples_evaluate_fdp.py
-            :start-after: [START tool_success_evaluator]
-            :end-before: [END tool_success_evaluator]
+            :start-after: [START tool_call_success_evaluator]
+            :end-before: [END tool_call_success_evaluator]
             :language: python
            :dedent: 8
-            :caption: Initialize and call a _ToolSuccessEvaluator using Azure AI Project URL in the following
+            :caption: Initialize and call a _ToolCallSuccessEvaluator using Azure AI Project URL in the following
                 format https://{resource_name}.services.ai.azure.com/api/projects/{project_name}
 
     """
 
-    _PROMPTY_FILE = "tool_success.prompty"
-    _RESULT_KEY = "tool_success"
+    _PROMPTY_FILE = "tool_call_success.prompty"
+    _RESULT_KEY = "tool_call_success"
     _OPTIONAL_PARAMS = ["tool_definitions"]
 
-    id = "azureai://built-in/evaluators/tool_success"
+    id = "azureai://built-in/evaluators/tool_call_success"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
     @override
     def __init__(self, model_config, *, credential=None, **kwargs):
-        """Initialize the Tool Success evaluator."""
+        """Initialize the Tool Call Success evaluator."""
         current_dir = os.path.dirname(__file__)
         prompty_path = os.path.join(current_dir, self._PROMPTY_FILE)
         super().__init__(
@@ -86,7 +91,7 @@ def __call__(
         """Evaluate tool call success for a given response, and optionally tool definitions.
 
         Example with list of messages:
-            evaluator = _ToolSuccessEvaluator(model_config)
+            evaluator = _ToolCallSuccessEvaluator(model_config)
            response = [{'createdAt': 1700000070, 'run_id': '0', 'role': 'assistant',
            'content': [{'type': 'text', 'text': '**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)...'}]}]
 
@@ -97,7 +102,7 @@ def __call__(
         :paramtype response: Union[str, List[dict]]
         :keyword tool_definitions: Optional tool definitions to use for evaluation.
         :paramtype tool_definitions: Union[dict, List[dict]]
-        :return: A dictionary with the tool success evaluation results.
+        :return: A dictionary with the Tool Call Success evaluation results.
         :rtype: Dict[str, Union[str, float]]
         """
 
@@ -116,7 +121,7 @@ def __call__(  # pylint: disable=docstring-missing-param
 
     @override
     async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # type: ignore[override]
-        """Do Tool Success evaluation.
+        """Do Tool Call Success evaluation.
 
         :param eval_input: The input to the evaluator. Expected to contain whatever inputs are
            needed for the _flow method
@@ -126,19 +131,19 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[str, float]]:  # type: ignore[override]
        """
        if "response" not in eval_input:
            raise EvaluationException(
-                message="response is a required input to the Tool Success evaluator.",
-                internal_message="response is a required input to the Tool Success evaluator.",
+                message="response is a required input to the Tool Call Success evaluator.",
+                internal_message="response is a required input to the Tool Call Success evaluator.",
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.MISSING_FIELD,
-                target=ErrorTarget.TOOL_SUCCESS_EVALUATOR,
+                target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
            )
        if eval_input["response"] is None or eval_input["response"] == []:
            raise EvaluationException(
-                message="response cannot be None or empty for the Tool Success evaluator.",
-                internal_message="response cannot be None or empty for the Tool Success evaluator.",
+                message="response cannot be None or empty for the Tool Call Success evaluator.",
+                internal_message="response cannot be None or empty for the Tool Call Success evaluator.",
                blame=ErrorBlame.USER_ERROR,
                category=ErrorCategory.INVALID_VALUE,
-                target=ErrorTarget.TOOL_SUCCESS_EVALUATOR,
+                target=ErrorTarget.TOOL_CALL_SUCCESS_EVALUATOR,
            )
 
        eval_input["tool_calls"] = _reformat_tool_calls_results(eval_input["response"], logger)
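To make the docstring example above concrete, here is a hedged usage sketch. The endpoint, deployment, and key values are placeholders; the result key follows the _RESULT_KEY shown in the diff, though the full shape of the returned dictionary is not shown in this commit:

from azure.ai.evaluation import AzureOpenAIModelConfiguration, _ToolCallSuccessEvaluator

# Placeholder configuration; substitute a real Azure OpenAI resource.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    azure_deployment="<deployment>",
    api_key="<api-key>",
)

evaluator = _ToolCallSuccessEvaluator(model_config)

# Agent response in the list-of-messages form from the docstring example.
response = [{"createdAt": 1700000070, "run_id": "0", "role": "assistant",
             "content": [{"type": "text",
                          "text": "**Day 1:** Morning: Visit Louvre Museum (9 AM - 12 PM)..."}]}]

result = evaluator(response=response)
print(result["tool_call_success"])  # key per _RESULT_KEY; the result may carry more fields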
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_success/tool_success.prompty renamed to sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_success/tool_call_success.prompty
Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 ---
-name: Tool Success
+name: Tool Call Success
 description: Evaluates whether a Tool call was successful or resulted in a technical error
 model:
   api: chat

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py
Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ class ErrorTarget(Enum):
     SIMILARITY_EVALUATOR = "SimilarityEvaluator"
     FLUENCY_EVALUATOR = "FluencyEvaluator"
     RETRIEVAL_EVALUATOR = "RetrievalEvaluator"
-    TOOL_SUCCESS_EVALUATOR = "_ToolSuccessEvaluator"
+    TOOL_CALL_SUCCESS_EVALUATOR = "_ToolCallSuccessEvaluator"
     TASK_ADHERENCE_EVALUATOR = "TaskAdherenceEvaluator"
     TASK_COMPLETION_EVALUATOR = "_TaskCompletionEvaluator"
     INDIRECT_ATTACK_EVALUATOR = "IndirectAttackEvaluator"
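A short sketch of the validation behavior this rename touches: per the _do_eval diff above, an empty response should surface as a user-error EvaluationException tagged with the renamed target. This is a hedged illustration, not a test from this commit; model_config is the placeholder from the usage sketch above:

from azure.ai.evaluation import _ToolCallSuccessEvaluator
from azure.ai.evaluation._exceptions import EvaluationException

evaluator = _ToolCallSuccessEvaluator(model_config)  # model_config as sketched earlier
try:
    evaluator(response=[])  # empty response is rejected in _do_eval
except EvaluationException as exc:
    # Expected message per the diff:
    # "response cannot be None or empty for the Tool Call Success evaluator."
    print(exc)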
