Commit 8ac8b3e

[EvaluationResult Convert] Count only primary metrics when an evaluator reports multiple metrics, and exclude errored results from passed/failed counts (#43878)
* update
* rename
* run black
* fix result counts
* update
* Fix bug
* run black
* fix bug
* Add UT
* fix bug: handle null value for summary counts
* address comments
* Update counts to ignore non-primary metrics when multiple metrics
* update primary sequence
* update to get eval name then metrics
* update doc string and address comments
* update result count to exclude errored counts for passed/failed counts
* add the renamed evaluator into mappings
* run black
* handle empty string for token counts
1 parent 484d5d0 commit 8ac8b3e

File tree: 4 files changed (+67 -12 lines)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_constants.py

Lines changed: 4 additions & 2 deletions
@@ -97,14 +97,15 @@ class _EvaluatorMetricMapping:
     """
     Static mapping of evaluator names to their metric names, based on assets.json.
     The 'builtin.' prefix is removed from the evaluator name keys.
+    If an evaluator maps to multiple metrics, all metric names are included in the list, and the first one is considered the primary metric.
     """
 
     EVALUATOR_NAME_METRICS_MAPPINGS = {
         "bleu_score": ["bleu"],
         "coherence": ["coherence"],
         "document_retrieval": [
-            "ndcg@3",
             "xdcg@3",
+            "ndcg@3",
             "fidelity",
             "top1_relevance",
             "top3_max_relevance",
@@ -119,7 +120,7 @@ class _EvaluatorMetricMapping:
         "meteor_score": ["meteor"],
         "relevance": ["relevance"],
         "response_completeness": ["response_completeness"],
-        "rouge_score": ["rouge_precision", "rouge_recall", "rouge_f1_score"],
+        "rouge_score": ["rouge_f1_score", "rouge_precision", "rouge_recall"],
         "groundedness_pro": ["groundedness_pro"],
         "similarity": ["similarity"],
         "intent_resolution": ["intent_resolution"],
@@ -143,6 +144,7 @@ class _EvaluatorMetricMapping:
         "tool_input_accuracy": ["tool_input_accuracy"],
         "task_completion": ["task_completion"],
         "tool_success": ["tool_success"],
+        "tool_call_success": ["tool_call_success"],
         "tool_selection": ["tool_selection"],
         "tool_output_utilization": ["tool_output_utilization"],
         "task_navigation_efficiency": ["task_navigation_efficiency"],

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 57 additions & 9 deletions
@@ -2004,6 +2004,7 @@ def _convert_results_to_aoai_evaluation_results(
                 )
             elif metric_key.endswith("_total_tokens"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                metric_value = None if _is_none_or_nan(metric_value) else metric_value
                 if metric not in result_per_metric:
                     result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
                 elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
@@ -2021,6 +2022,7 @@
                 )
             elif metric_key.endswith("_prompt_tokens"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                metric_value = None if _is_none_or_nan(metric_value) else metric_value
                 if metric not in result_per_metric:
                     result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
                 elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
@@ -2038,6 +2040,7 @@
                 )
             elif metric_key.endswith("_completion_tokens"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+                metric_value = None if _is_none_or_nan(metric_value) else metric_value
                 if metric not in result_per_metric:
                     result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
                 elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
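
Together with the broadened _is_none_or_nan check later in this file, the three guards added above normalize blank or NaN token counts to None before they are stored in the per-sample usage dict. A hedged, self-contained sketch of the intended effect (the helper is re-implemented inline for illustration; the values are hypothetical):

import math

def _is_none_or_nan(value):
    # Inline stand-in mirroring the updated helper in this module (illustration only).
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return True
    return isinstance(value, str) and value.lower() in ["nan", "null", "none", ""]

metric_value = ""  # e.g. an empty "_total_tokens" column in the evaluation output
metric_value = None if _is_none_or_nan(metric_value) else metric_value
print({"sample": {"usage": {"total_tokens": metric_value}}})
# {'sample': {'usage': {'total_tokens': None}}}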
@@ -2195,7 +2198,7 @@
         f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
     )
     # Calculate summary statistics
-    evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger)
+    evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger, criteria_name_types_from_meta)
     results["_evaluation_summary"] = evaluation_summary
     logger.info(
         f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
@@ -2215,7 +2218,7 @@ def _is_none_or_nan(value: Any) -> bool:
         return True
     if isinstance(value, float) and math.isnan(value):
         return True
-    if isinstance(value, str) and value.lower() in ["nan", "null", "none"]:
+    if isinstance(value, str) and value.lower() in ["nan", "null", "none", ""]:
         return True
     return False
 
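For reference, a hedged check of what the broadened helper now treats as missing (the import path is taken from the file shown above; expectations are derived only from the branches visible in this hunk):

from azure.ai.evaluation._evaluate._evaluate import _is_none_or_nan

for value in (float("nan"), "NaN", "null", "none", "", "0", 0):
    print(repr(value), _is_none_or_nan(value))
# Expected: True for NaN, "NaN", "null", "none", and "" (the empty string is new); False for "0" and 0.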

@@ -2314,7 +2317,34 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
     return metric
 
 
-def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger) -> Dict[str, Any]:
+def _is_primary_metric(metric_name: str, evaluator_name: str) -> bool:
+    """
+    Check if the given metric name is a primary metric.
+
+    :param metric_name: The name of the metric
+    :type metric_name: str
+    :param evaluator_name: The name of the evaluator
+    :type evaluator_name: str
+    :return: True if the metric is a primary metric, False otherwise
+    :rtype: bool
+    """
+    if (
+        not _is_none_or_nan(metric_name)
+        and not _is_none_or_nan(evaluator_name)
+        and evaluator_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS
+        and isinstance(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name], list)
+        and len(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]) > 1
+        and metric_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]
+        and metric_name.lower() != _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name][0].lower()
+    ):
+        return False
+    else:
+        return True
+
+
+def _calculate_aoai_evaluation_summary(
+    aoai_results: list, logger: logging.Logger, criteria_name_types_from_meta: Optional[Dict[str, Any]]
+) -> Dict[str, Any]:
     """
     Calculate summary statistics for AOAI evaluation results.
 
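A hedged usage sketch for the new helper above, using evaluator and metric names that appear in the mapping changes earlier in this commit (the import path is taken from the file under change):

from azure.ai.evaluation._evaluate._evaluate import _is_primary_metric

# rouge_score now maps to ["rouge_f1_score", "rouge_precision", "rouge_recall"],
# so only the first entry counts as primary.
print(_is_primary_metric("rouge_f1_score", "rouge_score"))  # True
print(_is_primary_metric("rouge_recall", "rouge_score"))    # False
# Single-metric evaluators (e.g. "bleu_score" -> ["bleu"]) are always primary.
print(_is_primary_metric("bleu", "bleu_score"))             # True
# Unknown evaluators fall through to True, so custom criteria are never skipped.
print(_is_primary_metric("my_metric", "my_custom_eval"))    # True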
@@ -2344,9 +2374,25 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
         )
         for result_item in aoai_result["results"]:
             if isinstance(result_item, dict):
+                testing_criteria = result_item.get("name", "")
+                is_primary_metric = True
+                if (
+                    criteria_name_types_from_meta is not None
+                    and isinstance(criteria_name_types_from_meta, dict)
+                    and testing_criteria in criteria_name_types_from_meta
+                ):
+                    evaluator_name = criteria_name_types_from_meta[testing_criteria].get("evaluator_name", None)
+                    criteria_type = criteria_name_types_from_meta[testing_criteria].get("type", None)
+                    if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
+                        evaluator_name = evaluator_name.replace("builtin.", "")
+                    is_primary_metric = _is_primary_metric(result_item.get("metric", ""), evaluator_name)
+                if not is_primary_metric:
+                    logger.info(
+                        f"Skip counts for non-primary metric for testing_criteria: {testing_criteria}, metric: {result_item.get('metric', '')}"
+                    )
+                    continue
                 # Check if the result has a 'passed' field
                 if "passed" in result_item and result_item["passed"] is not None:
-                    testing_criteria = result_item.get("name", "")
                     if testing_criteria not in result_counts_stats:
                         result_counts_stats[testing_criteria] = {
                             "testing_criteria": testing_criteria,
@@ -2372,15 +2418,14 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
         elif isinstance(aoai_result, dict) and aoai_result.get("status") == "error":
             error_count += 1
 
+        # Update overall result counts, error counts will not be considered for passed/failed
         if error_count > 0:
             result_counts["errored"] += 1
-        elif failed_count > 0:
+
+        if failed_count > 0:
             result_counts["failed"] += 1
         elif (
-            error_count == 0
-            and failed_count == 0
-            and passed_count > 0
-            and passed_count == len(aoai_result.get("results", []))
+            failed_count == 0 and passed_count > 0 and passed_count == len(aoai_result.get("results", [])) - error_count
         ):
             result_counts["passed"] += 1
 
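To make the revised per-row rule concrete, a hedged worked example with hypothetical tallies (the loop that produces these counters is only partly visible in this diff):

# One row with three results: two passed, none failed, one errored.
passed_count, failed_count, error_count, results_len = 2, 0, 1, 3

errored = error_count > 0          # True: the row still counts toward "errored"
failed = failed_count > 0          # False: errors alone no longer imply failure
passed = (
    failed_count == 0
    and passed_count > 0
    and passed_count == results_len - error_count
)                                  # True: errored results are excluded from the denominator
print(errored, failed, passed)     # Previously this row would have counted only as errored.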

@@ -2428,6 +2473,9 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
             cur_cached_tokens = usage_data.get("cached_tokens", 0)
             if _is_none_or_nan(cur_cached_tokens):
                 cur_cached_tokens = 0
+            logger.info(
+                f"Model: {model_name}, cur_total_tokens: {cur_total_tokens}, {_is_none_or_nan(cur_total_tokens)}, cur_prompt_tokens: {cur_prompt_tokens}, cur_completion_tokens: {cur_completion_tokens}, cur_cached_tokens: {cur_cached_tokens}"
+            )
             model_stats["total_tokens"] += cur_total_tokens
             model_stats["prompt_tokens"] += cur_prompt_tokens
             model_stats["completion_tokens"] += cur_completion_tokens

sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_eval_meta_data.json

Lines changed: 5 additions & 0 deletions
@@ -11,6 +11,11 @@
       "name": "violence",
       "evaluator_name": "violence"
     },
+    {
+      "type": "azure_ai_evaluator",
+      "name": "self_harm",
+      "evaluator_name": "builtin.self_harm"
+    },
     {
       "type": "azure_ai_evaluator",
       "id": "ViolenceContentCustomEvaluator_35feb949-e01b-4502-8011-d22347d092af",

sdk/evaluation/azure-ai-evaluation/tests/unittests/data/evaluation_util_convert_expected_output.json

Lines changed: 1 addition & 1 deletion
@@ -458,7 +458,7 @@
       "result_counts": {
         "total": 2,
         "errored": 2,
-        "failed": 0,
+        "failed": 2,
         "passed": 0
       },
       "per_model_usage": [
