@@ -2004,6 +2004,7 @@ def _convert_results_to_aoai_evaluation_results(
             )
         elif metric_key.endswith("_total_tokens"):
             metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+            metric_value = None if _is_none_or_nan(metric_value) else metric_value
             if metric not in result_per_metric:
                 result_per_metric[metric] = {"sample": {"usage": {"total_tokens": metric_value}}}
             elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
@@ -2021,6 +2022,7 @@ def _convert_results_to_aoai_evaluation_results(
             )
         elif metric_key.endswith("_prompt_tokens"):
             metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+            metric_value = None if _is_none_or_nan(metric_value) else metric_value
             if metric not in result_per_metric:
                 result_per_metric[metric] = {"sample": {"usage": {"prompt_tokens": metric_value}}}
             elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
@@ -2038,6 +2040,7 @@ def _convert_results_to_aoai_evaluation_results(
             )
         elif metric_key.endswith("_completion_tokens"):
             metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
+            metric_value = None if _is_none_or_nan(metric_value) else metric_value
             if metric not in result_per_metric:
                 result_per_metric[metric] = {"sample": {"usage": {"completion_tokens": metric_value}}}
             elif metric in result_per_metric and "sample" not in result_per_metric[metric]:
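All three token-usage branches gain the same guard: a NaN token count is coerced to `None` before it is nested into the `sample.usage` payload. A minimal standalone sketch of that guard (the `_normalize_usage_value` name is mine, not the SDK's):

```python
import math
from typing import Any, Optional


def _normalize_usage_value(metric_value: Any) -> Optional[Any]:
    # Mirrors the added line: NaN (or None) token counts become None so the
    # usage dict serializes cleanly instead of carrying float("nan").
    if metric_value is None:
        return None
    if isinstance(metric_value, float) and math.isnan(metric_value):
        return None
    return metric_value


assert _normalize_usage_value(float("nan")) is None
assert _normalize_usage_value(128) == 128
sample = {"sample": {"usage": {"total_tokens": _normalize_usage_value(float("nan"))}}}
assert sample["sample"]["usage"]["total_tokens"] is None
```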
@@ -2195,7 +2198,7 @@ def _convert_results_to_aoai_evaluation_results(
         f"Converted {len(converted_rows)} rows to AOAI evaluation format, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
     )
     # Calculate summary statistics
-    evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger)
+    evaluation_summary = _calculate_aoai_evaluation_summary(converted_rows, logger, criteria_name_types_from_meta)
     results["_evaluation_summary"] = evaluation_summary
     logger.info(
         f"Summary statistics calculated for {len(converted_rows)} rows, eval_id: {eval_id}, eval_run_id: {eval_run_id}"
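The new third argument threads testing-criteria metadata into the summary pass. Judging only from how it is read further down in this diff, the dict looks roughly like this (a reconstruction; any key beyond `type` and `evaluator_name` is an assumption):

```python
# Shape inferred from the .get("type") / .get("evaluator_name") accesses below.
criteria_name_types_from_meta = {
    "my_coherence_check": {                     # keyed by testing criteria name
        "type": "azure_ai_evaluator",
        "evaluator_name": "builtin.coherence",  # "builtin." prefix stripped later
    },
}
```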
@@ -2215,7 +2218,7 @@ def _is_none_or_nan(value: Any) -> bool:
         return True
     if isinstance(value, float) and math.isnan(value):
         return True
-    if isinstance(value, str) and value.lower() in ["nan", "null", "none"]:
+    if isinstance(value, str) and value.lower() in ["nan", "null", "none", ""]:
         return True
     return False
 
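With `""` added to the sentinel list, an empty string is now treated as missing alongside `None`, `float("nan")`, and the string spellings. A quick self-contained check of the patched behavior:

```python
import math
from typing import Any


def _is_none_or_nan(value: Any) -> bool:
    # Copy of the helper as patched above, for illustration.
    if value is None:
        return True
    if isinstance(value, float) and math.isnan(value):
        return True
    if isinstance(value, str) and value.lower() in ["nan", "null", "none", ""]:
        return True
    return False


assert _is_none_or_nan("")          # newly treated as missing
assert _is_none_or_nan("NaN") and _is_none_or_nan(float("nan"))
assert not _is_none_or_nan(0)       # falsy numbers are still real values
assert not _is_none_or_nan("0")
```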
@@ -2314,7 +2317,34 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
     return metric
 
 
-def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logger) -> Dict[str, Any]:
+def _is_primary_metric(metric_name: str, evaluator_name: str) -> bool:
+    """
+    Check if the given metric name is a primary metric.
+
+    :param metric_name: The name of the metric
+    :type metric_name: str
+    :param evaluator_name: The name of the evaluator
+    :type evaluator_name: str
+    :return: True if the metric is a primary metric, False otherwise
+    :rtype: bool
+    """
+    if (
+        not _is_none_or_nan(metric_name)
+        and not _is_none_or_nan(evaluator_name)
+        and evaluator_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS
+        and isinstance(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name], list)
+        and len(_EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]) > 1
+        and metric_name in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name]
+        and metric_name.lower() != _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS[evaluator_name][0].lower()
+    ):
+        return False
+    else:
+        return True
+
+
+def _calculate_aoai_evaluation_summary(
+    aoai_results: list, logger: logging.Logger, criteria_name_types_from_meta: Optional[Dict[str, Any]]
+) -> Dict[str, Any]:
     """
     Calculate summary statistics for AOAI evaluation results.
 
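`_is_primary_metric` defaults to `True` and only returns `False` when the evaluator maps to several metrics and the given metric is not the first (primary) entry in that list. A sketch with a stand-in mapping (the table contents here are illustrative, not the real `_EvaluatorMetricMapping` data):

```python
class _EvaluatorMetricMapping:
    # Stand-in table; the real mapping lives in the SDK. First entry = primary.
    EVALUATOR_NAME_METRICS_MAPPINGS = {
        "coherence": ["coherence", "gpt_coherence"],
    }


def _is_primary_metric(metric_name: str, evaluator_name: str) -> bool:
    # Same decision as the patched helper, with the none/nan guards simplified
    # to plain truthiness checks.
    metrics = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name)
    if (
        metric_name and evaluator_name
        and isinstance(metrics, list) and len(metrics) > 1
        and metric_name in metrics
        and metric_name.lower() != metrics[0].lower()
    ):
        return False
    return True


assert _is_primary_metric("coherence", "coherence")          # first entry: primary
assert not _is_primary_metric("gpt_coherence", "coherence")  # secondary alias: skipped
assert _is_primary_metric("anything", "unmapped_evaluator")  # unknown: default True
```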
@@ -2344,9 +2374,25 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
         )
         for result_item in aoai_result["results"]:
             if isinstance(result_item, dict):
+                testing_criteria = result_item.get("name", "")
+                is_primary_metric = True
+                if (
+                    criteria_name_types_from_meta is not None
+                    and isinstance(criteria_name_types_from_meta, dict)
+                    and testing_criteria in criteria_name_types_from_meta
+                ):
+                    evaluator_name = criteria_name_types_from_meta[testing_criteria].get("evaluator_name", None)
+                    criteria_type = criteria_name_types_from_meta[testing_criteria].get("type", None)
+                    if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
+                        evaluator_name = evaluator_name.replace("builtin.", "")
+                    is_primary_metric = _is_primary_metric(result_item.get("metric", ""), evaluator_name)
+                if not is_primary_metric:
+                    logger.info(
+                        f"Skip counts for non-primary metric for testing_criteria: {testing_criteria}, metric: {result_item.get('metric', '')}"
+                    )
+                    continue
                 # Check if the result has a 'passed' field
                 if "passed" in result_item and result_item["passed"] is not None:
-                    testing_criteria = result_item.get("name", "")
                     if testing_criteria not in result_counts_stats:
                         result_counts_stats[testing_criteria] = {
                             "testing_criteria": testing_criteria,
@@ -2372,15 +2418,14 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
23722418 elif isinstance (aoai_result , dict ) and aoai_result .get ("status" ) == "error" :
23732419 error_count += 1
23742420
2421+ # Update overall result counts, error counts will not be considered for passed/failed
23752422 if error_count > 0 :
23762423 result_counts ["errored" ] += 1
2377- elif failed_count > 0 :
2424+
2425+ if failed_count > 0 :
23782426 result_counts ["failed" ] += 1
23792427 elif (
2380- error_count == 0
2381- and failed_count == 0
2382- and passed_count > 0
2383- and passed_count == len (aoai_result .get ("results" , []))
2428+ failed_count == 0 and passed_count > 0 and passed_count == len (aoai_result .get ("results" , [])) - error_count
23842429 ):
23852430 result_counts ["passed" ] += 1
23862431
@@ -2428,6 +2473,9 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
                 cur_cached_tokens = usage_data.get("cached_tokens", 0)
                 if _is_none_or_nan(cur_cached_tokens):
                     cur_cached_tokens = 0
+                logger.info(
+                    f"Model: {model_name}, cur_total_tokens: {cur_total_tokens}, {_is_none_or_nan(cur_total_tokens)}, cur_prompt_tokens: {cur_prompt_tokens}, cur_completion_tokens: {cur_completion_tokens}, cur_cached_tokens: {cur_cached_tokens}"
+                )
                 model_stats["total_tokens"] += cur_total_tokens
                 model_stats["prompt_tokens"] += cur_prompt_tokens
                 model_stats["completion_tokens"] += cur_completion_tokens
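For context, the per-model accumulation that the new log line instruments looks roughly like this when reduced to a standalone function (counter names match the diff; the surrounding loop structure is a simplification):

```python
from collections import defaultdict
from typing import Any, Dict, List, Tuple


def aggregate_usage(usage_rows: List[Tuple[str, Dict[str, Any]]]) -> Dict[str, Dict[str, int]]:
    """Sketch: sum token counters per model, treating missing/None as 0.
    (The real code also routes NaN through _is_none_or_nan.)"""
    stats: Dict[str, Dict[str, int]] = defaultdict(
        lambda: {"total_tokens": 0, "prompt_tokens": 0,
                 "completion_tokens": 0, "cached_tokens": 0}
    )
    for model_name, usage_data in usage_rows:
        for key in stats[model_name]:
            value = usage_data.get(key, 0)
            stats[model_name][key] += 0 if value is None else value
    return dict(stats)


rows = [
    ("gpt-4o", {"total_tokens": 120, "prompt_tokens": 80, "completion_tokens": 40}),
    ("gpt-4o", {"total_tokens": 60, "prompt_tokens": 50, "completion_tokens": 10,
                "cached_tokens": None}),
]
assert aggregate_usage(rows)["gpt-4o"]["total_tokens"] == 180
assert aggregate_usage(rows)["gpt-4o"]["cached_tokens"] == 0
```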