@@ -1736,6 +1736,8 @@ def _convert_results_to_aoai_evaluation_results(
17361736 criteria_type = criteria_name_types_from_meta [criteria_name ].get ("type" , None )
17371737 evaluator_name = criteria_name_types_from_meta [criteria_name ].get ("evaluator_name" , None )
17381738 if evaluator_name :
1739+ if criteria_type == "azure_ai_evaluator" and evaluator_name .startswith ("builtin." ):
1740+ evaluator_name = evaluator_name .replace ("builtin." , "" )
17391741 metrics_mapped = _EvaluatorMetricMapping .EVALUATOR_NAME_METRICS_MAPPINGS .get (evaluator_name , [])
17401742 if metrics_mapped and len (metrics_mapped ) > 0 :
17411743 metrics .extend (metrics_mapped )
@@ -1798,6 +1800,9 @@ def _convert_results_to_aoai_evaluation_results(
17981800 result_per_metric [metric ] = {"score" : metric_value }
17991801 else :
18001802 result_per_metric [metric ]["score" ] = metric_value
1803+ _append_indirect_attachments_to_results (
1804+ result_per_metric , "score" , metric , metric_value
1805+ )
18011806 elif metric_key .endswith ("_result" ) or metric_key == "result" or metric_key .endswith ("_label" ):
18021807 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
18031808 label = metric_value
@@ -1809,6 +1814,12 @@ def _convert_results_to_aoai_evaluation_results(
18091814 else :
18101815 result_per_metric [metric ]["label" ] = metric_value
18111816 result_per_metric [metric ]["passed" ] = passed
1817+ _append_indirect_attachments_to_results (
1818+ result_per_metric , "label" , metric , label
1819+ )
1820+ _append_indirect_attachments_to_results (
1821+ result_per_metric , "passed" , metric , passed
1822+ )
18121823 elif (
18131824 metric_key .endswith ("_reason" ) and not metric_key .endswith ("_finish_reason" )
18141825 ) or metric_key == "reason" :
@@ -1817,18 +1828,27 @@ def _convert_results_to_aoai_evaluation_results(
18171828 result_per_metric [metric ] = {"reason" : metric_value }
18181829 else :
18191830 result_per_metric [metric ]["reason" ] = metric_value
1831+ _append_indirect_attachments_to_results (
1832+ result_per_metric , "reason" , metric , metric_value
1833+ )
18201834 elif metric_key .endswith ("_threshold" ) or metric_key == "threshold" :
18211835 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
18221836 if metric not in result_per_metric :
18231837 result_per_metric [metric ] = {"threshold" : metric_value }
18241838 else :
18251839 result_per_metric [metric ]["threshold" ] = metric_value
1840+ _append_indirect_attachments_to_results (
1841+ result_per_metric , "threshold" , metric , metric_value
1842+ )
18261843 elif metric_key == "sample" :
18271844 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
18281845 if metric not in result_per_metric :
18291846 result_per_metric [metric ] = {"sample" : metric_value }
18301847 else :
18311848 result_per_metric [metric ]["sample" ] = metric_value
1849+ _append_indirect_attachments_to_results (
1850+ result_per_metric , "sample" , metric , metric_value
1851+ )
18321852 elif metric_key .endswith ("_finish_reason" ):
18331853 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
18341854 if metric not in result_per_metric :
@@ -1841,6 +1861,9 @@ def _convert_results_to_aoai_evaluation_results(
18411861 and "finish_reason" not in result_per_metric [metric ]["sample" ]
18421862 ):
18431863 result_per_metric [metric ]["sample" ]["finish_reason" ] = metric_value
1864+ _append_indirect_attachments_to_results (
1865+ result_per_metric , "sample" , metric , metric_value , "finish_reason"
1866+ )
18441867 elif metric_key .endswith ("_model" ):
18451868 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
18461869 if metric not in result_per_metric :
@@ -1853,6 +1876,9 @@ def _convert_results_to_aoai_evaluation_results(
18531876 and "model" not in result_per_metric [metric ]["sample" ]
18541877 ):
18551878 result_per_metric [metric ]["sample" ]["model" ] = metric_value
1879+ _append_indirect_attachments_to_results (
1880+ result_per_metric , "sample" , metric , metric_value , "model"
1881+ )
18561882 elif metric_key .endswith ("_sample_input" ):
18571883 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
18581884 input_metric_val_json : Optional [List [Dict [str , Any ]]] = []
@@ -1870,6 +1896,9 @@ def _convert_results_to_aoai_evaluation_results(
18701896 and "input" not in result_per_metric [metric ]["sample" ]
18711897 ):
18721898 result_per_metric [metric ]["sample" ]["input" ] = input_metric_val_json
1899+ _append_indirect_attachments_to_results (
1900+ result_per_metric , "sample" , metric , input_metric_val_json , "input"
1901+ )
18731902 elif metric_key .endswith ("_sample_output" ):
18741903 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
18751904 output_metric_val_json : Optional [List [Dict [str , Any ]]] = []
@@ -1887,6 +1916,9 @@ def _convert_results_to_aoai_evaluation_results(
18871916 and "output" not in result_per_metric [metric ]["sample" ]
18881917 ):
18891918 result_per_metric [metric ]["sample" ]["output" ] = output_metric_val_json
1919+ _append_indirect_attachments_to_results (
1920+ result_per_metric , "sample" , metric , output_metric_val_json , "output"
1921+ )
18901922 elif metric_key .endswith ("_total_tokens" ):
18911923 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
18921924 if metric not in result_per_metric :
@@ -1901,6 +1933,9 @@ def _convert_results_to_aoai_evaluation_results(
19011933 result_per_metric [metric ]["sample" ]["usage" ] = {"total_tokens" : metric_value }
19021934 else :
19031935 result_per_metric [metric ]["sample" ]["usage" ]["total_tokens" ] = metric_value
1936+ _append_indirect_attachments_to_results (
1937+ result_per_metric , "sample" , metric , metric_value , "usage" , "total_tokens"
1938+ )
19041939 elif metric_key .endswith ("_prompt_tokens" ):
19051940 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
19061941 if metric not in result_per_metric :
@@ -1915,6 +1950,9 @@ def _convert_results_to_aoai_evaluation_results(
19151950 result_per_metric [metric ]["sample" ]["usage" ] = {"prompt_tokens" : metric_value }
19161951 else :
19171952 result_per_metric [metric ]["sample" ]["usage" ]["prompt_tokens" ] = metric_value
1953+ _append_indirect_attachments_to_results (
1954+ result_per_metric , "sample" , metric , metric_value , "usage" , "prompt_tokens"
1955+ )
19181956 elif metric_key .endswith ("_completion_tokens" ):
19191957 metric = _get_metric_from_criteria (criteria_name , metric_key , expected_metrics )
19201958 if metric not in result_per_metric :
@@ -1929,6 +1967,9 @@ def _convert_results_to_aoai_evaluation_results(
19291967 result_per_metric [metric ]["sample" ]["usage" ] = {"completion_tokens" : metric_value }
19301968 else :
19311969 result_per_metric [metric ]["sample" ]["usage" ]["completion_tokens" ] = metric_value
1970+ _append_indirect_attachments_to_results (
1971+ result_per_metric , "sample" , metric , metric_value , "usage" , "completion_tokens"
1972+ )
19321973 elif not any (
19331974 metric_key .endswith (suffix )
19341975 for suffix in [
@@ -1970,6 +2011,18 @@ def _convert_results_to_aoai_evaluation_results(
19702011 "metric" : metric if metric is not None else criteria_name , # Use criteria name as metric
19712012 }
19722013 # Add optional fields
2014+ if (metric in _EvaluatorMetricMapping .EVALUATOR_NAME_METRICS_MAPPINGS ["indirect_attack" ]
2015+ or metric in _EvaluatorMetricMapping .EVALUATOR_NAME_METRICS_MAPPINGS ["code_vulnerability" ]
2016+ or metric in _EvaluatorMetricMapping .EVALUATOR_NAME_METRICS_MAPPINGS ["protected_material" ]):
2017+ copy_label = label
2018+ if copy_label is not None and isinstance (copy_label , bool ) and copy_label == True :
2019+ label = "fail"
2020+ score = 0.0
2021+ passed = False
2022+ else :
2023+ label = "pass"
2024+ score = 1.0
2025+ passed = True
19732026 result_obj ["score" ] = score
19742027 result_obj ["label" ] = label
19752028 result_obj ["reason" ] = reason
@@ -2043,6 +2096,65 @@ def _convert_results_to_aoai_evaluation_results(
20432096 f"Summary statistics calculated for { len (converted_rows )} rows, eval_id: { eval_id } , eval_run_id: { eval_run_id } "
20442097 )
20452098
2099+ def _append_indirect_attachments_to_results (current_result_dict : Dict [str , Any ],
2100+ result_name : str ,
2101+ metric : str ,
2102+ metric_value : Any ,
2103+ nested_result_name : Optional [str ] = None ,
2104+ secondnested_result_name : Optional [str ] = None ) -> None :
2105+ """
2106+ Append indirect attachments to the current result dictionary.
2107+
2108+ :param current_result_dict: The current result dictionary to update
2109+ :type current_result_dict: Dict[str, Any]
2110+ :param result_name: The result name
2111+ :type result_name: str
2112+ :param metric: The metric name
2113+ :type metric: str
2114+ :param metric_value: The value of the metric
2115+ :type metric_value: Any
2116+ """
2117+ if metric == "xpia" and result_name :
2118+ for metric_extended in ["xpia_manipulated_content" , "xpia_intrusion" , "xpia_information_gathering" ]:
2119+ if nested_result_name is None :
2120+ if metric_extended not in current_result_dict :
2121+ current_result_dict [metric_extended ] = { result_name : metric_value }
2122+ else :
2123+ current_result_dict [metric_extended ][result_name ] = metric_value
2124+ elif nested_result_name is not None and secondnested_result_name is None :
2125+ if metric_extended not in current_result_dict :
2126+ current_result_dict [metric_extended ] = {result_name : {nested_result_name : metric_value }}
2127+ elif (metric_extended in current_result_dict
2128+ and result_name not in current_result_dict [metric_extended ]
2129+ ):
2130+ current_result_dict [metric_extended ][result_name ] = {nested_result_name : metric_value }
2131+ elif (
2132+ metric_extended in current_result_dict
2133+ and result_name in current_result_dict [metric_extended ]
2134+ and nested_result_name not in current_result_dict [metric_extended ][result_name ]
2135+ ):
2136+ current_result_dict [metric_extended ][result_name ][nested_result_name ] = metric_value
2137+ elif nested_result_name is not None and secondnested_result_name is not None :
2138+ if metric_extended not in current_result_dict :
2139+ current_result_dict [metric_extended ] = {
2140+ result_name : {nested_result_name : {secondnested_result_name : metric_value }}
2141+ }
2142+ elif (metric_extended in current_result_dict
2143+ and result_name not in current_result_dict [metric_extended ]
2144+ ):
2145+ current_result_dict [metric_extended ][result_name ] = {
2146+ nested_result_name : {secondnested_result_name : metric_value }
2147+ }
2148+ elif (
2149+ metric_extended in current_result_dict
2150+ and result_name in current_result_dict [metric_extended ]
2151+ and nested_result_name not in current_result_dict [metric_extended ][result_name ]
2152+ ):
2153+ current_result_dict [metric_extended ][result_name ][nested_result_name ] = {
2154+ secondnested_result_name : metric_value
2155+ }
2156+ else :
2157+ current_result_dict [metric_extended ][result_name ][nested_result_name ][secondnested_result_name ] = metric_value
20462158
20472159def _get_metric_from_criteria (testing_criteria_name : str , metric_key : str , metric_list : List [str ]) -> str :
20482160 """
@@ -2058,6 +2170,16 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
20582170 :rtype: str
20592171 """
20602172 metric = None
2173+
2174+ if metric_key == "xpia_manipulated_content" :
2175+ metric = "xpia_manipulated_content"
2176+ return metric
2177+ elif metric_key == "xpia_intrusion" :
2178+ metric = "xpia_intrusion"
2179+ return metric
2180+ elif metric_key == "xpia_information_gathering" :
2181+ metric = "xpia_information_gathering"
2182+ return metric
20612183 for expected_metric in metric_list :
20622184 if metric_key .startswith (expected_metric ):
20632185 metric = expected_metric
@@ -2124,9 +2246,12 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
21242246
21252247 # Extract usage statistics from aoai_result.sample
21262248 sample_data_list = []
2249+ dup_usage_list = _EvaluatorMetricMapping .EVALUATOR_NAME_METRICS_MAPPINGS ["indirect_attack" ].copy ()
2250+ dup_usage_list .remove ("xpia" )
21272251 if isinstance (aoai_result , dict ) and aoai_result ["results" ] and isinstance (aoai_result ["results" ], list ):
21282252 for result_item in aoai_result ["results" ]:
2129- if isinstance (result_item , dict ) and "sample" in result_item and result_item ["sample" ]:
2253+ if (isinstance (result_item , dict ) and "sample" in result_item and result_item ["sample" ]
2254+ and result_item ["metric" ] not in dup_usage_list ):
21302255 sample_data_list .append (result_item ["sample" ])
21312256
21322257 for sample_data in sample_data_list :
0 commit comments