@@ -1736,6 +1736,8 @@ def _convert_results_to_aoai_evaluation_results(
         criteria_type = criteria_name_types_from_meta[criteria_name].get("type", None)
         evaluator_name = criteria_name_types_from_meta[criteria_name].get("evaluator_name", None)
         if evaluator_name:
+            if criteria_type == "azure_ai_evaluator" and evaluator_name.startswith("builtin."):
+                evaluator_name = evaluator_name.replace("builtin.", "")
             metrics_mapped = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS.get(evaluator_name, [])
             if metrics_mapped and len(metrics_mapped) > 0:
                 metrics.extend(metrics_mapped)
@@ -1798,6 +1800,7 @@ def _convert_results_to_aoai_evaluation_results(
                     result_per_metric[metric] = {"score": metric_value}
                 else:
                     result_per_metric[metric]["score"] = metric_value
+                _append_indirect_attachments_to_results(result_per_metric, "score", metric, metric_value)
             elif metric_key.endswith("_result") or metric_key == "result" or metric_key.endswith("_label"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 label = metric_value
@@ -1809,6 +1812,8 @@ def _convert_results_to_aoai_evaluation_results(
                 else:
                     result_per_metric[metric]["label"] = metric_value
                     result_per_metric[metric]["passed"] = passed
+                _append_indirect_attachments_to_results(result_per_metric, "label", metric, label)
+                _append_indirect_attachments_to_results(result_per_metric, "passed", metric, passed)
             elif (
                 metric_key.endswith("_reason") and not metric_key.endswith("_finish_reason")
             ) or metric_key == "reason":
@@ -1817,18 +1822,21 @@ def _convert_results_to_aoai_evaluation_results(
                     result_per_metric[metric] = {"reason": metric_value}
                 else:
                     result_per_metric[metric]["reason"] = metric_value
+                _append_indirect_attachments_to_results(result_per_metric, "reason", metric, metric_value)
             elif metric_key.endswith("_threshold") or metric_key == "threshold":
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 if metric not in result_per_metric:
                     result_per_metric[metric] = {"threshold": metric_value}
                 else:
                     result_per_metric[metric]["threshold"] = metric_value
+                _append_indirect_attachments_to_results(result_per_metric, "threshold", metric, metric_value)
             elif metric_key == "sample":
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 if metric not in result_per_metric:
                     result_per_metric[metric] = {"sample": metric_value}
                 else:
                     result_per_metric[metric]["sample"] = metric_value
+                _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value)
             elif metric_key.endswith("_finish_reason"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 if metric not in result_per_metric:
@@ -1841,6 +1849,9 @@ def _convert_results_to_aoai_evaluation_results(
                     and "finish_reason" not in result_per_metric[metric]["sample"]
                 ):
                     result_per_metric[metric]["sample"]["finish_reason"] = metric_value
+                _append_indirect_attachments_to_results(
+                    result_per_metric, "sample", metric, metric_value, "finish_reason"
+                )
             elif metric_key.endswith("_model"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 if metric not in result_per_metric:
@@ -1853,6 +1864,7 @@ def _convert_results_to_aoai_evaluation_results(
                     and "model" not in result_per_metric[metric]["sample"]
                 ):
                     result_per_metric[metric]["sample"]["model"] = metric_value
+                _append_indirect_attachments_to_results(result_per_metric, "sample", metric, metric_value, "model")
             elif metric_key.endswith("_sample_input"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 input_metric_val_json: Optional[List[Dict[str, Any]]] = []
@@ -1870,6 +1882,9 @@ def _convert_results_to_aoai_evaluation_results(
                     and "input" not in result_per_metric[metric]["sample"]
                 ):
                     result_per_metric[metric]["sample"]["input"] = input_metric_val_json
+                _append_indirect_attachments_to_results(
+                    result_per_metric, "sample", metric, input_metric_val_json, "input"
+                )
             elif metric_key.endswith("_sample_output"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 output_metric_val_json: Optional[List[Dict[str, Any]]] = []
@@ -1887,6 +1902,9 @@ def _convert_results_to_aoai_evaluation_results(
                     and "output" not in result_per_metric[metric]["sample"]
                 ):
                     result_per_metric[metric]["sample"]["output"] = output_metric_val_json
+                _append_indirect_attachments_to_results(
+                    result_per_metric, "sample", metric, output_metric_val_json, "output"
+                )
             elif metric_key.endswith("_total_tokens"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 if metric not in result_per_metric:
@@ -1901,6 +1919,9 @@ def _convert_results_to_aoai_evaluation_results(
                         result_per_metric[metric]["sample"]["usage"] = {"total_tokens": metric_value}
                     else:
                         result_per_metric[metric]["sample"]["usage"]["total_tokens"] = metric_value
+                _append_indirect_attachments_to_results(
+                    result_per_metric, "sample", metric, metric_value, "usage", "total_tokens"
+                )
             elif metric_key.endswith("_prompt_tokens"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 if metric not in result_per_metric:
@@ -1915,6 +1936,9 @@ def _convert_results_to_aoai_evaluation_results(
                         result_per_metric[metric]["sample"]["usage"] = {"prompt_tokens": metric_value}
                     else:
                         result_per_metric[metric]["sample"]["usage"]["prompt_tokens"] = metric_value
+                _append_indirect_attachments_to_results(
+                    result_per_metric, "sample", metric, metric_value, "usage", "prompt_tokens"
+                )
             elif metric_key.endswith("_completion_tokens"):
                 metric = _get_metric_from_criteria(criteria_name, metric_key, expected_metrics)
                 if metric not in result_per_metric:
@@ -1929,6 +1953,9 @@ def _convert_results_to_aoai_evaluation_results(
                         result_per_metric[metric]["sample"]["usage"] = {"completion_tokens": metric_value}
                     else:
                         result_per_metric[metric]["sample"]["usage"]["completion_tokens"] = metric_value
+                _append_indirect_attachments_to_results(
+                    result_per_metric, "sample", metric, metric_value, "usage", "completion_tokens"
+                )
             elif not any(
                 metric_key.endswith(suffix)
                 for suffix in [
@@ -1970,6 +1997,20 @@ def _convert_results_to_aoai_evaluation_results(
                 "metric": metric if metric is not None else criteria_name,  # Use criteria name as metric
             }
             # Add optional fields
+            if (
+                metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"]
+                or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["code_vulnerability"]
+                or metric in _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["protected_material"]
+            ):
+                copy_label = label
+                if copy_label is not None and isinstance(copy_label, bool) and copy_label == True:
+                    label = "fail"
+                    score = 0.0
+                    passed = False
+                else:
+                    label = "pass"
+                    score = 1.0
+                    passed = True
             result_obj["score"] = score
             result_obj["label"] = label
             result_obj["reason"] = reason
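
For the indirect attack, code vulnerability, and protected material criteria, the raw label coming out of the evaluator is a boolean where True means the issue was detected, so the block above inverts it into the pass/fail convention used by the other metrics. A minimal standalone sketch of that mapping (the helper name and values here are illustrative, not part of the change):

# Illustrative only: normalize a detection-style label into label/score/passed.
def _normalize_detection_label(raw_label):
    """True means the issue was detected, which maps to a failing result."""
    if isinstance(raw_label, bool) and raw_label:
        return "fail", 0.0, False
    return "pass", 1.0, True

assert _normalize_detection_label(True) == ("fail", 0.0, False)
assert _normalize_detection_label(False) == ("pass", 1.0, True)
assert _normalize_detection_label(None) == ("pass", 1.0, True)  # missing / non-boolean labels also pass
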
@@ -2044,6 +2085,67 @@ def _convert_results_to_aoai_evaluation_results(
     )
 
 
+def _append_indirect_attachments_to_results(
+    current_result_dict: Dict[str, Any],
+    result_name: str,
+    metric: str,
+    metric_value: Any,
+    nested_result_name: Optional[str] = None,
+    secondnested_result_name: Optional[str] = None,
+) -> None:
+    """
+    Append indirect attachments to the current result dictionary.
+
+    :param current_result_dict: The current result dictionary to update
+    :type current_result_dict: Dict[str, Any]
+    :param result_name: The result name
+    :type result_name: str
+    :param metric: The metric name
+    :type metric: str
+    :param metric_value: The value of the metric
+    :type metric_value: Any
+    """
+    if metric == "xpia" and result_name:
+        for metric_extended in ["xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"]:
+            if nested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: metric_value}
+                else:
+                    current_result_dict[metric_extended][result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {result_name: {nested_result_name: metric_value}}
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {nested_result_name: metric_value}
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = metric_value
+            elif nested_result_name is not None and secondnested_result_name is not None:
+                if metric_extended not in current_result_dict:
+                    current_result_dict[metric_extended] = {
+                        result_name: {nested_result_name: {secondnested_result_name: metric_value}}
+                    }
+                elif metric_extended in current_result_dict and result_name not in current_result_dict[metric_extended]:
+                    current_result_dict[metric_extended][result_name] = {
+                        nested_result_name: {secondnested_result_name: metric_value}
+                    }
+                elif (
+                    metric_extended in current_result_dict
+                    and result_name in current_result_dict[metric_extended]
+                    and nested_result_name not in current_result_dict[metric_extended][result_name]
+                ):
+                    current_result_dict[metric_extended][result_name][nested_result_name] = {
+                        secondnested_result_name: metric_value
+                    }
+                else:
+                    (
+                        current_result_dict[metric_extended][result_name][nested_result_name][secondnested_result_name]
+                    ) = metric_value
+
+
 def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metric_list: List[str]) -> str:
     """
     Get the metric name from the testing criteria and metric key.
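
The new helper mirrors whatever is recorded for the parent "xpia" metric onto the three XPIA sub-metrics, creating the nested dictionaries on demand. A rough usage sketch (the data is illustrative and assumes the helper above is in scope):

# Illustrative only: fan one "xpia" result out to the sub-metric entries.
result_per_metric = {"xpia": {"score": 1.0}}
_append_indirect_attachments_to_results(result_per_metric, "score", "xpia", 1.0)
_append_indirect_attachments_to_results(result_per_metric, "sample", "xpia", "stop", "finish_reason")

# result_per_metric now also contains, for each of the three sub-metrics:
#   "xpia_manipulated_content": {"score": 1.0, "sample": {"finish_reason": "stop"}}
# (and likewise for "xpia_intrusion" and "xpia_information_gathering");
# the parent "xpia" entry itself is left untouched.
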
@@ -2058,6 +2160,16 @@ def _get_metric_from_criteria(testing_criteria_name: str, metric_key: str, metri
     :rtype: str
     """
     metric = None
+
+    if metric_key == "xpia_manipulated_content":
+        metric = "xpia_manipulated_content"
+        return metric
+    elif metric_key == "xpia_intrusion":
+        metric = "xpia_intrusion"
+        return metric
+    elif metric_key == "xpia_information_gathering":
+        metric = "xpia_information_gathering"
+        return metric
     for expected_metric in metric_list:
         if metric_key.startswith(expected_metric):
             metric = expected_metric
@@ -2124,9 +2236,16 @@ def _calculate_aoai_evaluation_summary(aoai_results: list, logger: logging.Logge
 
         # Extract usage statistics from aoai_result.sample
         sample_data_list = []
+        dup_usage_list = _EvaluatorMetricMapping.EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
+        dup_usage_list.remove("xpia")
         if isinstance(aoai_result, dict) and aoai_result["results"] and isinstance(aoai_result["results"], list):
             for result_item in aoai_result["results"]:
-                if isinstance(result_item, dict) and "sample" in result_item and result_item["sample"]:
+                if (
+                    isinstance(result_item, dict)
+                    and "sample" in result_item
+                    and result_item["sample"]
+                    and result_item["metric"] not in dup_usage_list
+                ):
                     sample_data_list.append(result_item["sample"])
 
         for sample_data in sample_data_list:
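
In the summary calculation, the samples mirrored onto the XPIA sub-metrics are skipped so token usage is only counted once per row. A small self-contained sketch of the filtering, assuming the "indirect_attack" mapping lists "xpia" plus the three sub-metric names (illustrative data, not from the change itself):

# Assumed shape of the mapping; only the "indirect_attack" entry matters here.
EVALUATOR_NAME_METRICS_MAPPINGS = {
    "indirect_attack": ["xpia", "xpia_manipulated_content", "xpia_intrusion", "xpia_information_gathering"],
}

dup_usage_list = EVALUATOR_NAME_METRICS_MAPPINGS["indirect_attack"].copy()
dup_usage_list.remove("xpia")  # keep usage from the parent "xpia" result only

results = [
    {"metric": "xpia", "sample": {"usage": {"total_tokens": 10}}},
    {"metric": "xpia_intrusion", "sample": {"usage": {"total_tokens": 10}}},  # mirrored duplicate
]
samples = [r["sample"] for r in results if r["sample"] and r["metric"] not in dup_usage_list]
assert len(samples) == 1  # the sub-metric duplicate is excluded from usage totals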