
Commit eec6eb2

Jessli/convert Fix test failure (#43518)
* add eval result converter
* Add result converter
* update converter params to optional
* add eval meta data
* fix type
* remove useless file
* get eval meta data as input
* fix build errors
* remove useless import
* resolve comments
* update
* update comments
* fix checker failure
* add error msg and error code
* Surface evaluator error msg
* update UT
* fix usage
* make eval_meta_data optional
* remove useless lines
* update param name to add underscore
* parse updated annotation results
* update trace_id
* expose sample data for sdk evaluators
* update
* update
* fix UT
* fix tests
* fix test
1 parent ffbbacb commit eec6eb2

3 files changed: +18 −11 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py

Lines changed: 7 additions & 0 deletions
@@ -533,6 +533,13 @@ def _parse_content_harm_response(
     except Exception:  # pylint: disable=broad-exception-caught
         harm_response = response[metric_name]
 
+    total_tokens = 0
+    prompt_tokens = 0
+    completion_tokens = 0
+    finish_reason = ""
+    sample_input = ""
+    sample_output = ""
+    model = ""
     if harm_response != "" and isinstance(harm_response, dict):
         # check if "output" is one key in harm_response
         if "output" in harm_response:

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py

Lines changed: 1 addition & 1 deletion
@@ -388,7 +388,7 @@ def test_evaluate_multimodal(
         # imageurls_with_target has 1 extra column: outputs.conversation due to the target mapping
         assert len(row_result_df.keys()) >= 33
     else:
-        assert len(row_result_df.keys()) == 32
+        assert len(row_result_df.keys()) == 88
     known_keys = [
         "outputs.content_safety.hate_unfairness",
         "outputs.content_safety.hate_unfairness_score",

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 10 additions & 10 deletions
@@ -1156,21 +1156,21 @@ def run_test():
     assert "metrics" in converted_results
     assert "rows" in converted_results
     assert "studio_url" in converted_results
-    assert "evaluation_results_list" in converted_results
-    assert "evaluation_summary" in converted_results
+    assert "_evaluation_results_list" in converted_results
+    assert "_evaluation_summary" in converted_results
 
     # Verify metrics preserved
     assert converted_results["metrics"]["overall_score"] == 0.75
 
     # Verify studio URL preserved
     assert converted_results["studio_url"] == "https://test-studio.com"
 
-    # Verify evaluation_results_list is same as rows (converted format)
-    assert len(converted_results["evaluation_results_list"]) == len(test_rows)
-    assert len(converted_results["evaluation_results_list"]) == len(converted_results["rows"])
+    # Verify _evaluation_results_list is same as rows (converted format)
+    assert len(converted_results["_evaluation_results_list"]) == len(test_rows)
+    assert len(converted_results["_evaluation_results_list"]) == len(converted_results["rows"])
 
     # Verify conversion structure for each row
-    for i, converted_row in enumerate(converted_results["evaluation_results_list"]):
+    for i, converted_row in enumerate(converted_results["_evaluation_results_list"]):
         # Check RunOutputItem structure
         assert "object" in converted_row
         assert converted_row["object"] == "eval.run.output_item"
@@ -1213,8 +1213,8 @@ def run_test():
         assert "name" in result
         assert "metric" in result
 
-    # Verify evaluation summary structure
-    summary = converted_results["evaluation_summary"]
+    # Verify _evaluation_summary structure
+    summary = converted_results["_evaluation_summary"]
     assert "result_counts" in summary
     assert "per_model_usage" in summary
     assert "per_testing_criteria_results" in summary
@@ -1262,8 +1262,8 @@ def run_test():
     empty_converted = empty_results
 
     assert len(empty_converted["rows"]) == 0
-    assert len(empty_converted["evaluation_results_list"]) == 0
-    assert empty_converted["evaluation_summary"]["result_counts"]["total"] == 0
+    assert len(empty_converted["_evaluation_results_list"]) == 0
+    assert empty_converted["_evaluation_summary"]["result_counts"]["total"] == 0
 
 
 @pytest.mark.unittest
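The tests now look for underscore-prefixed keys, matching the "update param name to add underscore" item in the commit message; `_evaluation_results_list` and `_evaluation_summary` read as internal fields of the converted result. A minimal sketch of a caller that tolerates both the old and new key names, assuming the converted result is a plain dict like the one exercised in these tests (the fallback logic is illustrative, not part of the SDK):

def get_converted_parts(converted_results: dict) -> tuple:
    # Prefer the new underscore-prefixed keys from this commit, falling back to
    # the pre-rename names for payloads produced by older builds.
    results_list = converted_results.get(
        "_evaluation_results_list",
        converted_results.get("evaluation_results_list", []),
    )
    summary = converted_results.get(
        "_evaluation_summary",
        converted_results.get("evaluation_summary", {}),
    )
    return results_list, summary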
