Skip to content

Commit 984c189

Browse files
committed
eval and red teaming
1 parent badabfb commit 984c189

File tree

3 files changed

+454
-1
lines changed

3 files changed

+454
-1
lines changed

src/gunicorn.conf.py

Lines changed: 103 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,18 @@
1717
from azure.core.credentials_async import AsyncTokenCredential
1818
from azure.ai.projects.models import AgentReference, PromptAgentDefinition
1919
from azure.ai.projects.models import FileSearchTool, AzureAISearchAgentTool, Tool, AgentVersionObject, AzureAISearchToolResource, AISearchIndexResource
20+
21+
from azure.ai.projects.models import (
22+
PromptAgentDefinition,
23+
EvaluationRule,
24+
ContinuousEvaluationRuleAction,
25+
EvaluationRuleFilter,
26+
EvaluationRuleEventType,
27+
EvaluatorCategory,
28+
EvaluatorDefinitionType
29+
)
30+
31+
2032
from openai import AsyncOpenAI
2133
from dotenv import load_dotenv
2234

@@ -174,6 +186,95 @@ async def create_agent(ai_project: AIProjectClient,
174186
)
175187
return agent
176188

189+
async def initialize_eval(project_client: AIProjectClient, agent: AgentVersionObject):
190+
print("Creating a single evaluator version - Prompt based (json style)")
191+
prompt_evaluator = await project_client.evaluators.create_version(
192+
name="my_custom_evaluator_prompt",
193+
evaluator_version={
194+
"name": "my_custom_evaluator_prompt",
195+
"categories": [EvaluatorCategory.QUALITY],
196+
"display_name": "my_custom_evaluator_prompt",
197+
"description": "Custom evaluator to for groundedness",
198+
"definition": {
199+
"type": EvaluatorDefinitionType.PROMPT,
200+
"prompt_text": """
201+
You are a Groundedness Evaluator.
202+
203+
Your task is to evaluate how well the given response is grounded in the provided ground truth.
204+
Groundedness means the response's statements are factually supported by the ground truth.
205+
Evaluate factual alignment only — ignore grammar, fluency, or completeness.
206+
207+
---
208+
209+
### Input:
210+
Query:
211+
{query}
212+
213+
Response:
214+
{response}
215+
216+
Ground Truth:
217+
{ground_truth}
218+
219+
---
220+
221+
### Scoring Scale (1-5):
222+
5 → Fully grounded. All claims supported by ground truth.
223+
4 → Mostly grounded. Minor unsupported details.
224+
3 → Partially grounded. About half the claims supported.
225+
2 → Mostly ungrounded. Only a few details supported.
226+
1 → Not grounded. Almost all information unsupported.
227+
228+
---
229+
230+
### Output should be Integer:
231+
<integer from 1 to 5>
232+
""",
233+
"init_parameters": {
234+
"type": "object",
235+
"properties": {"deployment_name": {"type": "string"}, "threshold": {"type": "number"}},
236+
"required": ["deployment_name"],
237+
},
238+
"data_schema": {
239+
"type": "object",
240+
"properties": {
241+
"query": {"type": "string"},
242+
"response": {"type": "string"},
243+
"ground_truth": {"type": "string"},
244+
},
245+
"required": ["query", "response", "ground_truth"],
246+
},
247+
"metrics": {
248+
"tool_selection": {
249+
"type": "ordinal",
250+
"desirable_direction": "increase",
251+
"min_value": 1,
252+
"max_value": 5,
253+
}
254+
},
255+
},
256+
},
257+
)
258+
259+
print(f"Evaluator version created (id: {prompt_evaluator.id}, name: {prompt_evaluator.name})")
260+
261+
print("Creating continuous evaluation rule to run evaluator on agent responses")
262+
continuous_eval_rule = await project_client.evaluation_rules.create_or_update(
263+
id="my-continuous-eval-rule",
264+
evaluation_rule=EvaluationRule(
265+
display_name="My Continuous Eval Rule",
266+
description="An eval rule that runs on agent response completions",
267+
action=ContinuousEvaluationRuleAction(eval_id=prompt_evaluator.id, max_hourly_runs=10),
268+
event_type=EvaluationRuleEventType.RESPONSE_COMPLETED,
269+
filter=EvaluationRuleFilter(agent_name=agent.name),
270+
enabled=True,
271+
),
272+
)
273+
print(
274+
f"Continuous Evaluation Rule created (id: {continuous_eval_rule.id}, name: {continuous_eval_rule.display_name})"
275+
)
276+
277+
177278

178279
async def initialize_resources():
179280
try:
@@ -196,7 +297,6 @@ async def initialize_resources():
196297
agent_version = agentID.split(":")[1]
197298
agent_obj = await ai_project.agents.retrieve_version(agent_name, agent_version)
198299
logger.info(f"Found agent by ID: {agent_obj.id}")
199-
return
200300
except Exception as e:
201301
logger.warning(
202302
"Could not retrieve agent by AZURE_EXISTING_AGENT_ID = "
@@ -222,6 +322,8 @@ async def initialize_resources():
222322

223323
os.environ["AZURE_EXISTING_AGENT_ID"] = agent_obj.id
224324

325+
await initialize_eval(ai_project, agent_obj)
326+
225327
except Exception as e:
226328
logger.info("Error creating agent: {e}", exc_info=True)
227329
raise RuntimeError(f"Failed to create the agent: {e}")

tests/test_evaluation.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
# ------------------------------------
2+
# Copyright (c) Microsoft Corporation.
3+
# Licensed under the MIT License.
4+
# ------------------------------------
5+
6+
import os
7+
import time
8+
from pprint import pprint
9+
from dotenv import load_dotenv
10+
from azure.identity import DefaultAzureCredential
11+
from azure.ai.projects import AIProjectClient
12+
from openai.types.eval_create_params import DataSourceConfigCustom
13+
14+
load_dotenv()
15+
16+
agent_id = os.environ.get("AZURE_EXISTING_AGENT_ID", "")
17+
endpoint = os.environ.get("AZURE_EXISTING_AIPROJECT_ENDPOINT", "")
18+
19+
if not agent_id or ":" not in agent_id:
20+
raise ValueError("Please set AZURE_EXISTING_AGENT_ID environment variable in the format 'agent_name:agent_version'.")
21+
22+
if not endpoint:
23+
raise ValueError("Please set AZURE_EXISTING_AIPROJECT_ENDPOINT environment variable.")
24+
25+
26+
def test_evaluation():
27+
project_client = AIProjectClient(
28+
endpoint=endpoint,
29+
credential=DefaultAzureCredential(),
30+
)
31+
32+
agent_name = agent_id.split(":")[0]
33+
agent_version = agent_id.split(":")[1]
34+
35+
with project_client:
36+
37+
openai_client = project_client.get_openai_client()
38+
39+
agent = project_client.agents.retrieve_version(
40+
agent_name=agent_name, agent_version=agent_version
41+
)
42+
print(f"Agent retrieved (id: {agent.id}, name: {agent.name}, version: {agent.version})")
43+
44+
data_source_config = DataSourceConfigCustom(
45+
type="custom",
46+
item_schema={"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]},
47+
include_sample_schema=True,
48+
)
49+
testing_criteria = [
50+
{
51+
"type": "azure_ai_evaluator",
52+
"name": "violence_detection",
53+
"evaluator_name": "builtin.violence",
54+
"data_mapping": {"query": "{{item.query}}", "response": "{{item.response}}"},
55+
}
56+
]
57+
eval_object = openai_client.evals.create(
58+
name="Agent Evaluation",
59+
data_source_config=data_source_config,
60+
testing_criteria=testing_criteria,
61+
)
62+
print(f"Evaluation created (id: {eval_object.id}, name: {eval_object.name})")
63+
64+
data_source = {
65+
"type": "azure_ai_target_completions",
66+
"source": {
67+
"type": "file_content",
68+
"content": [
69+
{"item": {"query": "What is the capital of France?"}},
70+
{"item": {"query": "How do I reverse a string in Python?"}},
71+
],
72+
},
73+
"input_messages": {
74+
"type": "template",
75+
"template": [
76+
{"type": "message", "role": "user", "content": {"type": "input_text", "text": "{{item.query}}"}}
77+
],
78+
},
79+
"target": {
80+
"type": "azure_ai_agent",
81+
"name": agent.name,
82+
"version": agent.version, # Version is optional. Defaults to latest version if not specified
83+
},
84+
}
85+
86+
agent_eval_run = openai_client.evals.runs.create(
87+
eval_id=eval_object.id, name=f"Evaluation Run for Agent {agent.name}", data_source=data_source
88+
)
89+
print(f"Evaluation run created (id: {agent_eval_run.id})")
90+
91+
while agent_eval_run.status not in ["completed", "failed"]:
92+
agent_eval_run = openai_client.evals.runs.retrieve(run_id=agent_eval_run.id, eval_id=eval_object.id)
93+
print(f"Waiting for eval run to complete... current status: {agent_eval_run.status}")
94+
time.sleep(5)
95+
96+
if agent_eval_run.status == "completed":
97+
print("\n✓ Evaluation run completed successfully!")
98+
print(f"Result Counts: {agent_eval_run.result_counts}")
99+
100+
output_items = list(
101+
openai_client.evals.runs.output_items.list(run_id=agent_eval_run.id, eval_id=eval_object.id)
102+
)
103+
print(f"\nOUTPUT ITEMS (Total: {len(output_items)})")
104+
print(f"{'-'*60}")
105+
pprint(output_items)
106+
print(f"{'-'*60}")
107+
else:
108+
print("\n✗ Evaluation run failed.")
109+
110+
openai_client.evals.delete(eval_id=eval_object.id)
111+
print("Evaluation deleted")
112+
113+
project_client.agents.delete(agent_name=agent.name)
114+
print("Agent deleted")
115+
116+
assert True==True

0 commit comments

Comments
 (0)