Commit 978d90d

docs: updated strands evals examples
1 parent 471f120 commit 978d90d

23 files changed: +1177 -8 lines changed
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
from strands import Agent

from strands_evals import ActorSimulator, Case, Experiment
from strands_evals.evaluators import HelpfulnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry


# Setup telemetry
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# 1. Define a task function
def task_function(case: Case) -> dict:
    # Create simulator
    user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=3)

    # Create target agent
    agent = Agent(
        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
        system_prompt="You are a helpful travel assistant.",
        callback_handler=None,
    )

    user_message = case.input
    while user_sim.has_next():
        # Clear before each target agent call to ensure we don't capture simulator traces.
        memory_exporter.clear()
        agent_response = agent(user_message)
        agent_message = str(agent_response)
        user_result = user_sim.act(agent_message)
        user_message = str(user_result.structured_output.message)

    mapper = StrandsInMemorySessionMapper()
    finished_spans = memory_exporter.get_finished_spans()
    session = mapper.map_to_session(finished_spans, session_id=case.session_id)

    return {"output": agent_message, "trajectory": session}

# 2. Create test cases
test_cases = [
    Case[str, str](
        name="booking-simple",
        input="I need to book a flight to Paris next week",
        metadata={"category": "booking", "task_description": "Flight booking confirmed"},
    )
]

# 3. Create evaluators
evaluators = [HelpfulnessEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(task_function)
reports[0].run_display()
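For parity with the other examples in this commit, the report from this quickstart could also be persisted before it is displayed. A minimal sketch, reusing the report to_file helper that the examples below call (the filename here is illustrative only):

# Optional: save the report to disk as well (filename is illustrative).
reports[0].to_file("user_simulator_report")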
Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
import asyncio
import datetime

from langchain.evaluation.criteria import CriteriaEvalChain

## Using a third-party evaluator
from langchain_aws import BedrockLLM
from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import Evaluator
from strands_evals.types import EvaluationData, EvaluationOutput

## Requires: pip install langchain langchain_aws ##


def third_party_example():
    """
    Demonstrates integrating a third-party evaluator (LangChain) with the evaluation framework.

    This example:
    1. Defines a task function that uses an agent to generate responses
    2. Creates test cases with expected outputs
    3. Creates a custom evaluator that wraps LangChain's CriteriaEvalChain
    4. Creates a dataset with the test cases and evaluator
    5. Runs evaluations and returns the report

    Returns:
        EvaluationReport: The evaluation results
    """

    # 1. Define a task function
    def get_response(case: Case) -> str:
        agent = Agent(callback_handler=None)
        return str(agent(case.input))

    # 2. Create test cases
    test_case1 = Case[str, str](
        name="knowledge-1",
        input="What is the capital of France?",
        expected_output="The capital of France is Paris.",
        metadata={"category": "knowledge"},
    )

    test_case2 = Case[str, str](
        name="knowledge-2",
        input="What color is the ocean?",
        expected_output="The ocean is blue.",
        metadata={"category": "knowledge"},
    )
    test_case3 = Case(input="When was World War 2?")
    test_case4 = Case(input="Who was the first president of the United States?")

    # 3. Create evaluators
    class LangChainCriteriaEvaluator(Evaluator[str, str]):
        def evaluate(self, evaluation_case: EvaluationData[str, str]) -> EvaluationOutput:
            ## Follow LangChain's docs: https://python.langchain.com/api_reference/langchain/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html
            # Initialize Bedrock LLM
            bedrock_llm = BedrockLLM(
                model_id="anthropic.claude-v2",  # or other Bedrock models
                model_kwargs={
                    "max_tokens_to_sample": 256,
                    "temperature": 0.7,
                },
            )

            criteria = {"correctness": "Is the actual answer correct?", "relevance": "Is the response relevant?"}

            evaluator = CriteriaEvalChain.from_llm(llm=bedrock_llm, criteria=criteria)

            # Pass in the required context for the evaluator (see LangChain's docs)
            result = evaluator.evaluate_strings(prediction=evaluation_case.actual_output, input=evaluation_case.input)

            # Make sure to return the correct type
            return EvaluationOutput(
                score=result["score"], test_pass=result["score"] > 0.5, reason=result["reasoning"]
            )

    # 4. Create an experiment
    experiment = Experiment[str, str](
        cases=[test_case1, test_case2, test_case3, test_case4], evaluators=[LangChainCriteriaEvaluator()]
    )

    experiment.to_file("third_party_dataset", "json")

    # 5. Run evaluations
    reports = experiment.run_evaluations(get_response)
    return reports[0]


async def async_third_party_example():
    """
    Demonstrates integrating a third-party evaluator (LangChain) with the evaluation framework asynchronously.

    This example:
    1. Defines a task function that uses an agent to generate responses
    2. Creates test cases with expected outputs
    3. Creates a custom evaluator that wraps LangChain's CriteriaEvalChain
    4. Creates a dataset with the test cases and evaluator
    5. Runs evaluations and returns the report

    Returns:
        EvaluationReport: The evaluation results
    """

    # 1. Define a task function
    async def get_response(case: Case) -> str:
        agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
        response = await agent.invoke_async(case.input)
        return str(response)

    # 2. Create test cases
    test_case1 = Case[str, str](
        name="knowledge-1",
        input="What is the capital of France?",
        expected_output="The capital of France is Paris.",
        metadata={"category": "knowledge"},
    )

    test_case2 = Case[str, str](
        name="knowledge-2",
        input="What color is the ocean?",
        expected_output="The ocean is blue.",
        metadata={"category": "knowledge"},
    )
    test_case3 = Case(input="When was World War 2?")
    test_case4 = Case(input="Who was the first president of the United States?")

    # 3. Create evaluators
    class LangChainCriteriaEvaluator(Evaluator[str, str]):
        def evaluate(self, evaluation_case: EvaluationData[str, str]) -> EvaluationOutput:
            ## Follow LangChain's docs: https://python.langchain.com/api_reference/langchain/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html
            # Initialize Bedrock LLM
            bedrock_llm = BedrockLLM(
                model_id="anthropic.claude-v2",  # or other Bedrock models
                model_kwargs={
                    "max_tokens_to_sample": 256,
                    "temperature": 0.7,
                },
            )

            criteria = {
                "correctness": "Is the actual answer correct?",
                "relevance": "Is the response relevant?",
                "conciseness": "Is the response short and to the point?",
            }

            evaluator = CriteriaEvalChain.from_llm(llm=bedrock_llm, criteria=criteria)

            # Pass in the required context for the evaluator (see LangChain's docs)
            result = evaluator.evaluate_strings(prediction=evaluation_case.actual_output, input=evaluation_case.input)

            # Make sure to return the correct type
            return EvaluationOutput(
                score=result["score"], test_pass=result["score"] > 0.5, reason=result["reasoning"]
            )

        async def evaluate_async(self, evaluation_case: EvaluationData[str, str]) -> EvaluationOutput:
            return self.evaluate(evaluation_case)

    # 4. Create an experiment
    experiment = Experiment[str, str](
        cases=[test_case1, test_case2, test_case3, test_case4], evaluators=[LangChainCriteriaEvaluator()]
    )

    # 4.5. (Optional) Save the experiment
    experiment.to_file("async_third_party_dataset")

    # 5. Run evaluations
    reports = await experiment.run_evaluations_async(get_response)
    return reports[0]


if __name__ == "__main__":
    start = datetime.datetime.now()
    report = asyncio.run(async_third_party_example())
    end = datetime.datetime.now()
    print("Async: ", end - start)  # Async: 0:00:24.050895
    report.to_file("async_third_party_report")
    report.run_display(include_actual_output=True)
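The synchronous third_party_example defined above is never invoked by the __main__ block. A minimal sketch of timing it alongside the async run, assuming the same installed packages and AWS credentials (the sync_* variable names are illustrative):

    # Hypothetical extension of the __main__ block above: time the sync variant for comparison.
    sync_start = datetime.datetime.now()
    sync_report = third_party_example()
    print("Sync: ", datetime.datetime.now() - sync_start)
    sync_report.run_display(include_actual_output=True)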
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
import asyncio
import datetime

from strands import Agent
from strands.multiagent import GraphBuilder

from strands_evals import Case, Experiment
from strands_evals.evaluators import InteractionsEvaluator, TrajectoryEvaluator
from strands_evals.extractors import graph_extractor


async def async_graph_example():
    """
    Demonstrates evaluating graph-based agent workflows for research tasks.

    This example:
    1. Defines a task function with a graph of specialized research agents
    2. Creates test cases for research and report generation scenarios
    3. Creates TrajectoryEvaluator and InteractionsEvaluator to assess graph execution
    4. Creates datasets with the test cases and evaluators
    5. Runs evaluations and analyzes the reports

    Returns:
        tuple[EvaluationReport, EvaluationReport]: The trajectory and interaction evaluation results
    """

    ### Step 1: Define task ###
    def research_graph(case: Case):
        # Create specialized agents
        researcher = Agent(name="researcher", system_prompt="You are a research specialist...")
        analyst = Agent(name="analyst", system_prompt="You are a data analysis specialist...")
        fact_checker = Agent(name="fact_checker", system_prompt="You are a fact checking specialist...")
        report_writer = Agent(name="report_writer", system_prompt="You are a report writing specialist...")

        # Create a graph with these agents
        builder = GraphBuilder()
        # Add nodes
        builder.add_node(researcher, "research")
        builder.add_node(analyst, "analysis")
        builder.add_node(fact_checker, "fact_check")
        builder.add_node(report_writer, "report")

        # Add edges (dependencies)
        builder.add_edge("research", "analysis")
        builder.add_edge("research", "fact_check")
        builder.add_edge("analysis", "report")
        builder.add_edge("fact_check", "report")

        # Set entry points (optional - will be auto-detected if not specified)
        builder.set_entry_point("research")

        # Build the graph
        graph = builder.build()

        result = graph(case.input)
        interactions = graph_extractor.extract_graph_interactions(result)

        return {"interactions": interactions, "trajectory": [node.node_id for node in result.execution_order]}

    ### Step 2: Create test cases ###
    test1 = Case(
        input="Research the impact of AI on healthcare and create a short report",
        expected_interactions=[
            {"node_name": "research", "dependencies": []},
            {"node_name": "fact_check", "dependencies": ["research"]},
            {"node_name": "analysis", "dependencies": ["research"]},
            {"node_name": "report", "dependencies": ["fact_check", "analysis"]},
        ],
    )
    test2 = Case(input="Research the impact of robotics on healthcare and create a short report")

    ### Step 3: Create evaluators ###
    rubric = {
        "research": "The research node should be the starting point and generate a query about the topic.",
        "fact_check": "The fact check node should come after research and verify the accuracy of the generated query.",
        "analysis": "The analysis node should come after research and generate a summary of the findings.",
        "report": "The report node should come after analysis"
        " and fact check and synthesize the information into a coherent report.",
    }
    # Or use a single rubric string instead of per-node criteria
    basic_rubric = (
        "The graph system should utilize the agents as expected with relevant information."
        " The actual interactions should include more information than expected."
    )
    interaction_evaluator = InteractionsEvaluator(rubric=rubric)
    trajectory_eval = TrajectoryEvaluator(rubric=basic_rubric)

    ### Step 4: Create experiments ###
    interaction_experiment = Experiment(cases=[test1, test2], evaluators=[interaction_evaluator])
    trajectory_experiment = Experiment(cases=[test1, test2], evaluators=[trajectory_eval])

    ### Step 5: Run evaluations ###
    interaction_reports = await interaction_experiment.run_evaluations_async(research_graph)
    trajectory_reports = await trajectory_experiment.run_evaluations_async(research_graph)
    interaction_report = interaction_reports[0]
    trajectory_report = trajectory_reports[0]

    return trajectory_report, interaction_report


if __name__ == "__main__":
    # Run the file as a module, e.g. python -m examples.evaluate_graph
    start = datetime.datetime.now()
    trajectory_report, interaction_report = asyncio.run(async_graph_example())
    end = datetime.datetime.now()
    print("Async node interactions", end - start)

    trajectory_report.to_file("research_graph_report_trajectory")
    trajectory_report.display(include_actual_trajectory=True)

    interaction_report.to_file("research_graph_report_interactions")
    interaction_report.display(include_actual_interactions=True)
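As in the third-party example above, the experiments here could also be serialized before running. A minimal sketch of persisting them inside async_graph_example, assuming Experiment.to_file accepts a single name the way it does there (filenames are illustrative):

    # Hypothetical: persist the experiment definitions for reuse (filenames are illustrative).
    interaction_experiment.to_file("research_graph_interactions_dataset")
    trajectory_experiment.to_file("research_graph_trajectory_dataset")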
