Commit 978d90d

docs: updated strands evals examples
1 parent 471f120 commit 978d90d

23 files changed: +1177 -8 lines changed
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
from strands import Agent

from strands_evals import ActorSimulator, Case, Experiment
from strands_evals.evaluators import HelpfulnessEvaluator
from strands_evals.mappers import StrandsInMemorySessionMapper
from strands_evals.telemetry import StrandsEvalsTelemetry


# Setup telemetry
telemetry = StrandsEvalsTelemetry().setup_in_memory_exporter()
memory_exporter = telemetry.in_memory_exporter

# 1. Define a task function
def task_function(case: Case) -> dict:
    # Create simulator
    user_sim = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=3)

    # Create target agent
    agent = Agent(
        trace_attributes={"gen_ai.conversation.id": case.session_id, "session.id": case.session_id},
        system_prompt="You are a helpful travel assistant.",
        callback_handler=None,
    )

    user_message = case.input
    while user_sim.has_next():
        # Clear before each target agent call to ensure we don't capture simulator traces.
        memory_exporter.clear()
        agent_response = agent(user_message)
        agent_message = str(agent_response)
        user_result = user_sim.act(agent_message)
        user_message = str(user_result.structured_output.message)

    mapper = StrandsInMemorySessionMapper()
    finished_spans = memory_exporter.get_finished_spans()
    session = mapper.map_to_session(finished_spans, session_id=case.session_id)

    return {"output": agent_message, "trajectory": session}

# 2. Create test cases
test_cases = [
    Case[str, str](
        name="booking-simple",
        input="I need to book a flight to Paris next week",
        metadata={"category": "booking", "task_description": "Flight booking confirmed"},
    )
]

# 3. Create evaluators
evaluators = [HelpfulnessEvaluator()]

# 4. Create an experiment
experiment = Experiment[str, str](cases=test_cases, evaluators=evaluators)

# 5. Run evaluations
reports = experiment.run_evaluations(task_function)
reports[0].run_display()
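For parity with the other examples in this commit, the report from this quickstart could also be persisted before it is displayed. A minimal sketch, reusing the report to_file helper that the examples below call (the filename here is illustrative only):

# Optional: save the report to disk as well (filename is illustrative).
reports[0].to_file("user_simulator_report")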
Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
import asyncio
import datetime

from langchain.evaluation.criteria import CriteriaEvalChain

## Using a third-party evaluator
from langchain_aws import BedrockLLM
from strands import Agent

from strands_evals import Case, Experiment
from strands_evals.evaluators import Evaluator
from strands_evals.types import EvaluationData, EvaluationOutput

## Requires: pip install langchain langchain_aws ##


def third_party_example():
    """
    Demonstrates integrating a third-party evaluator (LangChain) with the evaluation framework.

    This example:
    1. Defines a task function that uses an agent to generate responses
    2. Creates test cases with expected outputs
    3. Creates a custom evaluator that wraps LangChain's CriteriaEvalChain
    4. Creates a dataset with the test cases and evaluator
    5. Runs evaluations and returns the report

    Returns:
        EvaluationReport: The evaluation results
    """

    # 1. Define a task function
    def get_response(case: Case) -> str:
        agent = Agent(callback_handler=None)
        return str(agent(case.input))

    # 2. Create test cases
    test_case1 = Case[str, str](
        name="knowledge-1",
        input="What is the capital of France?",
        expected_output="The capital of France is Paris.",
        metadata={"category": "knowledge"},
    )

    test_case2 = Case[str, str](
        name="knowledge-2",
        input="What color is the ocean?",
        expected_output="The ocean is blue.",
        metadata={"category": "knowledge"},
    )
    test_case3 = Case(input="When was World War 2?")
    test_case4 = Case(input="Who was the first president of the United States?")

    # 3. Create evaluators
    class LangChainCriteriaEvaluator(Evaluator[str, str]):
        def evaluate(self, evaluation_case: EvaluationData[str, str]) -> EvaluationOutput:
            ## Follow LangChain's docs: https://python.langchain.com/api_reference/langchain/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html
            # Initialize Bedrock LLM
            bedrock_llm = BedrockLLM(
                model_id="anthropic.claude-v2",  # or other Bedrock models
                model_kwargs={
                    "max_tokens_to_sample": 256,
                    "temperature": 0.7,
                },
            )

            criteria = {"correctness": "Is the actual answer correct?", "relevance": "Is the response relevant?"}

            evaluator = CriteriaEvalChain.from_llm(llm=bedrock_llm, criteria=criteria)

            # Pass in the required context for the evaluator (see LangChain's docs)
            result = evaluator.evaluate_strings(prediction=evaluation_case.actual_output, input=evaluation_case.input)

            # Make sure to return the correct type
            return EvaluationOutput(
                score=result["score"], test_pass=result["score"] > 0.5, reason=result["reasoning"]
            )

    # 4. Create an experiment
    experiment = Experiment[str, str](
        cases=[test_case1, test_case2, test_case3, test_case4], evaluators=[LangChainCriteriaEvaluator()]
    )

    experiment.to_file("third_party_dataset", "json")

    # 5. Run evaluations
    reports = experiment.run_evaluations(get_response)
    return reports[0]


async def async_third_party_example():
    """
    Demonstrates integrating a third-party evaluator (LangChain) with the evaluation framework asynchronously.

    This example:
    1. Defines a task function that uses an agent to generate responses
    2. Creates test cases with expected outputs
    3. Creates a custom evaluator that wraps LangChain's CriteriaEvalChain
    4. Creates a dataset with the test cases and evaluator
    5. Runs evaluations and returns the report

    Returns:
        EvaluationReport: The evaluation results
    """

    # 1. Define a task function
    async def get_response(case: Case) -> str:
        agent = Agent(system_prompt="Be as concise as possible", callback_handler=None)
        response = await agent.invoke_async(case.input)
        return str(response)

    # 2. Create test cases
    test_case1 = Case[str, str](
        name="knowledge-1",
        input="What is the capital of France?",
        expected_output="The capital of France is Paris.",
        metadata={"category": "knowledge"},
    )

    test_case2 = Case[str, str](
        name="knowledge-2",
        input="What color is the ocean?",
        expected_output="The ocean is blue.",
        metadata={"category": "knowledge"},
    )
    test_case3 = Case(input="When was World War 2?")
    test_case4 = Case(input="Who was the first president of the United States?")

    # 3. Create evaluators
    class LangChainCriteriaEvaluator(Evaluator[str, str]):
        def evaluate(self, evaluation_case: EvaluationData[str, str]) -> EvaluationOutput:
            ## Follow LangChain's docs: https://python.langchain.com/api_reference/langchain/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html
            # Initialize Bedrock LLM
            bedrock_llm = BedrockLLM(
                model_id="anthropic.claude-v2",  # or other Bedrock models
                model_kwargs={
                    "max_tokens_to_sample": 256,
                    "temperature": 0.7,
                },
            )

            criteria = {
                "correctness": "Is the actual answer correct?",
                "relevance": "Is the response relevant?",
                "conciseness": "Is the response short and to the point?",
            }

            evaluator = CriteriaEvalChain.from_llm(llm=bedrock_llm, criteria=criteria)

            # Pass in the required context for the evaluator (see LangChain's docs)
            result = evaluator.evaluate_strings(prediction=evaluation_case.actual_output, input=evaluation_case.input)

            # Make sure to return the correct type
            return EvaluationOutput(
                score=result["score"], test_pass=result["score"] > 0.5, reason=result["reasoning"]
            )

        async def evaluate_async(self, evaluation_case: EvaluationData[str, str]) -> EvaluationOutput:
            return self.evaluate(evaluation_case)

    # 4. Create an experiment
    experiment = Experiment[str, str](
        cases=[test_case1, test_case2, test_case3, test_case4], evaluators=[LangChainCriteriaEvaluator()]
    )

    # 4.5. (Optional) Save the experiment
    experiment.to_file("async_third_party_dataset")

    # 5. Run evaluations
    reports = await experiment.run_evaluations_async(get_response)
    return reports[0]


if __name__ == "__main__":
    start = datetime.datetime.now()
    report = asyncio.run(async_third_party_example())
    end = datetime.datetime.now()
    print("Async: ", end - start)  # Async: 0:00:24.050895
    report.to_file("async_third_party_report")
    report.run_display(include_actual_output=True)
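The synchronous third_party_example defined above is never invoked by the __main__ block. A minimal sketch of timing it alongside the async run, assuming the same installed packages and AWS credentials (the sync_* variable names are illustrative):

    # Hypothetical extension of the __main__ block above: time the sync variant for comparison.
    sync_start = datetime.datetime.now()
    sync_report = third_party_example()
    print("Sync: ", datetime.datetime.now() - sync_start)
    sync_report.run_display(include_actual_output=True)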
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
import asyncio
import datetime

from strands import Agent
from strands.multiagent import GraphBuilder

from strands_evals import Case, Experiment
from strands_evals.evaluators import InteractionsEvaluator, TrajectoryEvaluator
from strands_evals.extractors import graph_extractor


async def async_graph_example():
    """
    Demonstrates evaluating graph-based agent workflows for research tasks.

    This example:
    1. Defines a task function with a graph of specialized research agents
    2. Creates test cases for research and report generation scenarios
    3. Creates TrajectoryEvaluator and InteractionsEvaluator to assess graph execution
    4. Creates datasets with the test cases and evaluators
    5. Runs evaluations and analyzes the reports

    Returns:
        tuple[EvaluationReport, EvaluationReport]: The trajectory and interaction evaluation results
    """

    ### Step 1: Define task ###
    def research_graph(case: Case):
        # Create specialized agents
        researcher = Agent(name="researcher", system_prompt="You are a research specialist...")
        analyst = Agent(name="analyst", system_prompt="You are a data analysis specialist...")
        fact_checker = Agent(name="fact_checker", system_prompt="You are a fact checking specialist...")
        report_writer = Agent(name="report_writer", system_prompt="You are a report writing specialist...")

        # Create a graph with these agents
        builder = GraphBuilder()
        # Add nodes
        builder.add_node(researcher, "research")
        builder.add_node(analyst, "analysis")
        builder.add_node(fact_checker, "fact_check")
        builder.add_node(report_writer, "report")

        # Add edges (dependencies)
        builder.add_edge("research", "analysis")
        builder.add_edge("research", "fact_check")
        builder.add_edge("analysis", "report")
        builder.add_edge("fact_check", "report")

        # Set entry points (optional - will be auto-detected if not specified)
        builder.set_entry_point("research")

        # Build the graph
        graph = builder.build()

        result = graph(case.input)
        interactions = graph_extractor.extract_graph_interactions(result)

        return {"interactions": interactions, "trajectory": [node.node_id for node in result.execution_order]}

    ### Step 2: Create test cases ###
    test1 = Case(
        input="Research the impact of AI on healthcare and create a short report",
        expected_interactions=[
            {"node_name": "research", "dependencies": []},
            {"node_name": "fact_check", "dependencies": ["research"]},
            {"node_name": "analysis", "dependencies": ["research"]},
            {"node_name": "report", "dependencies": ["fact_check", "analysis"]},
        ],
    )
    test2 = Case(input="Research the impact of robotics on healthcare and create a short report")

    ### Step 3: Create evaluators ###
    rubric = {
        "research": "The research node should be the starting point and generate a query about the topic.",
        "fact_check": "The fact check node should come after research and verify the accuracy of the generated query.",
        "analysis": "The analysis node should come after research and generate a summary of the findings.",
        "report": "The report node should come after analysis"
        " and fact check and synthesize the information into a coherent report.",
    }
    # Or use a single rubric string instead of per-node criteria
    basic_rubric = (
        "The graph system should utilize the agents as expected with relevant information."
        " The actual interactions should include more information than expected."
    )
    interaction_evaluator = InteractionsEvaluator(rubric=rubric)
    trajectory_eval = TrajectoryEvaluator(rubric=basic_rubric)

    ### Step 4: Create experiments ###
    interaction_experiment = Experiment(cases=[test1, test2], evaluators=[interaction_evaluator])
    trajectory_experiment = Experiment(cases=[test1, test2], evaluators=[trajectory_eval])

    ### Step 5: Run evaluations ###
    interaction_reports = await interaction_experiment.run_evaluations_async(research_graph)
    trajectory_reports = await trajectory_experiment.run_evaluations_async(research_graph)
    interaction_report = interaction_reports[0]
    trajectory_report = trajectory_reports[0]

    return trajectory_report, interaction_report


if __name__ == "__main__":
    # Run the file as a module, e.g. python -m examples.evaluate_graph
    start = datetime.datetime.now()
    trajectory_report, interaction_report = asyncio.run(async_graph_example())
    end = datetime.datetime.now()
    print("Async node interactions", end - start)

    trajectory_report.to_file("research_graph_report_trajectory")
    trajectory_report.display(include_actual_trajectory=True)

    interaction_report.to_file("research_graph_report_interactions")
    interaction_report.display(include_actual_interactions=True)
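As in the third-party example above, the experiments here could also be serialized before running. A minimal sketch of persisting them inside async_graph_example, assuming Experiment.to_file accepts a single name the way it does there (filenames are illustrative):

    # Hypothetical: persist the experiment definitions for reuse (filenames are illustrative).
    interaction_experiment.to_file("research_graph_interactions_dataset")
    trajectory_experiment.to_file("research_graph_trajectory_dataset")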
