Commit 15ee4af

committed: adding human seeded evals
1 parent f6ae9c9 commit 15ee4af

32 files changed: +5026 -7 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -5,7 +5,9 @@ build/
 dist/
 wheels/
 *.egg-info
+.DS_Store
 
 # Virtual environments
 .venv
 *.svg
+scratch/

human-seeded-evals/README.md

Lines changed: 5 additions & 0 deletions
# Human Seeded Evals Demo

Like evals ... but without all the hard work.

Panacea or pipedream?

human-seeded-evals/app/__init__.py

Whitespace-only changes.

human-seeded-evals/app/agent.py

Lines changed: 35 additions & 0 deletions
from __future__ import annotations as _annotations

from dataclasses import dataclass
from datetime import datetime

from pydantic_ai import Agent, RunContext

from .models import TimeRangeInputs, TimeRangeResponse


@dataclass
class TimeRangeDeps:
    now: datetime


instructions = "Convert the user's request into a structured time range."
time_range_agent = Agent[TimeRangeDeps, TimeRangeResponse](
    'anthropic:claude-sonnet-4-0',
    output_type=TimeRangeResponse,  # type: ignore  # we can't yet annotate something as receiving a TypeForm
    deps_type=TimeRangeDeps,
    instructions=instructions,
    retries=1,
)


@time_range_agent.instructions
def inject_current_time(ctx: RunContext[TimeRangeDeps]) -> str:
    """Add the user's current time and timezone in the format 'Friday, November 22, 2024 11:15:14 PST' to context."""
    return f"The user's current time is {ctx.deps.now:%A, %B %d, %Y %H:%M:%S %Z}."


async def infer_time_range(inputs: TimeRangeInputs) -> TimeRangeResponse:
    """Infer a time range from a user prompt."""
    result = await time_range_agent.run(inputs.prompt, deps=TimeRangeDeps(now=inputs.now))
    return result.output
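
For reference, a minimal sketch of exercising infer_time_range directly, outside the FastAPI app. It assumes the package is importable as app (e.g. running from the human-seeded-evals directory) and that an Anthropic API key is configured; the prompt text is illustrative.

import asyncio
from datetime import datetime, timezone

from app.agent import infer_time_range
from app.models import TimeRangeInputs

# Hypothetical prompt; 'now' anchors relative expressions like 'yesterday'.
inputs = TimeRangeInputs(prompt='yesterday afternoon', now=datetime.now(tz=timezone.utc))
response = asyncio.run(infer_time_range(inputs))
print(response)  # TimeRangeBuilderSuccess or TimeRangeBuilderError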

human-seeded-evals/app/main.py

Lines changed: 16 additions & 0 deletions
import logfire
from fastapi import FastAPI

from .agent import infer_time_range
from .models import TimeRangeInputs, TimeRangeResponse

logfire.configure(environment='dev')
logfire.instrument_pydantic_ai()

app = FastAPI()
logfire.instrument_fastapi(app)


@app.post('/api/timerange')
async def convert_time_range(time_range_inputs: TimeRangeInputs) -> TimeRangeResponse:
    return await infer_time_range(time_range_inputs)
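
A hypothetical request against the endpoint above, assuming the app is served locally (e.g. uvicorn app.main:app) on the default port 8000; httpx is used here purely for illustration.

import httpx

r = httpx.post(
    'http://localhost:8000/api/timerange',
    # 'now' is omitted, so the server defaults it to its current time.
    json={'prompt': 'last Tuesday around 4pm'},
)
print(r.json())  # camelCase keys on success ('startTimestamp', 'endTimestamp'), or an 'error' field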

human-seeded-evals/app/models.py

Lines changed: 39 additions & 0 deletions
from __future__ import annotations as _annotations

from datetime import datetime

from pydantic import AwareDatetime, BaseModel, Field


class TimeRangeBuilderSuccess(BaseModel, use_attribute_docstrings=True):
    """Response when a time range could be successfully generated."""

    start_timestamp: AwareDatetime = Field(serialization_alias='startTimestamp')
    """A datetime in ISO format with timezone offset when the interval starts."""

    end_timestamp: AwareDatetime = Field(serialization_alias='endTimestamp')
    """A datetime in ISO format with timezone offset when the interval ends."""

    explanation: str | None
    """
    A brief explanation of the time range that was selected.

    For example, if a user only mentions a specific point in time, you might explain that you selected a 10 minute
    window around that time.
    """


class TimeRangeBuilderError(BaseModel):
    """Response when a time range cannot be generated."""

    error: str


TimeRangeResponse = TimeRangeBuilderSuccess | TimeRangeBuilderError


class TimeRangeInputs(BaseModel):
    """The inputs for the time range inference agent."""

    prompt: str
    now: AwareDatetime = Field(default_factory=lambda: datetime.now().astimezone())
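
A quick sketch of how the serialization aliases behave: field names are snake_case in Python, camelCase on the wire when dumping with by_alias=True. The timestamps are invented for illustration.

from datetime import datetime, timezone

success = TimeRangeBuilderSuccess(
    start_timestamp=datetime(2024, 11, 22, 15, 55, tzinfo=timezone.utc),
    end_timestamp=datetime(2024, 11, 22, 16, 5, tzinfo=timezone.utc),
    explanation='10 minute window around 4pm UTC',
)
print(success.model_dump(mode='json', by_alias=True))
# {'startTimestamp': '2024-11-22T15:55:00Z', 'endTimestamp': '2024-11-22T16:05:00Z', 'explanation': '...'}
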
Lines changed: 108 additions & 0 deletions
import asyncio
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Literal

import logfire
from logfire.experimental import annotations
from logfire.experimental.query_client import AsyncLogfireQueryClient
from pydantic import BaseModel, TypeAdapter
from pydantic_ai import Agent, format_as_xml

read_token = os.environ['LOGFIRE_READ_TOKEN']
logfire.configure(environment='evals')
logfire.instrument_pydantic_ai()


class EvalFeedback(BaseModel, use_attribute_docstrings=True):
    reaction: Literal['positive', 'negative']
    comment: str | None = None
    """Very concise comment for the evaluation"""


prompt_path = Path(__file__).parent / 'eval_agent_prompt.txt'
evals_agent = Agent(
    'anthropic:claude-sonnet-4-0',
    instructions=prompt_path.read_text(),
    output_type=EvalFeedback,
)

# Recent time_range_agent runs: the user prompt and structured output per span.
runs_query = """
select
    created_at,
    trace_id,
    span_id,
    attributes->'all_messages_events'->1->>'content' as prompt,
    attributes->'final_result' as output
from records
where otel_scope_name = 'pydantic-ai' and message = 'time_range_agent run'
"""

# Spans that already carry an annotation, identified by their traceparent.
with_annotations_query = """
select
    '00-' || trace_id || '-' || parent_span_id || '-01' as trace_parent
from records
where kind='annotation'
"""


class RunData(BaseModel):
    created_at: datetime
    trace_id: str
    span_id: str
    prompt: str
    output: Any

    @property
    def trace_parent(self):
        # W3C traceparent format: version-trace_id-span_id-flags
        return f'00-{self.trace_id}-{self.span_id}-01'


run_data_list_schema = TypeAdapter(list[RunData])


async def apply_feedback(run: RunData):
    if run.output is None:
        return
    r = await evals_agent.run(
        format_as_xml({'run_timestamp': run.created_at, 'prompt': run.prompt, 'output': run.output})
    )
    print(f'Adding feedback to {run.trace_parent}: {r.output}')
    annotations.record_feedback(
        run.trace_parent,
        'AI Annotation',
        value=r.output.reaction,
        comment=r.output.comment,
        extra={'path': ''},
    )


async def main():
    # Poll Logfire for new runs, skipping any that have already been annotated.
    min_timestamp = datetime.now(tz=timezone.utc) - timedelta(minutes=30)
    async with AsyncLogfireQueryClient(read_token) as client:
        while True:
            response = await client.query_json_rows(runs_query, min_timestamp=min_timestamp)
            runs = run_data_list_schema.validate_python(response['rows'])
            if runs:
                response = await client.query_json_rows(with_annotations_query, min_timestamp=min_timestamp)
                annotated_spans: set[str] = {r['trace_parent'] for r in response['rows']}
                runs = [run for run in runs if run.trace_parent not in annotated_spans]
            if runs:
                print('')
                logfire.info('found {runs} new runs to evaluate', runs=len(runs))
                min_timestamp = min(runs, key=lambda run: run.created_at).created_at.astimezone(timezone.utc)
                await asyncio.gather(*[apply_feedback(run) for run in runs])
                await asyncio.sleep(2)
                continue

            min_timestamp = datetime.now(tz=timezone.utc) - timedelta(minutes=1)
            print('.', end='', flush=True)

            await asyncio.sleep(2)


if __name__ == '__main__':
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print('stopping')
Lines changed: 25 additions & 0 deletions
You are an evaluation agent responsible for assessing the performance of the time_range_agent. The time_range_agent converts user requests into structured time ranges with start and end timestamps.

Your task is to evaluate whether the time_range_agent correctly interprets temporal requests and generates appropriate time ranges according to these criteria:

1. **Temporal Interpretation**: The agent should correctly identify the time period referenced by the user (e.g., "yesterday", "last Monday", "4pm", etc.)

2. **Past Time Constraint**: Time ranges must be in the past relative to the provided context timestamp. Future time requests should return an error.

3. **Timezone Handling**: The agent should handle timezone specifications correctly (e.g., "ET", "BST") and default to an appropriate timezone when not specified.

4. **Range Generation**:
   - For full day requests (e.g., "Monday", "yesterday"): Generate ranges from 00:00:00 to 23:59:59
   - For specific time points (e.g., "4pm", "9am"): Generate a 10-minute window around the specified time
   - The explanation should clearly describe the selected time range

5. **Error Handling**: When a valid time range cannot be generated (e.g., future dates), the agent should return an error response.

Evaluate each agent output by checking:
- Is the interpreted time period correct given the user's request and context timestamp?
- Are the start and end timestamps properly formatted with timezone information?
- Is the time range in the past relative to the context timestamp?
- Is the explanation clear and accurate?
- Are errors properly returned when appropriate?

Provide concise, specific feedback identifying what the agent did correctly or incorrectly. Focus on the most important issues that would impact the usability of the time range.
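
For concreteness, a hypothetical sketch of the structured verdict this prompt is meant to elicit; EvalFeedback mirrors the output model defined in the polling script above, and the run and verdict are invented.

from typing import Literal

from pydantic import BaseModel


class EvalFeedback(BaseModel):
    reaction: Literal['positive', 'negative']
    comment: str | None = None


# e.g. for the prompt 'yesterday' answered with a full 00:00:00-23:59:59 range:
feedback = EvalFeedback(reaction='positive', comment='Correct full-day range for "yesterday".')
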
Lines changed: 123 additions & 0 deletions
import asyncio
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Literal

import logfire
from logfire.experimental.query_client import AsyncLogfireQueryClient
from pydantic import BaseModel, TypeAdapter
from pydantic_ai import Agent, format_as_xml

sys.path.append(str(Path(__file__).parent.parent))

from app import agent

read_token = os.environ['LOGFIRE_READ_TOKEN']
logfire.configure(environment='evals')
logfire.instrument_pydantic_ai()

auto_annotation_agent = Agent(
    'anthropic:claude-opus-4-0',
    instructions="""
Your task is to build a system prompt for an agent (the evals agent) which will evaluate the performance of another
agent and provide feedback on its performance.

You should return the system prompt for the evals agent ONLY.
""",
)


class RunFeedback(BaseModel):
    reaction: Literal['positive', 'negative'] | None
    comment: str | None


class AgentRunSummary(BaseModel):
    prompt: str
    context: Any
    output: Any
    feedback: RunFeedback | None = None


count_runs_query = "select count(*) from records where message = 'time_range_agent run'"
# Agent runs with their prompt, output, and a context timestamp.
runs_query = """
select
    trace_id,
    span_id,
    'time timestamp: ' || created_at as context,
    attributes->'all_messages_events'->1->>'content' as prompt,
    attributes->'final_result' as output
from records
where message = 'time_range_agent run'
"""
# Human feedback annotations recorded against those runs.
feedback_query = """
select
    trace_id,
    parent_span_id,
    attributes->>'Annotation' as reaction,
    attributes->>'logfire.feedback.comment' as comment
from records
where kind='annotation' and attributes->>'logfire.feedback.name'='Annotation'
"""
min_count = 1


async def get_runs() -> None | list[AgentRunSummary]:
    min_timestamp = datetime(2025, 7, 2)
    async with AsyncLogfireQueryClient(read_token) as client:
        c = await client.query_json(sql=count_runs_query, min_timestamp=min_timestamp)
        count = c['columns'][0]['values'][0]
        if count < min_count:
            print(f'Insufficient runs ({count})')
            return

        r = await client.query_json_rows(sql=feedback_query, min_timestamp=min_timestamp)
        feedback_lookup: dict[str, Any] = {
            f'{row["trace_id"]}-{row["parent_span_id"]}': RunFeedback(**row) for row in r['rows']
        }

        r = await client.query_json_rows(sql=runs_query, min_timestamp=min_timestamp)
        runs: list[AgentRunSummary] = []
        with_feedback = 0
        for row in r['rows']:
            key = f'{row["trace_id"]}-{row["span_id"]}'
            if feedback := feedback_lookup.get(key):
                row['feedback'] = feedback
                with_feedback += 1
            runs.append(AgentRunSummary(**row))

        logfire.info(f'Found {len(runs)} runs, {with_feedback} with feedback')
        return runs


async def generate_evals_prompt(
    name: str, instructions: str, output_type: type[Any] | None, runs: list[AgentRunSummary]
) -> str:
    data: dict[str, Any] = {'agent_name': name, 'agent_instructions': instructions}
    if output_type is not None:
        data['output_schema'] = json.dumps(TypeAdapter(output_type).json_schema(), indent=2)
    data['agent_runs'] = [run.model_dump(exclude_none=True) for run in runs]
    prompt = format_as_xml(data, include_root_tag=False)
    r = await auto_annotation_agent.run(prompt)
    return r.output


async def main():
    runs = await get_runs()
    if runs:
        prompt = await generate_evals_prompt(
            'time_range_agent',
            agent.instructions,
            agent.TimeRangeResponse,  # type: ignore
            runs,
        )
        prompt_path = Path(__file__).parent / 'eval_agent_prompt.txt'
        prompt_path.write_text(prompt)
        print(f'prompt written to {prompt_path}')


if __name__ == '__main__':
    asyncio.run(main())
Lines changed: 9 additions & 0 deletions
{
  "permissions": {
    "allow": [
      "Bash(npm run typecheck:*)",
      "Bash(npm run lint)"
    ],
    "deny": []
  }
}
