1313# limitations under the License.
1414
1515import asyncio
16+ import collections
1617from contextlib import asynccontextmanager
1718from datetime import datetime
1819import logging
1920import os
2021import tempfile
22+ from typing import AsyncGenerator
23+ from typing import Coroutine
2124from typing import Optional
2225
2326import click
2730from . import cli_create
2831from . import cli_deploy
2932from .. import version
33+ from ..evaluation .local_eval_set_results_manager import LocalEvalSetResultsManager
34+ from ..sessions .in_memory_session_service import InMemorySessionService
3035from .cli import run_cli
3136from .cli_eval import MISSING_EVAL_DEPENDENCIES_MESSAGE
3237from .fast_api import get_fast_api_app
@@ -306,7 +311,7 @@ def cli_eval(
306311 EvalMetric (metric_name = metric_name , threshold = threshold )
307312 )
308313
309- print (f"Using evaluation creiteria : { evaluation_criteria } " )
314+ print (f"Using evaluation criteria : { evaluation_criteria } " )
310315
311316 root_agent = get_root_agent (agent_module_file_path )
312317 reset_func = try_get_reset_func (agent_module_file_path )
@@ -325,21 +330,47 @@ def cli_eval(
325330 e for e in eval_set .eval_cases if e .eval_id in eval_case_ids
326331 ]
327332
328- eval_set_id_to_eval_cases [eval_set_file_path ] = eval_cases
333+ eval_set_id_to_eval_cases [eval_set . eval_set_id ] = eval_cases
329334
330335 async def _collect_eval_results () -> list [EvalCaseResult ]:
331- return [
332- result
333- async for result in run_evals (
334- eval_set_id_to_eval_cases , root_agent , reset_func , eval_metrics
335- )
336- ]
336+ session_service = InMemorySessionService ()
337+ eval_case_results = []
338+ async for eval_case_result in run_evals (
339+ eval_set_id_to_eval_cases ,
340+ root_agent ,
341+ reset_func ,
342+ eval_metrics ,
343+ session_service = session_service ,
344+ ):
345+ eval_case_result .session_details = await session_service .get_session (
346+ app_name = os .path .basename (agent_module_file_path ),
347+ user_id = eval_case_result .user_id ,
348+ session_id = eval_case_result .session_id ,
349+ )
350+ eval_case_results .append (eval_case_result )
351+ return eval_case_results
337352
338353 try :
339354 eval_results = asyncio .run (_collect_eval_results ())
340355 except ModuleNotFoundError :
341356 raise click .ClickException (MISSING_EVAL_DEPENDENCIES_MESSAGE )
342357
358+ # Write eval set results.
359+ local_eval_set_results_manager = LocalEvalSetResultsManager (
360+ agent_dir = os .path .dirname (agent_module_file_path )
361+ )
362+ eval_set_id_to_eval_results = collections .defaultdict (list )
363+ for eval_case_result in eval_results :
364+ eval_set_id = eval_case_result .eval_set_id
365+ eval_set_id_to_eval_results [eval_set_id ].append (eval_case_result )
366+
367+ for eval_set_id , eval_case_results in eval_set_id_to_eval_results .items ():
368+ local_eval_set_results_manager .save_eval_set_result (
369+ app_name = os .path .basename (agent_module_file_path ),
370+ eval_set_id = eval_set_id ,
371+ eval_case_results = eval_case_results ,
372+ )
373+
343374 print ("*********************************************************************" )
344375 eval_run_summary = {}
345376
0 commit comments