 from typing import Optional
 import uuid

-from google.genai.types import Content
-from google.genai.types import Part
 from typing_extensions import override

 from ..agents.base_agent import BaseAgent
 from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
+from .evaluator import PerInvocationResult
 from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
 from .metric_evaluator_registry import MetricEvaluatorRegistry
 from .user_simulator_provider import UserSimulatorProvider
@@ -222,69 +221,51 @@ async def _evaluate_single_inference_result(
         else 'test_user_id'
     )

-    if eval_case.conversation_scenario:
-      logger.warning(
-          'Skipping evaluation of variable-length conversation scenario in eval'
-          ' set/case %s/%s.',
-          inference_result.eval_set_id,
-          inference_result.eval_case_id,
-      )
-      for actual_invocation in inference_result.inferences:
-        eval_metric_result_per_invocation.append(
-            EvalMetricResultPerInvocation(
-                actual_invocation=actual_invocation,
-                expected_invocation=Invocation(
-                    user_content=actual_invocation.user_content,
-                    final_response=Content(
-                        parts=[Part(text='N/A')], role='model'
-                    ),
-                ),
-            )
-        )
-      eval_case_result = EvalCaseResult(
-          eval_set_file=inference_result.eval_set_id,
-          eval_set_id=inference_result.eval_set_id,
-          eval_id=inference_result.eval_case_id,
-          final_eval_status=EvalStatus.NOT_EVALUATED,
-          overall_eval_metric_results=overall_eval_metric_results,
-          eval_metric_result_per_invocation=eval_metric_result_per_invocation,
-          session_id=inference_result.session_id,
-          session_details=await self._session_service.get_session(
-              app_name=inference_result.app_name,
-              user_id=user_id,
-              session_id=inference_result.session_id,
-          ),
-          user_id=user_id,
-      )
-      return (inference_result, eval_case_result)
-
-    if len(inference_result.inferences) != len(eval_case.conversation):
+    if eval_case.conversation_scenario is None and len(
+        inference_result.inferences
+    ) != len(eval_case.conversation):
       raise ValueError(
           'Inferences should match conversations in eval case. Found'
           f'{len(inference_result.inferences)} inferences '
           f'{len(eval_case.conversation)} conversations in eval cases.'
       )

     # Pre-creating the EvalMetricResults entries for each invocation.
-    for actual, expected in zip(
-        inference_result.inferences, eval_case.conversation
-    ):
+    for idx, actual in enumerate(inference_result.inferences):
       eval_metric_result_per_invocation.append(
           EvalMetricResultPerInvocation(
               actual_invocation=actual,
-              expected_invocation=expected,
+              expected_invocation=eval_case.conversation[idx]
+              if eval_case.conversation
+              else None,
               # We will fill this as we evaluate each metric per invocation.
               eval_metric_results=[],
           )
       )

     for eval_metric in evaluate_config.eval_metrics:
       # Perform evaluation of the metric.
-      evaluation_result = await self._evaluate_metric(
-          eval_metric=eval_metric,
-          actual_invocations=inference_result.inferences,
-          expected_invocations=eval_case.conversation,
-      )
+      try:
+        evaluation_result = await self._evaluate_metric(
+            eval_metric=eval_metric,
+            actual_invocations=inference_result.inferences,
+            expected_invocations=eval_case.conversation,
+        )
+      except Exception as e:
+        # We intentionally catch the Exception as we don't want failures to
+        # affect other metric evaluation.
+        logger.error(
+            "Metric evaluation failed for metric `%s` for eval case id '%s'"
+            ' with following error `%s`',
+            eval_metric.metric_name,
+            eval_case.eval_id,
+            e,
+            exc_info=True,
+        )
+        # We use an empty result.
+        evaluation_result = EvaluationResult(
+            overall_eval_status=EvalStatus.NOT_EVALUATED
+        )

       # Track overall scrore across all invocations.
       eval_metric_result_details = EvalMetricResultDetails(
@@ -299,8 +280,10 @@ async def _evaluate_single_inference_result(
           )
       )

-      if len(evaluation_result.per_invocation_results) != len(
-          eval_metric_result_per_invocation
+      if (
+          evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
+          and len(evaluation_result.per_invocation_results)
+          != len(eval_metric_result_per_invocation)
       ):
         raise ValueError(
             'Eval metric should return results for each invocation. Found '
@@ -309,10 +292,14 @@ async def _evaluate_single_inference_result(
         )

       # Track score across individual invocations.
-      for invocation_result, invocation in zip(
-          evaluation_result.per_invocation_results,
-          eval_metric_result_per_invocation,
-      ):
+      for idx, invocation in enumerate(eval_metric_result_per_invocation):
+        invocation_result = (
+            evaluation_result.per_invocation_results[idx]
+            if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
+            else PerInvocationResult(
+                actual_invocation=invocation.actual_invocation
+            )
+        )
         eval_metric_result_details = EvalMetricResultDetails(
             rubric_scores=invocation_result.rubric_scores
         )
@@ -351,7 +338,7 @@ async def _evaluate_metric(
       self,
       eval_metric: EvalMetric,
       actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
   ) -> EvaluationResult:
     """Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""

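The try/except added above follows a self-contained pattern: evaluate each metric independently, and if one evaluator raises, log the failure and substitute an empty result with NOT_EVALUATED status so the remaining metrics still run. Below is a minimal sketch of that pattern; `Status`, `Result`, and `evaluate_all_metrics` are simplified, hypothetical stand-ins for illustration, not the actual ADK classes (`EvalStatus`, `EvaluationResult`, `LocalEvalService`).

# Minimal sketch of the per-metric error isolation shown in the diff above.
# `Status`, `Result`, and `evaluate_all_metrics` are illustrative stand-ins,
# not the real ADK types.
import enum
import logging
from dataclasses import dataclass, field
from typing import Callable

logger = logging.getLogger(__name__)


class Status(enum.Enum):
  NOT_EVALUATED = 0
  PASSED = 1
  FAILED = 2


@dataclass
class Result:
  overall_status: Status
  per_invocation: list = field(default_factory=list)


def evaluate_all_metrics(
    metric_names: list[str],
    evaluate_one: Callable[[str], Result],
) -> dict[str, Result]:
  """Evaluates each metric independently.

  A failure in one evaluator is logged and replaced with an empty
  NOT_EVALUATED result, so the remaining metrics still run.
  """
  results: dict[str, Result] = {}
  for name in metric_names:
    try:
      results[name] = evaluate_one(name)
    except Exception:
      # Intentionally broad: one failing metric must not abort the others.
      logger.exception('Metric evaluation failed for `%s`', name)
      results[name] = Result(overall_status=Status.NOT_EVALUATED)
  return results

For example, evaluate_all_metrics(['metric_a', 'metric_b'], run_metric) still returns one result per metric even if run_metric raises for one of them, mirroring how the service above keeps evaluating the remaining eval metrics.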