
Commit b17c8f1

ankursharmas authored and copybara-github committed
chore: Marked expected_invocation as optional field on evaluator interface
ADK already has a set of metrics that don't rely on expected_invocations. Also, for eval cases with a conversation scenario, this would be the mainline case.

PiperOrigin-RevId: 825101481
1 parent 9ab17f2 commit b17c8f1

15 files changed, +281 -101 lines
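
For orientation, here is a minimal sketch of what the relaxed interface permits: a reference-free metric implementing Evaluator.evaluate_invocations without any golden invocations. The ResponseLengthEvaluator class and its scoring rule are hypothetical; only the Evaluator, EvaluationResult, and PerInvocationResult names and the optional expected_invocations parameter come from this commit.

from typing import Optional

from google.adk.evaluation.evaluator import EvaluationResult
from google.adk.evaluation.evaluator import Evaluator
from google.adk.evaluation.evaluator import PerInvocationResult


class ResponseLengthEvaluator(Evaluator):
  """Hypothetical metric that needs no expected (golden) invocations."""

  def evaluate_invocations(
      self,
      actual_invocations,
      expected_invocations: Optional[list] = None,
  ) -> EvaluationResult:
    # Score each actual invocation on its own; expected_invocations may be None.
    results = [
        PerInvocationResult(actual_invocation=actual, score=1.0)
        for actual in actual_invocations
    ]
    return EvaluationResult(per_invocation_results=results)

Metrics that do need references keep working: as final_response_match_v1.py now does, they can simply raise when expected_invocations is None.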

src/google/adk/cli/cli_eval.py

Lines changed: 9 additions & 7 deletions
@@ -210,21 +210,23 @@ def pretty_print_eval_result(eval_result: EvalCaseResult):
 
   data = []
   for per_invocation_result in eval_result.eval_metric_result_per_invocation:
+    actual_invocation = per_invocation_result.actual_invocation
+    expected_invocation = per_invocation_result.expected_invocation
     row_data = {
-        "prompt": _convert_content_to_text(
-            per_invocation_result.expected_invocation.user_content
-        ),
+        "prompt": _convert_content_to_text(actual_invocation.user_content),
         "expected_response": _convert_content_to_text(
-            per_invocation_result.expected_invocation.final_response
+            expected_invocation.final_response if expected_invocation else None
         ),
         "actual_response": _convert_content_to_text(
-            per_invocation_result.actual_invocation.final_response
+            actual_invocation.final_response
         ),
         "expected_tool_calls": _convert_tool_calls_to_text(
-            per_invocation_result.expected_invocation.intermediate_data
+            expected_invocation.intermediate_data
+            if expected_invocation
+            else None
         ),
         "actual_tool_calls": _convert_tool_calls_to_text(
-            per_invocation_result.actual_invocation.intermediate_data
+            actual_invocation.intermediate_data
         ),
     }
     for metric_result in per_invocation_result.eval_metric_results:

src/google/adk/evaluation/eval_metrics.py

Lines changed: 3 additions & 2 deletions
@@ -216,10 +216,11 @@ class EvalMetricResultPerInvocation(EvalBaseModel):
           )
       )
 
-  expected_invocation: Invocation = Field(
+  expected_invocation: Optional[Invocation] = Field(
+      default=None,
       description=(
           "The expected invocation, usually the reference or golden invocation."
-      )
+      ),
   )
 
   eval_metric_results: list[EvalMetricResult] = Field(
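
The same widening pattern, shown on a tiny standalone Pydantic model (not the ADK classes) to make the compatibility story concrete: adding default=None means payloads that omit the field still validate, while callers that already set it are unaffected.

from typing import Optional

from pydantic import BaseModel, Field


class ResultRow(BaseModel):
  # Before: `reference: str = Field(description=...)` -- required.
  # After: optional, so rows without a golden reference still parse.
  reference: Optional[str] = Field(
      default=None,
      description="The expected value, if a golden reference exists.",
  )


print(ResultRow())                         # reference=None
print(ResultRow(reference="golden text"))  # unchanged for existing callers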

src/google/adk/evaluation/evaluator.py

Lines changed: 12 additions & 3 deletions
@@ -33,7 +33,7 @@ class PerInvocationResult(BaseModel):
   """Metric evaluation score per invocation."""
 
   actual_invocation: Invocation
-  expected_invocation: Invocation
+  expected_invocation: Optional[Invocation] = None
   score: Optional[float] = None
   eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
   rubric_scores: Optional[list[RubricScore]] = None
@@ -61,7 +61,16 @@ class Evaluator(ABC):
   def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
   ) -> EvaluationResult:
-    """Returns EvaluationResult after performing evaluations using actual and expected invocations."""
+    """Returns EvaluationResult after performing evaluations using actual and expected invocations.
+
+    Args:
+      actual_invocations: The invocations obtained from the agent under test.
+      expected_invocations: An optional list of invocations that, if specified,
+        usually acts as the benchmark/golden response. When specified, this
+        list is usually expected to have the same length as
+        actual_invocations.
+    """
     raise NotImplementedError()

src/google/adk/evaluation/final_response_match_v1.py

Lines changed: 4 additions & 1 deletion
@@ -59,8 +59,11 @@ def get_metric_info() -> MetricInfo:
   def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
   ) -> EvaluationResult:
+    if expected_invocations is None:
+      raise ValueError("expected_invocations is required for this metric.")
+
     total_score = 0.0
     num_invocations = 0
     per_invocation_results = []

src/google/adk/evaluation/final_response_match_v2.py

Lines changed: 11 additions & 2 deletions
@@ -147,7 +147,11 @@ def __init__(
       self,
       eval_metric: EvalMetric,
   ):
-    super().__init__(eval_metric, FinalResponseMatchV2Evaluator.criterion_type)
+    super().__init__(
+        eval_metric,
+        FinalResponseMatchV2Evaluator.criterion_type,
+        expected_invocations_required=True,
+    )
     self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT
 
   @staticmethod
@@ -166,8 +170,13 @@ def get_metric_info() -> MetricInfo:
 
   @override
   def format_auto_rater_prompt(
-      self, actual_invocation: Invocation, expected_invocation: Invocation
+      self,
+      actual_invocation: Invocation,
+      expected_invocation: Optional[Invocation],
   ) -> str:
+    if expected_invocation is None:
+      raise ValueError("expected_invocation is required for this metric.")
+
     reference = get_text_from_content(expected_invocation.final_response)
     response = get_text_from_content(actual_invocation.final_response)
     user_prompt = get_text_from_content(expected_invocation.user_content)

src/google/adk/evaluation/hallucinations_v1.py

Lines changed: 12 additions & 3 deletions
@@ -395,7 +395,8 @@ def _create_context_for_step(
       },
       {
           "name": "get_weather",
-          "description": '''Gets the weather of the given place at the given time.
+          "description": '''Gets the weather of the given place at the given
+          time.
 
 Args:
   location: The location for which to retrieve weather information.
@@ -408,7 +409,8 @@ def _create_context_for_step(
           "type": "object",
           "properties": {
               "location": {
-                  "description": "The location for which to retrieve weather information.",
+                  "description": "The location for which to retrieve weather
+                  information.",
                   "type": "string"
               },
               "time": {
@@ -711,8 +713,15 @@ def _aggregate_invocation_results(
   async def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
   ) -> EvaluationResult:
+    # expected_invocations are not required by this metric; if they are not
+    # supplied, we provide a list of None to the rest of the code.
+    expected_invocations = (
+        [None] * len(actual_invocations)
+        if expected_invocations is None
+        else expected_invocations
+    )
     per_invocation_results = []
     for actual, expected in zip(actual_invocations, expected_invocations):
       step_evaluations = self._get_steps_to_evaluate(actual)

src/google/adk/evaluation/llm_as_judge.py

Lines changed: 18 additions & 3 deletions
@@ -60,9 +60,13 @@ class LlmAsJudge(Evaluator):
   """
 
   def __init__(
-      self, eval_metric: EvalMetric, criterion_type: type[BaseCriterion]
+      self,
+      eval_metric: EvalMetric,
+      criterion_type: type[BaseCriterion],
+      expected_invocations_required=False,
   ):
     self._eval_metric = eval_metric
+    self._expected_invocations_required = expected_invocations_required
 
     expected_criterion_type_error = ValueError(
         f"`{eval_metric.metric_name}` metric expects a criterion of type"
@@ -84,7 +88,7 @@ def __init__(
 
   @abstractmethod
   def format_auto_rater_prompt(
-      self, actual: Invocation, expected: Invocation
+      self, actual: Invocation, expected: Optional[Invocation]
   ) -> str:
     """Formats the auto-rater prompt to evaluate the given invocation."""
 
@@ -112,8 +116,19 @@ def aggregate_invocation_results(
   async def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
   ) -> EvaluationResult:
+    if self._expected_invocations_required and expected_invocations is None:
+      raise ValueError("expected_invocations is needed by this metric.")
+
+    # If expected_invocations are not required by the metric and they are not
+    # supplied, we provide a list of None.
+    expected_invocations = (
+        [None] * len(actual_invocations)
+        if expected_invocations is None
+        else expected_invocations
+    )
+
     per_invocation_results = []
     for actual, expected in zip(actual_invocations, expected_invocations):
       auto_rater_prompt = self.format_auto_rater_prompt(actual, expected)
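
Condensed restatement of the two knobs added above, as a standalone helper (illustrative only, not part of the ADK API): a judge that opted in via expected_invocations_required rejects a missing reference list, while any other judge gets a list of None so the per-invocation zip still lines up.

from typing import Optional, Sequence


def _prepare_expected(
    actual: Sequence,
    expected: Optional[Sequence],
    required: bool = False,
) -> Sequence:
  if required and expected is None:
    raise ValueError("expected_invocations is needed by this metric.")
  # Reference-free metrics are paired with None placeholders.
  return [None] * len(actual) if expected is None else expected


# A reference-free judge: every actual turn is paired with None.
pairs = list(zip(["turn-1", "turn-2"], _prepare_expected(["turn-1", "turn-2"], None)))
assert pairs == [("turn-1", None), ("turn-2", None)]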

src/google/adk/evaluation/local_eval_service.py

Lines changed: 42 additions & 55 deletions
@@ -22,8 +22,6 @@
 from typing import Optional
 import uuid
 
-from google.genai.types import Content
-from google.genai.types import Part
 from typing_extensions import override
 
 from ..agents.base_agent import BaseAgent
@@ -51,6 +49,7 @@
 from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
+from .evaluator import PerInvocationResult
 from .metric_evaluator_registry import DEFAULT_METRIC_EVALUATOR_REGISTRY
 from .metric_evaluator_registry import MetricEvaluatorRegistry
 from .user_simulator_provider import UserSimulatorProvider
@@ -222,69 +221,51 @@ async def _evaluate_single_inference_result(
         else 'test_user_id'
     )
 
-    if eval_case.conversation_scenario:
-      logger.warning(
-          'Skipping evaluation of variable-length conversation scenario in eval'
-          ' set/case %s/%s.',
-          inference_result.eval_set_id,
-          inference_result.eval_case_id,
-      )
-      for actual_invocation in inference_result.inferences:
-        eval_metric_result_per_invocation.append(
-            EvalMetricResultPerInvocation(
-                actual_invocation=actual_invocation,
-                expected_invocation=Invocation(
-                    user_content=actual_invocation.user_content,
-                    final_response=Content(
-                        parts=[Part(text='N/A')], role='model'
-                    ),
-                ),
-            )
-        )
-      eval_case_result = EvalCaseResult(
-          eval_set_file=inference_result.eval_set_id,
-          eval_set_id=inference_result.eval_set_id,
-          eval_id=inference_result.eval_case_id,
-          final_eval_status=EvalStatus.NOT_EVALUATED,
-          overall_eval_metric_results=overall_eval_metric_results,
-          eval_metric_result_per_invocation=eval_metric_result_per_invocation,
-          session_id=inference_result.session_id,
-          session_details=await self._session_service.get_session(
-              app_name=inference_result.app_name,
-              user_id=user_id,
-              session_id=inference_result.session_id,
-          ),
-          user_id=user_id,
-      )
-      return (inference_result, eval_case_result)
-
-    if len(inference_result.inferences) != len(eval_case.conversation):
+    if eval_case.conversation_scenario is None and len(
+        inference_result.inferences
+    ) != len(eval_case.conversation):
       raise ValueError(
          'Inferences should match conversations in eval case. Found'
          f'{len(inference_result.inferences)} inferences '
          f'{len(eval_case.conversation)} conversations in eval cases.'
      )
 
     # Pre-creating the EvalMetricResults entries for each invocation.
-    for actual, expected in zip(
-        inference_result.inferences, eval_case.conversation
-    ):
+    for idx, actual in enumerate(inference_result.inferences):
       eval_metric_result_per_invocation.append(
           EvalMetricResultPerInvocation(
              actual_invocation=actual,
-              expected_invocation=expected,
+              expected_invocation=eval_case.conversation[idx]
+              if eval_case.conversation
+              else None,
              # We will fill this as we evaluate each metric per invocation.
              eval_metric_results=[],
          )
      )
 
     for eval_metric in evaluate_config.eval_metrics:
       # Perform evaluation of the metric.
-      evaluation_result = await self._evaluate_metric(
-          eval_metric=eval_metric,
-          actual_invocations=inference_result.inferences,
-          expected_invocations=eval_case.conversation,
-      )
+      try:
+        evaluation_result = await self._evaluate_metric(
+            eval_metric=eval_metric,
+            actual_invocations=inference_result.inferences,
+            expected_invocations=eval_case.conversation,
+        )
+      except Exception as e:
+        # We intentionally catch the Exception as we don't want failures to
+        # affect other metric evaluations.
+        logger.error(
+            "Metric evaluation failed for metric `%s` for eval case id '%s'"
+            ' with following error `%s`',
+            eval_metric.metric_name,
+            eval_case.eval_id,
+            e,
+            exc_info=True,
+        )
+        # We use an empty result.
+        evaluation_result = EvaluationResult(
+            overall_eval_status=EvalStatus.NOT_EVALUATED
+        )
 
       # Track overall score across all invocations.
       eval_metric_result_details = EvalMetricResultDetails(
@@ -299,8 +280,10 @@ async def _evaluate_single_inference_result(
           )
       )
 
-      if len(evaluation_result.per_invocation_results) != len(
-          eval_metric_result_per_invocation
+      if (
+          evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
+          and len(evaluation_result.per_invocation_results)
+          != len(eval_metric_result_per_invocation)
      ):
        raise ValueError(
            'Eval metric should return results for each invocation. Found '
@@ -309,10 +292,14 @@ async def _evaluate_single_inference_result(
       )
 
       # Track score across individual invocations.
-      for invocation_result, invocation in zip(
-          evaluation_result.per_invocation_results,
-          eval_metric_result_per_invocation,
-      ):
+      for idx, invocation in enumerate(eval_metric_result_per_invocation):
+        invocation_result = (
+            evaluation_result.per_invocation_results[idx]
+            if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
+            else PerInvocationResult(
+                actual_invocation=invocation.actual_invocation
+            )
+        )
        eval_metric_result_details = EvalMetricResultDetails(
            rubric_scores=invocation_result.rubric_scores
        )
@@ -351,7 +338,7 @@ async def _evaluate_metric(
       self,
       eval_metric: EvalMetric,
       actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
   ) -> EvaluationResult:
     """Returns EvaluationResult obtained from evaluating a metric using an Evaluator."""
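
The try/except block above is the main behavioral change in this file: one failing metric no longer aborts the whole eval case, it simply yields NOT_EVALUATED placeholders while the remaining metrics still run. A simplified, self-contained sketch of that isolation pattern (function and variable names here are illustrative, not the ADK API):

import logging

logger = logging.getLogger(__name__)


def run_all_metrics(metric_fns, actual_invocations):
  """Runs every metric; a metric that raises produces a placeholder result."""
  results = {}
  for name, metric_fn in metric_fns.items():
    try:
      results[name] = metric_fn(actual_invocations)
    except Exception as e:  # broad on purpose, mirroring the diff above
      logger.error("Metric `%s` failed: %s", name, e, exc_info=True)
      results[name] = None  # stands in for an empty NOT_EVALUATED result
  return results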

src/google/adk/evaluation/response_evaluator.py

Lines changed: 4 additions & 2 deletions
@@ -100,7 +100,7 @@ def get_metric_info(metric_name: str) -> MetricInfo:
   def evaluate_invocations(
       self,
       actual_invocations: list[Invocation],
-      expected_invocations: list[Invocation],
+      expected_invocations: Optional[list[Invocation]],
   ) -> EvaluationResult:
     # If the metric is response_match_score, just use the RougeEvaluator.
     if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value:
@@ -112,5 +112,7 @@ def evaluate_invocations(
       )
 
     return _VertexAiEvalFacade(
-        threshold=self._threshold, metric_name=self._metric_name
+        threshold=self._threshold,
+        metric_name=self._metric_name,
+        expected_invocations_required=True,
     ).evaluate_invocations(actual_invocations, expected_invocations)

src/google/adk/evaluation/rubric_based_final_response_quality_v1.py

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 
 import logging
 from typing import ClassVar
+from typing import Optional
 
 from typing_extensions import override
 
@@ -281,7 +282,7 @@ def get_metric_info() -> MetricInfo:
 
   @override
   def format_auto_rater_prompt(
-      self, actual_invocation: Invocation, _: Invocation
+      self, actual_invocation: Invocation, _: Optional[Invocation]
   ) -> str:
     """Returns the autorater prompt."""