 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -23,16 +25,16 @@
 from typing import Union
 import uuid

+from google.genai import types as genai_types
 from pydantic import ValidationError

+from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from .eval_case import IntermediateData
 from .eval_set import EvalSet
-from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
-from .response_evaluator import ResponseEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator

 logger = logging.getLogger("google_adk." + __name__)

@@ -96,6 +98,7 @@ async def evaluate_eval_set(
       criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
+      print_detailed_results: bool = True,
   ):
     """Evaluates an agent using the given EvalSet.

@@ -109,14 +112,22 @@ async def evaluate_eval_set(
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
+      print_detailed_results: Whether to print detailed results for each metric
+        evaluation.
     """
+    try:
+      from .evaluation_generator import EvaluationGenerator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     eval_case_responses_list = await EvaluationGenerator.generate_responses(
         eval_set=eval_set,
         agent_module_path=agent_module,
         repeat_num=num_runs,
         agent_name=agent_name,
     )

+    failures = []
+
     for eval_case_responses in eval_case_responses_list:
       actual_invocations = [
           invocation
@@ -139,10 +150,25 @@ async def evaluate_eval_set(
             )
         )

-        assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
-            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
-            f" but got {evaluation_result.overall_score}."
-        )
+        if print_detailed_results:
+          AgentEvaluator._print_details(
+              evaluation_result=evaluation_result,
+              metric_name=metric_name,
+              threshold=threshold,
+          )
+
+        # Gather all the failures.
+        if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+          failures.append(
+              f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+              f" but got {evaluation_result.overall_score}."
+          )
+
+    assert not failures, (
+        "Following are all the test failures. If you are looking to get more"
+        " details on the failures, please re-run this test with"
+        " `print_detailed_results` set to `True`.\n{}".format("\n".join(failures))
+    )

   @staticmethod
   async def evaluate(
@@ -158,9 +184,10 @@ async def evaluate(
       agent_module: The path to python module that contains the definition of
         the agent. There is convention in place here, where the code is going to
         look for 'root_agent' in the loaded module.
-      eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
-        full path to the file containing eval dataset, or a directory that is
-        recursively explored for all files that have a `.test.json` suffix.
+      eval_dataset_file_path_or_dir: The eval data set. This can be either a
+        string representing the full path to the file containing the eval
+        dataset, or a directory that is recursively explored for all files
+        that have a `.test.json` suffix.
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
@@ -358,6 +385,11 @@ def _validate_input(eval_dataset, criteria):

   @staticmethod
   def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    try:
+      from .response_evaluator import ResponseEvaluator
+      from .trajectory_evaluator import TrajectoryEvaluator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
       return TrajectoryEvaluator(threshold=threshold)
     elif (
@@ -367,3 +399,60 @@ def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
       return ResponseEvaluator(threshold=threshold, metric_name=metric_name)

     raise ValueError(f"Unsupported eval metric: {metric_name}")
+
+  @staticmethod
+  def _print_details(
+      evaluation_result: EvaluationResult, metric_name: str, threshold: float
+  ):
+    try:
+      import pandas as pd
+      from tabulate import tabulate
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+    print(
+        f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
+        f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
+        f" `{evaluation_result.overall_score}`."
+    )
+
+    data = []
+    for per_invocation_result in evaluation_result.per_invocation_results:
+      data.append({
+          "eval_status": per_invocation_result.eval_status,
+          "score": per_invocation_result.score,
+          "threshold": threshold,
+          "prompt": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.user_content
+          ),
+          "expected_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.final_response
+          ),
+          "actual_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.actual_invocation.final_response
+          ),
+          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.expected_invocation.intermediate_data
+          ),
+          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.actual_invocation.intermediate_data
+          ),
+      })
+
+    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
+    print("\n\n")  # Few empty lines for visual clarity
+
+  @staticmethod
+  def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  @staticmethod
+  def _convert_tool_calls_to_text(
+      intermediate_data: Optional[IntermediateData],
+  ) -> str:
+    if intermediate_data and intermediate_data.tool_uses:
+      return "\n".join([str(t) for t in intermediate_data.tool_uses])
+
+    return ""
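
For orientation, here is a minimal sketch of how a test could drive the updated evaluator end to end. It is illustrative only: the agent module path and the `.test.json` file are hypothetical placeholders, the import path is assumed to follow the google-adk package layout, and only the names visible in this diff (`AgentEvaluator.evaluate`, `agent_module`, `eval_dataset_file_path_or_dir`, `num_runs`, and the new `print_detailed_results` flag on `evaluate_eval_set`) are taken from the code.

import asyncio

from google.adk.evaluation.agent_evaluator import AgentEvaluator


async def main():
  # Placeholder fixture paths; point these at a real agent package and eval set.
  # With the changes above, every failing metric is collected and reported in a
  # single assertion message, and per-metric detail tables are printed unless
  # print_detailed_results is set to False on evaluate_eval_set.
  await AgentEvaluator.evaluate(
      agent_module="tests.fixture.sample_agent",
      eval_dataset_file_path_or_dir="tests/fixture/sample.test.json",
      num_runs=2,
  )


if __name__ == "__main__":
  asyncio.run(main())

Note that the imports moved into try/except blocks mean pandas, tabulate, and the evaluator modules are only required when an evaluation actually runs or prints details, so having the eval dependencies installed is an assumption of this sketch as well.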