 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import annotations
+
 import json
 import logging
 import os
@@ -23,16 +25,16 @@
 from typing import Union
 import uuid

+from google.genai import types as genai_types
 from pydantic import ValidationError

+from .constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
+from .eval_case import IntermediateData
 from .eval_set import EvalSet
-from .evaluation_generator import EvaluationGenerator
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .local_eval_sets_manager import convert_eval_set_to_pydanctic_schema
-from .response_evaluator import ResponseEvaluator
-from .trajectory_evaluator import TrajectoryEvaluator

 logger = logging.getLogger("google_adk." + __name__)

@@ -96,6 +98,7 @@ async def evaluate_eval_set(
       criteria: dict[str, float],
       num_runs=NUM_RUNS,
       agent_name=None,
+      print_detailed_results: bool = True,
   ):
     """Evaluates an agent using the given EvalSet.

@@ -109,14 +112,22 @@ async def evaluate_eval_set(
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
+      print_detailed_results: Whether to print detailed results for each metric
+        evaluation.
     """
+    try:
+      from .evaluation_generator import EvaluationGenerator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     eval_case_responses_list = await EvaluationGenerator.generate_responses(
         eval_set=eval_set,
         agent_module_path=agent_module,
         repeat_num=num_runs,
         agent_name=agent_name,
     )

+    failures = []
+
     for eval_case_responses in eval_case_responses_list:
       actual_invocations = [
           invocation
@@ -139,10 +150,25 @@ async def evaluate_eval_set(
             )
         )

-        assert evaluation_result.overall_eval_status == EvalStatus.PASSED, (
-            f"{metric_name} for {agent_module} Failed. Expected {threshold},"
-            f" but got {evaluation_result.overall_score}."
-        )
+        if print_detailed_results:
+          AgentEvaluator._print_details(
+              evaluation_result=evaluation_result,
+              metric_name=metric_name,
+              threshold=threshold,
+          )
+
+        # Gather all the failures.
+        if evaluation_result.overall_eval_status != EvalStatus.PASSED:
+          failures.append(
+              f"{metric_name} for {agent_module} Failed. Expected {threshold},"
+              f" but got {evaluation_result.overall_score}."
+          )
+
+    assert not failures, (
+        "Following are all the test failures. If you are looking to get more"
+        " details on the failures, please re-run this test with"
+        " `print_detailed_results` set to `True`.\n{}".format("\n".join(failures))
+    )

   @staticmethod
   async def evaluate(
@@ -158,9 +184,10 @@ async def evaluate(
       agent_module: The path to python module that contains the definition of
         the agent. There is convention in place here, where the code is going to
         look for 'root_agent' in the loaded module.
-      eval_dataset_file_path_or_dir: The eval data set. This can be either a string representing
-        full path to the file containing eval dataset, or a directory that is
-        recursively explored for all files that have a `.test.json` suffix.
+      eval_dataset_file_path_or_dir: The eval data set. This can be either a
+        string representing the full path to the file containing the eval
+        dataset, or a directory that is recursively explored for all files
+        that have a `.test.json` suffix.
       num_runs: Number of times all entries in the eval dataset should be
         assessed.
       agent_name: The name of the agent.
@@ -358,6 +385,11 @@ def _validate_input(eval_dataset, criteria):

   @staticmethod
   def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
+    try:
+      from .response_evaluator import ResponseEvaluator
+      from .trajectory_evaluator import TrajectoryEvaluator
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
     if metric_name == TOOL_TRAJECTORY_SCORE_KEY:
       return TrajectoryEvaluator(threshold=threshold)
     elif (
@@ -367,3 +399,60 @@ def _get_metric_evaluator(metric_name: str, threshold: float) -> Evaluator:
       return ResponseEvaluator(threshold=threshold, metric_name=metric_name)

     raise ValueError(f"Unsupported eval metric: {metric_name}")
+
+  @staticmethod
+  def _print_details(
+      evaluation_result: EvaluationResult, metric_name: str, threshold: float
+  ):
+    try:
+      import pandas as pd
+      from tabulate import tabulate
+    except ModuleNotFoundError as e:
+      raise ModuleNotFoundError(MISSING_EVAL_DEPENDENCIES_MESSAGE) from e
+    print(
+        f"Summary: `{evaluation_result.overall_eval_status}` for Metric:"
+        f" `{metric_name}`. Expected threshold: `{threshold}`, actual value:"
+        f" `{evaluation_result.overall_score}`."
+    )
+
+    data = []
+    for per_invocation_result in evaluation_result.per_invocation_results:
+      data.append({
+          "eval_status": per_invocation_result.eval_status,
+          "score": per_invocation_result.score,
+          "threshold": threshold,
+          "prompt": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.user_content
+          ),
+          "expected_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.expected_invocation.final_response
+          ),
+          "actual_response": AgentEvaluator._convert_content_to_text(
+              per_invocation_result.actual_invocation.final_response
+          ),
+          "expected_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.expected_invocation.intermediate_data
+          ),
+          "actual_tool_calls": AgentEvaluator._convert_tool_calls_to_text(
+              per_invocation_result.actual_invocation.intermediate_data
+          ),
+      })
+
+    print(tabulate(pd.DataFrame(data), headers="keys", tablefmt="grid"))
+    print("\n\n")  # Few empty lines for visual clarity
+
+  @staticmethod
+  def _convert_content_to_text(content: Optional[genai_types.Content]) -> str:
+    if content and content.parts:
+      return "\n".join([p.text for p in content.parts if p.text])
+
+    return ""
+
+  @staticmethod
+  def _convert_tool_calls_to_text(
+      intermediate_data: Optional[IntermediateData],
+  ) -> str:
+    if intermediate_data and intermediate_data.tool_uses:
+      return "\n".join([str(t) for t in intermediate_data.tool_uses])
+
+    return ""
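
For orientation, here is a minimal sketch of how a test could drive the updated evaluator end to end. It is illustrative only: the agent module path and the `.test.json` file are hypothetical placeholders, the import path is assumed to follow the google-adk package layout, and only the names visible in this diff (`AgentEvaluator.evaluate`, `agent_module`, `eval_dataset_file_path_or_dir`, `num_runs`, and the new `print_detailed_results` flag on `evaluate_eval_set`) are taken from the code.

import asyncio

from google.adk.evaluation.agent_evaluator import AgentEvaluator


async def main():
  # Placeholder fixture paths; point these at a real agent package and eval set.
  # With the changes above, every failing metric is collected and reported in a
  # single assertion message, and per-metric detail tables are printed unless
  # print_detailed_results is set to False on evaluate_eval_set.
  await AgentEvaluator.evaluate(
      agent_module="tests.fixture.sample_agent",
      eval_dataset_file_path_or_dir="tests/fixture/sample.test.json",
      num_runs=2,
  )


if __name__ == "__main__":
  asyncio.run(main())

Note that the imports moved into try/except blocks mean pandas, tabulate, and the evaluator modules are only required when an evaluation actually runs or prints details, so having the eval dependencies installed is an assumption of this sketch as well.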