
Commit 5a485b0

Authored by ankursharmas, committed by copybara-github

feat: Adds Rubric based final response evaluator

The evaluator uses a set of rubrics to assess the quality of the agent's final response.

PiperOrigin-RevId: 811154498

1 parent 01923a9 commit 5a485b0

16 files changed: +1969 -24 lines
src/google/adk/evaluation/app_details.py

Lines changed: 14 additions & 0 deletions
@@ -47,3 +47,17 @@ class AppDetails(EvalBaseModel):
       default_factory=dict,
   )
   """A mapping from the agent name to the details of that agent."""
+
+  def get_developer_instructions(self, agent_name: str) -> str:
+    """Returns a string containing the developer instructions."""
+    if agent_name not in self.agent_details:
+      raise ValueError(f"`{agent_name}` not found in the agentic system.")
+
+    return self.agent_details[agent_name].instructions
+
+  def get_tools_by_agent_name(self) -> dict[str, genai_types.ToolListUnion]:
+    """Returns a dictionary of tools available to an agent in the App, keyed to the name of the Agent."""
+    return {
+        name: details.tool_declarations
+        for name, details in self.agent_details.items()
+    }
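
For a concrete picture of what get_tools_by_agent_name returns, here is a self-contained sketch using a stand-in for the per-agent details model; only the instructions and tool_declarations attributes read above are modeled, and the real AgentDetails class is not shown in this diff:

from dataclasses import dataclass, field

from google.genai import types as genai_types


@dataclass
class FakeAgentDetails:
  """Stand-in exposing only the attributes the new helpers read."""
  instructions: str = ""
  tool_declarations: list[genai_types.Tool] = field(default_factory=list)


agent_details = {
    "root_agent": FakeAgentDetails(
        instructions="Answer the user's question politely.",
        tool_declarations=[
            genai_types.Tool(
                function_declarations=[
                    genai_types.FunctionDeclaration(name="get_weather")
                ]
            )
        ],
    )
}

# Mirrors AppDetails.get_tools_by_agent_name(): agent name -> declared tools.
tools_by_agent = {
    name: details.tool_declarations for name, details in agent_details.items()
}
print(tools_by_agent["root_agent"][0].function_declarations[0].name)  # get_weather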

src/google/adk/evaluation/eval_case.py

Lines changed: 51 additions & 0 deletions
@@ -168,3 +168,54 @@ def get_all_tool_calls(
       )

   return tool_calls
+
+
+def get_all_tool_responses(
+    intermediate_data: Optional[IntermediateDataType],
+) -> list[genai_types.FunctionResponse]:
+  """A utility method to retrieve tool responses from intermediate data."""
+  if not intermediate_data:
+    return []
+
+  tool_responses = []
+  if isinstance(intermediate_data, IntermediateData):
+    tool_responses = intermediate_data.tool_responses
+  elif isinstance(intermediate_data, InvocationEvents):
+    # Go over each event in the list of events.
+    for invocation_event in intermediate_data.invocation_events:
+      # Check if the event has content and some parts.
+      if invocation_event.content and invocation_event.content.parts:
+        for p in invocation_event.content.parts:
+          # For each part, check whether it is a function response.
+          if p.function_response:
+            tool_responses.append(p.function_response)
+  else:
+    raise ValueError(
+        f"Unsupported type for intermediate_data `{intermediate_data}`"
+    )
+
+  return tool_responses
+
+
+ToolCallAndResponse: TypeAlias = tuple[
+    genai_types.FunctionCall, Optional[genai_types.FunctionResponse]
+]
+"""A tuple representing a function call and the corresponding optional function response."""
+
+
+def get_all_tool_calls_with_responses(
+    intermediate_data: Optional[IntermediateDataType],
+) -> list[ToolCallAndResponse]:
+  """Returns tool calls with the corresponding responses, if available."""
+  tool_responses_by_call_id: dict[str, genai_types.FunctionResponse] = {
+      tool_response.id: tool_response
+      for tool_response in get_all_tool_responses(intermediate_data)
+  }
+
+  tool_call_and_responses: list[ToolCallAndResponse] = []
+
+  for tool_call in get_all_tool_calls(intermediate_data):
+    response = tool_responses_by_call_id.get(tool_call.id, None)
+    tool_call_and_responses.append((tool_call, response))
+
+  return tool_call_and_responses
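
To make the id-based pairing in get_all_tool_calls_with_responses concrete, here is a small, self-contained sketch of the same join performed on plain genai_types objects; it is illustrative only and bypasses ADK's IntermediateData types:

from typing import Optional

from google.genai import types as genai_types

calls = [
    genai_types.FunctionCall(id="call-1", name="get_weather", args={"city": "Paris"}),
    genai_types.FunctionCall(id="call-2", name="get_time", args={"tz": "UTC"}),
]
responses = [
    # Only the first call got a response; the second stays unmatched.
    genai_types.FunctionResponse(id="call-1", name="get_weather", response={"temp_c": 21}),
]

# Same join strategy as get_all_tool_calls_with_responses: index responses by
# call id, then look each call up, defaulting to None when nothing was recorded.
responses_by_id = {r.id: r for r in responses}
paired: list[tuple[genai_types.FunctionCall, Optional[genai_types.FunctionResponse]]] = [
    (call, responses_by_id.get(call.id)) for call in calls
]

for call, response in paired:
  print(call.name, "->", response.response if response else None)
# get_weather -> {'temp_c': 21}
# get_time -> None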

src/google/adk/evaluation/eval_metrics.py

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,10 @@ class PrebuiltMetrics(Enum):

   FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"

+  RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1 = (
+      "rubric_based_final_response_quality_v1"
+  )
+

 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
 Threshold: TypeAlias = float
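
The new enum value is the name a user would reference when configuring the metric. A minimal sketch, assuming the EvalMetric(metric_name=..., threshold=...) shape used elsewhere in this diff and that any other EvalMetric fields are optional:

from google.adk.evaluation.eval_metrics import EvalMetric, PrebuiltMetrics

# Request the rubric-based final response quality metric with a passing threshold of 0.5.
# Threshold semantics are assumed to match the other LLM-judged metrics (score >= threshold passes).
eval_metric = EvalMetric(
    metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
    threshold=0.5,
)
print(eval_metric.metric_name)  # rubric_based_final_response_quality_v1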

src/google/adk/evaluation/evaluator.py

Lines changed: 6 additions & 0 deletions
@@ -23,6 +23,7 @@
 from .eval_case import Invocation
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalStatus
+from .eval_rubrics import RubricScore

 # Redefining the type here for backward compatibility.
 EvalStatus: TypeAlias = EvalStatus
@@ -35,6 +36,7 @@ class PerInvocationResult(BaseModel):
   expected_invocation: Invocation
   score: Optional[float] = None
   eval_status: EvalStatus = EvalStatus.NOT_EVALUATED
+  rubric_scores: Optional[list[RubricScore]] = None


 class EvaluationResult(BaseModel):
@@ -45,6 +47,10 @@ class EvaluationResult(BaseModel):
   """Overall status, based on each invocation."""

   per_invocation_results: list[PerInvocationResult] = []
+  """Detailed results per invocation."""
+
+  overall_rubric_scores: Optional[list[RubricScore]] = None
+  """Overall rubric scores, based on each invocation."""


 class Evaluator(ABC):

src/google/adk/evaluation/final_response_match_v2.py

Lines changed: 6 additions & 5 deletions
@@ -33,6 +33,7 @@
 from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvaluationResult
 from .evaluator import PerInvocationResult
+from .llm_as_judge import AutoRaterScore
 from .llm_as_judge import LlmAsJudge
 from .llm_as_judge_utils import get_eval_status
 from .llm_as_judge_utils import get_text_from_content
@@ -179,17 +180,17 @@ def format_auto_rater_prompt(
   @override
   def convert_auto_rater_response_to_score(
       self, llm_response: LlmResponse
-  ) -> Optional[float]:
+  ) -> AutoRaterScore:
     response_text = get_text_from_content(llm_response.content)
     if response_text is None:
-      return None
+      return AutoRaterScore()
     label = _parse_critique(response_text)
     if label == Label.VALID:
-      return 1.0
+      return AutoRaterScore(score=1.0)
     elif label == Label.INVALID:
-      return 0.0
+      return AutoRaterScore(score=0.0)
     else:
-      return None
+      return AutoRaterScore()

   @override
   def aggregate_per_invocation_samples(

src/google/adk/evaluation/llm_as_judge.py

Lines changed: 14 additions & 4 deletions
@@ -26,15 +26,22 @@
 from ..models.llm_response import LlmResponse
 from ..models.registry import LLMRegistry
 from ..utils.context_utils import Aclosing
+from .common import EvalBaseModel
 from .eval_case import Invocation
 from .eval_metrics import BaseCriterion
 from .eval_metrics import EvalMetric
+from .eval_metrics import RubricScore
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
 from .evaluator import PerInvocationResult
 from .llm_as_judge_utils import get_eval_status


+class AutoRaterScore(EvalBaseModel):
+  score: Optional[float] = None
+  rubric_scores: Optional[list[RubricScore]] = None
+
+
 class LlmAsJudge(Evaluator):
   """Evaluator based on a LLM.

@@ -82,7 +89,7 @@ def format_auto_rater_prompt(
   @abstractmethod
   def convert_auto_rater_response_to_score(
       self, auto_rater_response: LlmResponse
-  ) -> Optional[float]:
+  ) -> AutoRaterScore:
     """Parses auto_rater_response and returns the corresponding score, or None if the score cannot be determined."""

   @abstractmethod
@@ -126,15 +133,18 @@ async def evaluate_invocations(
         ) as agen:
           async for llm_response in agen:
             # Non-streaming call, so there is only one response content.
-            score = self.convert_auto_rater_response_to_score(llm_response)
+            auto_rater_score = self.convert_auto_rater_response_to_score(
+                llm_response
+            )
             invocation_result_samples.append(
                 PerInvocationResult(
                     actual_invocation=actual,
                     expected_invocation=expected,
-                    score=score,
+                    score=auto_rater_score.score,
                     eval_status=get_eval_status(
-                        score, self._criterion.threshold
+                        auto_rater_score.score, self._eval_metric.threshold
                    ),
+                    rubric_scores=auto_rater_score.rubric_scores,
                )
            )
       if not invocation_result_samples:
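
A short sketch of how the new wrapper type flows into the status computation above; the module paths follow the files in this diff, and the unparsed case surfaces as NOT_EVALUATED because get_eval_status receives a None score:

from google.adk.evaluation.evaluator import EvalStatus
from google.adk.evaluation.llm_as_judge import AutoRaterScore
from google.adk.evaluation.llm_as_judge_utils import get_eval_status

# A judge response that could not be parsed: no score, no rubric scores.
unparsed = AutoRaterScore()
print(get_eval_status(unparsed.score, threshold=0.5))  # EvalStatus.NOT_EVALUATED

# A parsed response with an overall score; rubric_scores stays None for
# evaluators (like final_response_match_v2) that emit no per-rubric detail.
parsed = AutoRaterScore(score=1.0)
print(get_eval_status(parsed.score, threshold=0.5))  # EvalStatus.PASSED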

src/google/adk/evaluation/llm_as_judge_utils.py

Lines changed: 101 additions & 0 deletions
@@ -15,10 +15,17 @@
 from __future__ import annotations

 import enum
+import statistics
 from typing import Optional
+from typing import Union

 from google.genai import types as genai_types

+from .app_details import AppDetails
+from .common import EvalBaseModel
+from .eval_case import get_all_tool_calls_with_responses
+from .eval_case import IntermediateDataType
+from .eval_metrics import RubricScore
 from .evaluator import EvalStatus


@@ -46,3 +53,97 @@ def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus:
   if score is None:
     return EvalStatus.NOT_EVALUATED
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
+
+
+def get_average_rubric_score(
+    rubric_scores: list[RubricScore],
+) -> Optional[float]:
+  """Returns a single score value from the given list of rubric scores.
+
+  It is possible that none of the rubric scores actually contains a score
+  value; if that happens, None is returned.
+
+  If score values are present, their mean is returned as the aggregated
+  value.
+  """
+  rubric_scores = [
+      rubric_score.score
+      for rubric_score in rubric_scores
+      if rubric_score.score is not None
+  ]
+
+  return statistics.mean(rubric_scores) if rubric_scores else None
+
+
+class _ToolDeclarations(EvalBaseModel):
+  """Internal data model used for serializing tool declarations."""
+
+  tool_declarations: dict[str, genai_types.ToolListUnion]
+
+
+def get_tool_declarations_as_json_str(
+    app_details: AppDetails,
+) -> str:
+  """Returns a JSON string representation of tool declarations.
+
+  The output of this method is usually intended to be sent to the LLM.
+  """
+  tool_declarations = _ToolDeclarations(
+      tool_declarations=app_details.get_tools_by_agent_name()
+  )
+  return tool_declarations.model_dump_json(
+      indent=2,
+      exclude_unset=True,
+      exclude_defaults=True,
+      exclude_none=True,
+  )
+
+
+class _ToolCallAndResponse(EvalBaseModel):
+  """Internal data model to capture a single tool call and response."""
+
+  step: int
+  tool_call: genai_types.FunctionCall
+  tool_response: Union[genai_types.FunctionResponse, str]
+
+
+class _ToolCallsAndResponses(EvalBaseModel):
+  """Internal data model used for serializing tool calls and responses."""
+
+  tool_calls_and_response: list[_ToolCallAndResponse]
+
+
+def get_tool_calls_and_responses_as_json_str(
+    intermediate_data: Optional[IntermediateDataType],
+) -> str:
+  """Returns a JSON string representation of tool calls and corresponding responses.
+
+  The output of this method is usually intended to be sent to the LLM.
+  """
+  raw_tool_calls_and_response = get_all_tool_calls_with_responses(
+      intermediate_data
+  )
+
+  if not raw_tool_calls_and_response:
+    return "No intermediate steps were taken."
+
+  tool_calls_and_responses = []
+  for idx, (tool_call, tool_response) in enumerate(raw_tool_calls_and_response):
+    tool_calls_and_responses.append(
+        _ToolCallAndResponse(
+            step=idx,
+            tool_call=tool_call,
+            tool_response=tool_response if tool_response else "None",
+        )
+    )
+
+  internal_tool_calls_and_responses = _ToolCallsAndResponses(
+      tool_calls_and_response=tool_calls_and_responses
+  )
+
+  return internal_tool_calls_and_responses.model_dump_json(
+      indent=2,
+      exclude_unset=True,
+      exclude_defaults=True,
+      exclude_none=True,
+  )
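
The aggregation rule in get_average_rubric_score (drop un-scored rubrics, then take the mean, else None) can be shown standalone; SimpleRubricScore below is a stand-in for the real RubricScore model, of which this diff only shows an optional score field:

import statistics
from dataclasses import dataclass
from typing import Optional


@dataclass
class SimpleRubricScore:
  """Stand-in with only the field get_average_rubric_score reads."""
  score: Optional[float] = None


def average_score(rubric_scores: list[SimpleRubricScore]) -> Optional[float]:
  # Mirrors get_average_rubric_score: ignore rubrics without a score,
  # return None when nothing is scored, otherwise the mean.
  values = [rs.score for rs in rubric_scores if rs.score is not None]
  return statistics.mean(values) if values else None


print(average_score([SimpleRubricScore(1.0), SimpleRubricScore(0.0), SimpleRubricScore()]))  # 0.5
print(average_score([SimpleRubricScore(), SimpleRubricScore()]))  # None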

src/google/adk/evaluation/local_eval_service.py

Lines changed: 11 additions & 4 deletions
@@ -40,6 +40,7 @@
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalMetricResult
+from .eval_metrics import EvalMetricResultDetails
 from .eval_metrics import EvalMetricResultPerInvocation
 from .eval_result import EvalCaseResult
 from .eval_set import EvalCase
@@ -239,12 +240,15 @@ async def _evaluate_single_inference_result(
     )

     # Track overall score across all invocations.
+    eval_metric_result_details = EvalMetricResultDetails(
+        rubric_scores=evaluation_result.overall_rubric_scores
+    )
     overall_eval_metric_results.append(
         EvalMetricResult(
-            metric_name=eval_metric.metric_name,
-            threshold=eval_metric.threshold,
             score=evaluation_result.overall_score,
             eval_status=evaluation_result.overall_eval_status,
+            details=eval_metric_result_details,
+            **eval_metric.model_dump(),
         )
     )

@@ -262,12 +266,15 @@ async def _evaluate_single_inference_result(
         evaluation_result.per_invocation_results,
         eval_metric_result_per_invocation,
     ):
+      eval_metric_result_details = EvalMetricResultDetails(
+          rubric_scores=invocation_result.rubric_scores
+      )
       invocation.eval_metric_results.append(
           EvalMetricResult(
-              metric_name=eval_metric.metric_name,
-              threshold=eval_metric.threshold,
              score=invocation_result.score,
              eval_status=invocation_result.eval_status,
+              details=eval_metric_result_details,
+              **eval_metric.model_dump(),
          )
      )
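
The switch from passing metric_name/threshold explicitly to spreading **eval_metric.model_dump() forwards every field of the metric config into the result object in one step. A simplified pydantic sketch of that pattern, using stand-in models (the real EvalMetric/EvalMetricResult carry more fields):

from typing import Optional

from pydantic import BaseModel


class MiniEvalMetric(BaseModel):
  metric_name: str
  threshold: float


class MiniEvalMetricResult(MiniEvalMetric):
  score: Optional[float] = None
  eval_status: str = "NOT_EVALUATED"


metric = MiniEvalMetric(
    metric_name="rubric_based_final_response_quality_v1", threshold=0.5
)

# Every field of the metric config (name, threshold, ...) is carried over in one go.
result = MiniEvalMetricResult(score=0.75, eval_status="PASSED", **metric.model_dump())
print(result.metric_name, result.threshold)  # rubric_based_final_response_quality_v1 0.5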

src/google/adk/evaluation/metric_evaluator_registry.py

Lines changed: 5 additions & 0 deletions
@@ -24,6 +24,7 @@
 from .evaluator import Evaluator
 from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .response_evaluator import ResponseEvaluator
+from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
 from .safety_evaluator import SafetyEvaluatorV1
 from .trajectory_evaluator import TrajectoryEvaluator

@@ -111,6 +112,10 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
       metric_info=FinalResponseMatchV2Evaluator.get_metric_info(),
       evaluator=FinalResponseMatchV2Evaluator,
   )
+  metric_evaluator_registry.register_evaluator(
+      metric_info=RubricBasedFinalResponseQualityV1Evaluator.get_metric_info(),
+      evaluator=RubricBasedFinalResponseQualityV1Evaluator,
+  )

   return metric_evaluator_registry
