 )


-# TODO(b/431231205): Re-enable once Unified Metrics are in prod.
-# def test_create_eval_run_data_source_evaluation_set(client):
-#     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
-#     client._api_client._http_options.api_version = "v1beta1"
-#     tool = genai_types.Tool(
-#         function_declarations=[
-#             genai_types.FunctionDeclaration(
-#                 name="get_weather",
-#                 description="Get weather in a location",
-#                 parameters={
-#                     "type": "object",
-#                     "properties": {"location": {"type": "string"}},
-#                 },
-#             )
-#         ]
-#     )
-#     evaluation_run = client.evals.create_evaluation_run(
-#         name="test4",
-#         display_name="test4",
-#         dataset=types.EvaluationRunDataSource(
-#             evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#         ),
-#         dest=GCS_DEST,
-#         metrics=[
-#             UNIVERSAL_AR_METRIC,
-#             types.RubricMetric.FINAL_RESPONSE_QUALITY,
-#             LLM_METRIC
-#         ],
-#         agent_info=types.AgentInfo(
-#             agent="project/123/locations/us-central1/reasoningEngines/456",
-#             name="agent-1",
-#             instruction="agent-1 instruction",
-#             tool_declarations=[tool],
-#         ),
-#         labels={"label1": "value1"},
-#     )
-#     assert isinstance(evaluation_run, types.EvaluationRun)
-#     assert evaluation_run.display_name == "test4"
-#     assert evaluation_run.state == types.EvaluationRunState.PENDING
-#     assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
-#     assert evaluation_run.data_source.evaluation_set == (
-#         "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
-#     )
-#     assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
-#         output_config=genai_types.OutputConfig(
-#             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
-#         ),
-#         metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
-#     )
-#     assert evaluation_run.inference_configs[
-#         "agent-1"
-#     ] == types.EvaluationRunInferenceConfig(
-#         agent_config=types.EvaluationRunAgentConfig(
-#             developer_instruction=genai_types.Content(
-#                 parts=[genai_types.Part(text="agent-1 instruction")]
-#             ),
-#             tools=[tool],
-#         )
-#     )
-#     assert evaluation_run.labels == {
-#         "vertex-ai-evaluation-agent-engine-id": "456",
-#         "label1": "value1",
-#     }
-#     assert evaluation_run.error is None
+def test_create_eval_run_data_source_evaluation_set(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun."""
+    client._api_client._http_options.api_version = "v1beta1"
+    tool = genai_types.Tool(
+        function_declarations=[
+            genai_types.FunctionDeclaration(
+                name="get_weather",
+                description="Get weather in a location",
+                parameters={
+                    "type": "object",
+                    "properties": {"location": {"type": "string"}},
+                },
+            )
+        ]
+    )
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test4",
+        display_name="test4",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+        ),
+        dest=GCS_DEST,
+        metrics=[
+            UNIVERSAL_AR_METRIC,
+            types.RubricMetric.FINAL_RESPONSE_QUALITY,
+            LLM_METRIC,
+        ],
+        agent_info=types.evals.AgentInfo(
+            agent="project/123/locations/us-central1/reasoningEngines/456",
+            name="agent-1",
+            instruction="agent-1 instruction",
+            tool_declarations=[tool],
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test4"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert evaluation_run.data_source.evaluation_set == (
+        "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[UNIVERSAL_AR_METRIC, FINAL_RESPONSE_QUALITY_METRIC, LLM_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        "agent-1"
+    ] == types.EvaluationRunInferenceConfig(
+        agent_config=types.EvaluationRunAgentConfig(
+            developer_instruction=genai_types.Content(
+                parts=[genai_types.Part(text="agent-1 instruction")]
+            ),
+            tools=[tool],
+        )
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None


 def test_create_eval_run_data_source_bigquery_request_set(client):
@@ -132,6 +131,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         ),
         labels={"label1": "value1"},
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test5"
@@ -152,6 +152,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.inference_configs is None
     assert evaluation_run.labels == {
@@ -160,7 +161,7 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None


-# Test fails in replay mode because of the timestamp issue
+# Test fails in replay mode because of UUID generation mismatch.
 # def test_create_eval_run_data_source_evaluation_dataset(client):
 #     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with EvaluationDataset."""
 #     input_df = pd.DataFrame(
@@ -215,7 +216,8 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
 #             candidate_name="candidate_1",
 #             eval_dataset_df=input_df,
 #         ),
-#         dest="gs://lakeyk-limited-bucket/eval_run_output",
+#         dest=GCS_DEST,
+#         metrics=[UNIVERSAL_AR_METRIC],
 #     )
 #     assert isinstance(evaluation_run, types.EvaluationRun)
 #     assert evaluation_run.display_name == "test6"
@@ -276,6 +278,7 @@ async def test_create_eval_run_async(client):
             )
         ),
         dest=GCS_DEST,
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert isinstance(evaluation_run, types.EvaluationRun)
     assert evaluation_run.display_name == "test8"
@@ -292,6 +295,7 @@ async def test_create_eval_run_async(client):
         output_config=genai_types.OutputConfig(
             gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
         ),
+        metrics=[UNIVERSAL_AR_METRIC],
     )
     assert evaluation_run.error is None
     assert evaluation_run.inference_configs is None
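
For anyone re-running these tests locally, below is a minimal sketch of the call shape this diff re-enables, stripped of the replay harness. It only uses names already visible in the diff: the resource strings are placeholders, `metrics` stands in for the module-level constants (UNIVERSAL_AR_METRIC, LLM_METRIC, ...), and `types` is assumed to be the same alias the test module already imports.

```python
# Sketch only -- mirrors the create_evaluation_run() shape exercised above.
# Assumptions: `client` is an initialized Vertex AI client exposing `client.evals`,
# and `types` is the alias already imported by this test module.
EVAL_SET_NAME = (  # placeholder resource name
    "projects/PROJECT_NUMBER/locations/us-central1/evaluationSets/EVAL_SET_ID"
)
OUTPUT_PREFIX = "gs://your-bucket/eval_run_output"  # placeholder GCS prefix


def create_example_run(client, metrics):
    # `metrics` stands in for the module-level metric constants
    # (UNIVERSAL_AR_METRIC, LLM_METRIC, ...) that the tests now pass explicitly.
    return client.evals.create_evaluation_run(
        name="example-run",
        display_name="example-run",
        dataset=types.EvaluationRunDataSource(evaluation_set=EVAL_SET_NAME),
        dest=OUTPUT_PREFIX,
        metrics=metrics,
    )
```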