
Commit 0c932b9

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add show method for EvaluationRun class in Vertex AI GenAI SDK evals
PiperOrigin-RevId: 815745954
1 parent d02099c commit 0c932b9

File tree: 3 files changed, 86 additions and 0 deletions (+86 −0)

tests/unit/vertexai/genai/replays/test_get_evaluation_run.py

Lines changed: 33 additions & 0 deletions
@@ -16,6 +16,7 @@
 
 from tests.unit.vertexai.genai.replays import pytest_helper
 from vertexai import types
+from vertexai._genai import _evals_visualization
 import datetime
 import pytest
 
@@ -188,6 +189,38 @@ def check_run_1957799200510967808(
        )
    )
    assert evaluation_run.error is None
+    eval_result = _evals_visualization._get_eval_result_from_eval_run(
+        evaluation_run.evaluation_results
+    )
+    assert isinstance(eval_result, types.EvaluationResult)
+    assert eval_result.summary_metrics == [
+        types.AggregatedMetricResult(
+            metric_name="checkpoint_1/universal",
+            mean_score=0.986633250587865,
+            stdev_score=0.0393092386127714,
+        ),
+        types.AggregatedMetricResult(
+            metric_name="checkpoint_2/universal",
+            mean_score=0.9438178790243048,
+            stdev_score=0.07597187617837561,
+        ),
+        types.AggregatedMetricResult(
+            metric_name="gemini-2.0-flash-001@default/universal",
+            mean_score=0.6943817985685249,
+            stdev_score=0.17738341388587855,
+        ),
+        types.AggregatedMetricResult(
+            metric_name="checkpoint_1/user_defined", mean_score=5, stdev_score=0
+        ),
+        types.AggregatedMetricResult(
+            metric_name="checkpoint_2/user_defined", mean_score=5, stdev_score=0
+        ),
+        types.AggregatedMetricResult(
+            metric_name="gemini-2.0-flash-001@default/user_defined",
+            mean_score=4.736842105263158,
+            stdev_score=0.6359497880839245,
+        ),
+    ]
 
 
 pytestmark = pytest_helper.setup(
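For context, the expected summary metrics above imply that the run's raw metrics are keyed as "<metric_name>/<aggregation>", which is how the new helper in the next file groups them. Below is a hedged reconstruction of part of that raw payload: the values are copied from the assertions, but the key shape is inferred from the helper's code, not taken from the actual replay data.

# Hedged reconstruction (illustrative only) of the shape of
# evaluation_run.evaluation_results.summary_metrics.metrics implied by the
# assertions above -- not the actual replay payload.
implied_raw_metrics = {
    "checkpoint_1/universal/AVERAGE": 0.986633250587865,
    "checkpoint_1/universal/STANDARD_DEVIATION": 0.0393092386127714,
    "checkpoint_1/user_defined/AVERAGE": 5,
    "checkpoint_1/user_defined/STANDARD_DEVIATION": 0,
    "gemini-2.0-flash-001@default/universal/AVERAGE": 0.6943817985685249,
    "gemini-2.0-flash-001@default/universal/STANDARD_DEVIATION": 0.17738341388587855,
}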

vertexai/_genai/_evals_visualization.py

Lines changed: 39 additions & 0 deletions
@@ -727,3 +727,42 @@ def display_evaluation_dataset(eval_dataset_obj: types.EvaluationDataset) -> None:
     dataframe_json_string = json.dumps(processed_rows, ensure_ascii=False, default=str)
     html_content = _get_inference_html(dataframe_json_string)
     display.display(display.HTML(html_content))
+
+
+def _get_eval_result_from_eval_run(
+    results: types.EvaluationRunResults,
+) -> types.EvaluationResult:
+    """Retrieves an EvaluationResult from the resource name."""
+    if (
+        not results
+        or not results.summary_metrics
+        or not results.summary_metrics.metrics
+    ):
+        return types.EvaluationResult()
+
+    aggregated_metrics_dict = {}
+    for name, value in results.summary_metrics.metrics.items():
+        result = name.rsplit("/", 1)
+        full_metric_name = result[0]
+        aggregated_metric_name = result[1]
+        if full_metric_name not in aggregated_metrics_dict:
+            aggregated_metrics_dict[full_metric_name] = {}
+        aggregated_metrics_dict[full_metric_name]["sub_metric_name"] = (
+            full_metric_name.split("/")[-1]
+        )
+        aggregated_metrics_dict[full_metric_name][aggregated_metric_name] = value
+
+    items_sorted = sorted(
+        aggregated_metrics_dict.items(),
+        key=lambda item: (item[1]["sub_metric_name"], item[0]),
+    )
+
+    aggregated_metrics = [
+        types.AggregatedMetricResult(
+            metric_name=name,
+            mean_score=values.get("AVERAGE"),
+            stdev_score=values.get("STANDARD_DEVIATION"),
+        )
+        for name, values in items_sorted
+    ]
+    return types.EvaluationResult(summary_metrics=aggregated_metrics)
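The helper splits each raw key on its last "/" to separate the metric name from the aggregation kind (AVERAGE or STANDARD_DEVIATION), groups values per metric, and sorts by the trailing sub-metric name and then the full metric name. Below is a minimal standalone sketch of that grouping, using a plain dict in place of types.EvaluationRunResults; everything in it is illustrative, not SDK code.

# Standalone illustration of the grouping done by _get_eval_result_from_eval_run.
raw = {
    "checkpoint_1/universal/AVERAGE": 0.99,
    "checkpoint_1/universal/STANDARD_DEVIATION": 0.04,
    "checkpoint_1/user_defined/AVERAGE": 5,
    "checkpoint_1/user_defined/STANDARD_DEVIATION": 0,
}

grouped = {}
for key, value in raw.items():
    # "checkpoint_1/universal/AVERAGE" -> ("checkpoint_1/universal", "AVERAGE")
    metric_name, aggregation = key.rsplit("/", 1)
    grouped.setdefault(metric_name, {})[aggregation] = value

# Sort by sub-metric name ("universal" sorts before "user_defined"), then by
# full metric name -- this matches the order asserted in the test above.
for name, values in sorted(grouped.items(), key=lambda kv: (kv[0].split("/")[-1], kv[0])):
    print(name, values.get("AVERAGE"), values.get("STANDARD_DEVIATION"))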

vertexai/_genai/types.py

Lines changed: 14 additions & 0 deletions
@@ -578,6 +578,20 @@ class EvaluationRun(_common.BaseModel):
         default=None, description="""The results for the evaluation run."""
     )
 
+    def show(self) -> None:
+        """Shows the evaluation result."""
+        from . import _evals_visualization
+
+        if self.state == "SUCCEEDED":
+            eval_result = _evals_visualization._get_eval_result_from_eval_run(
+                self.evaluation_results
+            )
+            _evals_visualization.display_evaluation_result(eval_result, None)
+        else:
+            logger.warning(f"Evaluation Run state: {self.state}.")
+            if self.error:
+                logger.warning(f"Evaluation Run error: {self.error.message}")
+
 
 class EvaluationRunDict(TypedDict, total=False):
     """Represents an evaluation run."""

0 commit comments
