Skip to content

Commit 9a46e67

Browse files
vertex-sdk-bot authored and copybara-github committed
feat: GenAI Client(evals) - Add retry to predefine metric
PiperOrigin-RevId: 826984457
1 parent 13faa27 commit 9a46e67

File tree

2 files changed

+132
-3
lines changed

2 files changed

+132
-3
lines changed

tests/unit/vertexai/genai/test_evals.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from vertexai._genai import evals
3434
from vertexai._genai import types as vertexai_genai_types
3535
from google.genai import client
36+
from google.genai import errors as genai_errors
3637
from google.genai import types as genai_types
3738
import pandas as pd
3839
import pytest
@@ -4861,6 +4862,110 @@ def test_execute_evaluation_adds_creation_timestamp(
48614862
assert result.metadata is not None
48624863
assert result.metadata.creation_timestamp == mock_now
48634864

4865+
@mock.patch(
    "vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS",
    frozenset(["summarization_quality"]),
)
@mock.patch("time.sleep", return_value=None)
@mock.patch("vertexai._genai.evals.Evals._evaluate_instances")
def test_predefined_metric_retry_on_resource_exhausted(
    self,
    mock_private_evaluate_instances,
    mock_sleep,
    mock_api_client_fixture,
):
    """Verifies a predefined metric retries on 429 and succeeds on a later attempt."""
    df = pd.DataFrame([{"prompt": "Test prompt", "response": "Test response"}])
    eval_dataset = vertexai_genai_types.EvaluationDataset(eval_dataset_df=df)
    metric = vertexai_genai_types.Metric(name="summarization_quality")
    scored_result = vertexai_genai_types.MetricResult(
        score=0.9,
        explanation="Mocked predefined explanation",
        rubric_verdicts=[],
        error=None,
    )
    rate_limit_payload = {
        "error": {
            "code": 429,
            "message": ("Judge model resource exhausted. Please try again later."),
            "status": "RESOURCE_EXHAUSTED",
        }
    }
    # Two transient 429s followed by a success: the handler should swallow
    # the errors, back off, and surface the final result.
    transient_errors = [
        genai_errors.ClientError(code=429, response_json=rate_limit_payload)
        for _ in range(2)
    ]
    mock_private_evaluate_instances.side_effect = transient_errors + [
        vertexai_genai_types.EvaluateInstancesResponse(
            metric_results=[scored_result]
        )
    ]

    result = _evals_common._execute_evaluation(
        api_client=mock_api_client_fixture,
        dataset=eval_dataset,
        metrics=[metric],
    )

    # Three calls total (two failures + one success), one sleep per failure.
    assert mock_private_evaluate_instances.call_count == 3
    assert mock_sleep.call_count == 2
    assert len(result.summary_metrics) == 1
    summary_metric = result.summary_metrics[0]
    assert summary_metric.metric_name == "summarization_quality"
    assert summary_metric.mean_score == 0.9
4917+
4918+
@mock.patch(
    "vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS",
    frozenset(["summarization_quality"]),
)
@mock.patch("time.sleep", return_value=None)
@mock.patch("vertexai._genai.evals.Evals._evaluate_instances")
def test_predefined_metric_retry_fail_on_resource_exhausted(
    self,
    mock_private_evaluate_instances,
    mock_sleep,
    mock_api_client_fixture,
):
    """Verifies retries are exhausted after persistent 429s and the case errors out."""
    df = pd.DataFrame([{"prompt": "Test prompt", "response": "Test response"}])
    eval_dataset = vertexai_genai_types.EvaluationDataset(eval_dataset_df=df)
    rate_limit_payload = {
        "error": {
            "code": 429,
            "message": ("Judge model resource exhausted. Please try again later."),
            "status": "RESOURCE_EXHAUSTED",
        }
    }
    metric = vertexai_genai_types.Metric(name="summarization_quality")
    # Every attempt fails with a 429, so the handler gives up after the
    # maximum number of attempts and records a per-case error instead of
    # raising.
    mock_private_evaluate_instances.side_effect = [
        genai_errors.ClientError(code=429, response_json=rate_limit_payload)
        for _ in range(3)
    ]

    result = _evals_common._execute_evaluation(
        api_client=mock_api_client_fixture,
        dataset=eval_dataset,
        metrics=[metric],
    )

    assert mock_private_evaluate_instances.call_count == 3
    # Sleeps happen between attempts, not after the final failure.
    assert mock_sleep.call_count == 2
    assert len(result.summary_metrics) == 1
    summary_metric = result.summary_metrics[0]
    assert summary_metric.metric_name == "summarization_quality"
    assert summary_metric.mean_score is None
    assert summary_metric.num_cases_error == 1
    error_message = (
        result.eval_case_results[0]
        .response_candidate_results[0]
        .metric_results["summarization_quality"]
        .error_message
    )
    assert "Judge model resource exhausted after 3 retries" in error_message
4968+
48644969

48654970
class TestEvaluationDataset:
48664971
"""Contains set of tests for the EvaluationDataset class methods."""

vertexai/_genai/_evals_metric_handlers.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@
2020
import json
2121
import logging
2222
import statistics
23+
import time
2324
from typing import Any, Callable, Optional, TypeVar, Union
2425

26+
from google.genai import errors as genai_errors
2527
from google.genai import _common
2628
from google.genai import types as genai_types
2729
from tqdm import tqdm
@@ -34,6 +36,7 @@
3436

3537

3638
logger = logging.getLogger(__name__)
39+
_MAX_RETRIES = 3
3740

3841

3942
def _extract_text_from_content(
@@ -964,9 +967,30 @@ def get_metric_result(
964967
metric_name = self.metric.name
965968
try:
966969
payload = self._build_request_payload(eval_case, response_index)
967-
api_response = self.module._evaluate_instances(
968-
metrics=[self.metric], instance=payload.get("instance")
969-
)
970+
for attempt in range(_MAX_RETRIES):
971+
try:
972+
api_response = self.module._evaluate_instances(
973+
metrics=[self.metric], instance=payload.get("instance")
974+
)
975+
break
976+
except genai_errors.ClientError as e:
977+
if e.code == 429:
978+
logger.warning(
979+
"Resource Exhausted error on attempt %d/%d: %s. Retrying in %s"
980+
" seconds...",
981+
attempt + 1,
982+
_MAX_RETRIES,
983+
e,
984+
2**attempt,
985+
)
986+
if attempt == _MAX_RETRIES - 1:
987+
return types.EvalCaseMetricResult(
988+
metric_name=metric_name,
989+
error_message=f"Judge model resource exhausted after {_MAX_RETRIES} retries: {e}",
990+
)
991+
time.sleep(2**attempt)
992+
else:
993+
raise e
970994

971995
if (
972996
api_response

0 commit comments

Comments
 (0)