|
33 | 33 | from vertexai._genai import evals |
34 | 34 | from vertexai._genai import types as vertexai_genai_types |
35 | 35 | from google.genai import client |
| 36 | +from google.genai import errors as genai_errors |
36 | 37 | from google.genai import types as genai_types |
37 | 38 | import pandas as pd |
38 | 39 | import pytest |
@@ -4861,6 +4862,110 @@ def test_execute_evaluation_adds_creation_timestamp( |
4861 | 4862 | assert result.metadata is not None |
4862 | 4863 | assert result.metadata.creation_timestamp == mock_now |
4863 | 4864 |
|
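|  | +    # Patch the allowlist so "summarization_quality" is routed through
|  | +    # the predefined-metric handler under test.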
| 4865 | + @mock.patch( |
| 4866 | + "vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS", |
| 4867 | + frozenset(["summarization_quality"]), |
| 4868 | + ) |
| 4869 | + @mock.patch("time.sleep", return_value=None) |
| 4870 | + @mock.patch("vertexai._genai.evals.Evals._evaluate_instances") |
| 4871 | + def test_predefined_metric_retry_on_resource_exhausted( |
| 4872 | + self, |
| 4873 | + mock_private_evaluate_instances, |
| 4874 | + mock_sleep, |
| 4875 | + mock_api_client_fixture, |
| 4876 | + ): |
| 4877 | + dataset_df = pd.DataFrame( |
| 4878 | + [{"prompt": "Test prompt", "response": "Test response"}] |
| 4879 | + ) |
| 4880 | + input_dataset = vertexai_genai_types.EvaluationDataset( |
| 4881 | + eval_dataset_df=dataset_df |
| 4882 | + ) |
| 4883 | + metric = vertexai_genai_types.Metric(name="summarization_quality") |
| 4884 | + metric_result = vertexai_genai_types.MetricResult( |
| 4885 | + score=0.9, |
| 4886 | + explanation="Mocked predefined explanation", |
| 4887 | + rubric_verdicts=[], |
| 4888 | + error=None, |
| 4889 | + ) |
| 4890 | + error_response_json = { |
| 4891 | + "error": { |
| 4892 | + "code": 429, |
| 4893 | + "message": ("Judge model resource exhausted. Please try again later."), |
| 4894 | + "status": "RESOURCE_EXHAUSTED", |
| 4895 | + } |
| 4896 | + } |
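|  | +        # Two 429 ClientErrors followed by a success: the handler is
|  | +        # expected to retry twice, then record the successful result.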
| 4897 | + mock_private_evaluate_instances.side_effect = [ |
| 4898 | + genai_errors.ClientError(code=429, response_json=error_response_json), |
| 4899 | + genai_errors.ClientError(code=429, response_json=error_response_json), |
| 4900 | + vertexai_genai_types.EvaluateInstancesResponse( |
| 4901 | + metric_results=[metric_result] |
| 4902 | + ), |
| 4903 | + ] |
| 4904 | + |
| 4905 | + result = _evals_common._execute_evaluation( |
| 4906 | + api_client=mock_api_client_fixture, |
| 4907 | + dataset=input_dataset, |
| 4908 | + metrics=[metric], |
| 4909 | + ) |
| 4910 | + |
| 4911 | + assert mock_private_evaluate_instances.call_count == 3 |
| 4912 | + assert mock_sleep.call_count == 2 |
| 4913 | + assert len(result.summary_metrics) == 1 |
| 4914 | + summary_metric = result.summary_metrics[0] |
| 4915 | + assert summary_metric.metric_name == "summarization_quality" |
| 4916 | + assert summary_metric.mean_score == 0.9 |
| 4917 | + |
| 4918 | + @mock.patch( |
| 4919 | + "vertexai._genai._evals_metric_handlers._evals_constant.SUPPORTED_PREDEFINED_METRICS", |
| 4920 | + frozenset(["summarization_quality"]), |
| 4921 | + ) |
| 4922 | + @mock.patch("time.sleep", return_value=None) |
| 4923 | + @mock.patch("vertexai._genai.evals.Evals._evaluate_instances") |
| 4924 | + def test_predefined_metric_retry_fail_on_resource_exhausted( |
| 4925 | + self, |
| 4926 | + mock_private_evaluate_instances, |
| 4927 | + mock_sleep, |
| 4928 | + mock_api_client_fixture, |
| 4929 | + ): |
| 4930 | + dataset_df = pd.DataFrame( |
| 4931 | + [{"prompt": "Test prompt", "response": "Test response"}] |
| 4932 | + ) |
| 4933 | + input_dataset = vertexai_genai_types.EvaluationDataset( |
| 4934 | + eval_dataset_df=dataset_df |
| 4935 | + ) |
| 4936 | + error_response_json = { |
| 4937 | + "error": { |
| 4938 | + "code": 429, |
| 4939 | + "message": ("Judge model resource exhausted. Please try again later."), |
| 4940 | + "status": "RESOURCE_EXHAUSTED", |
| 4941 | + } |
| 4942 | + } |
| 4943 | + metric = vertexai_genai_types.Metric(name="summarization_quality") |
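|  | +        # Every attempt raises 429: retries are exhausted and the failure
|  | +        # is recorded per case instead of being raised to the caller.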
| 4944 | + mock_private_evaluate_instances.side_effect = [ |
| 4945 | + genai_errors.ClientError(code=429, response_json=error_response_json), |
| 4946 | + genai_errors.ClientError(code=429, response_json=error_response_json), |
| 4947 | + genai_errors.ClientError(code=429, response_json=error_response_json), |
| 4948 | + ] |
| 4949 | + |
| 4950 | + result = _evals_common._execute_evaluation( |
| 4951 | + api_client=mock_api_client_fixture, |
| 4952 | + dataset=input_dataset, |
| 4953 | + metrics=[metric], |
| 4954 | + ) |
| 4955 | + |
| 4956 | + assert mock_private_evaluate_instances.call_count == 3 |
| 4957 | + assert mock_sleep.call_count == 2 |
| 4958 | + assert len(result.summary_metrics) == 1 |
| 4959 | + summary_metric = result.summary_metrics[0] |
| 4960 | + assert summary_metric.metric_name == "summarization_quality" |
| 4961 | + assert summary_metric.mean_score is None |
| 4962 | + assert summary_metric.num_cases_error == 1 |
|  | 4963 | +        assert (
|  | 4964 | +            "Judge model resource exhausted after 3 retries"
|  | 4965 | +            in result.eval_case_results[0].response_candidate_results[0]
|  | 4966 | +            .metric_results["summarization_quality"].error_message
|  | 4967 | +        )
| 4968 | + |
4864 | 4969 |
|
4865 | 4970 | class TestEvaluationDataset: |
4866 | 4971 | """Contains set of tests for the EvaluationDataset class methods.""" |
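
For context, below is a minimal sketch of the retry pattern these tests pin down. The helper name _call_with_retry, the _MAX_ATTEMPTS constant, and the exponential backoff schedule are illustrative assumptions, not the SDK's actual internals; the tests above only fix the observable behavior: three calls to _evaluate_instances, two sleeps, and a final 429 surfaced as the metric's error_message rather than raised.

# Illustrative sketch only: _call_with_retry, _MAX_ATTEMPTS, and the backoff
# schedule are assumptions; the real SDK code may structure this differently.
import time

from google.genai import errors as genai_errors

_MAX_ATTEMPTS = 3  # Matches the three calls asserted in the tests above.


def _call_with_retry(fn, *args, **kwargs):
    """Calls fn, retrying on HTTP 429 with backoff between attempts."""
    for attempt in range(_MAX_ATTEMPTS):
        try:
            return fn(*args, **kwargs)
        except genai_errors.ClientError as e:
            if e.code != 429 or attempt == _MAX_ATTEMPTS - 1:
                # Out of retries (or not a quota error): re-raise and let the
                # caller record the failure as the metric's error_message.
                raise
            # Backoff between attempts; time.sleep is patched out in the tests.
            time.sleep(2**attempt)

Re-raising on the final attempt lets the per-case error handling upstream convert the exception into an error_message, which is the behavior the failing-path test asserts.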
|