Skip to content

Commit d1f182e

Browse files
ankursharmas and copybara-github
authored and committed
feat: Use LocalEvalService to run all evals in cli and web
We update both the `adk web` run-eval endpoint and the `adk eval` CLI to use LocalEvalService. The old method is marked as deprecated and will be removed in later PRs. PiperOrigin-RevId: 785612708
1 parent 0e173d7 commit d1f182e

File tree

6 files changed

+341
-291
lines changed

6 files changed

+341
-291
lines changed

src/google/adk/cli/cli_eval.py

Lines changed: 70 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,16 @@
2525
from typing import Optional
2626
import uuid
2727

28+
from typing_extensions import deprecated
29+
2830
from ..agents import Agent
2931
from ..artifacts.base_artifact_service import BaseArtifactService
32+
from ..evaluation.base_eval_service import BaseEvalService
33+
from ..evaluation.base_eval_service import EvaluateConfig
34+
from ..evaluation.base_eval_service import EvaluateRequest
35+
from ..evaluation.base_eval_service import InferenceConfig
36+
from ..evaluation.base_eval_service import InferenceRequest
37+
from ..evaluation.base_eval_service import InferenceResult
3038
from ..evaluation.constants import MISSING_EVAL_DEPENDENCIES_MESSAGE
3139
from ..evaluation.eval_case import EvalCase
3240
from ..evaluation.eval_metrics import EvalMetric
@@ -110,26 +118,80 @@ def try_get_reset_func(agent_module_file_path: str) -> Any:
110118

111119

112120
def parse_and_get_evals_to_run(
113-
eval_set_file_path: tuple[str],
121+
evals_to_run_info: list[str],
114122
) -> dict[str, list[str]]:
115-
"""Returns a dictionary of eval sets to evals that should be run."""
123+
"""Returns a dictionary of eval set info to evals that should be run.
124+
125+
Args:
126+
evals_to_run_info: While the structure is quite simple, a list of strings,
127+
each string actually is formatted with the following convention:
128+
<eval_set_file_path | eval_set_id>:[comma separated eval case ids]
129+
"""
116130
eval_set_to_evals = {}
117-
for input_eval_set in eval_set_file_path:
131+
for input_eval_set in evals_to_run_info:
118132
evals = []
119133
if ":" not in input_eval_set:
120-
eval_set_file = input_eval_set
134+
# We don't have any eval cases specified. This would be the case where the
135+
# user wants to run all eval cases in the eval set.
136+
eval_set = input_eval_set
121137
else:
122-
eval_set_file = input_eval_set.split(":")[0]
138+
# There are eval cases that we need to parse. The user wants to run
139+
# specific eval cases from the eval set.
140+
eval_set = input_eval_set.split(":")[0]
123141
evals = input_eval_set.split(":")[1].split(",")
142+
evals = [s for s in evals if s.strip()]
124143

125-
if eval_set_file not in eval_set_to_evals:
126-
eval_set_to_evals[eval_set_file] = []
144+
if eval_set not in eval_set_to_evals:
145+
eval_set_to_evals[eval_set] = []
127146

128-
eval_set_to_evals[eval_set_file].extend(evals)
147+
eval_set_to_evals[eval_set].extend(evals)
129148

130149
return eval_set_to_evals
131150

132151

152+
async def _collect_inferences(
153+
inference_requests: list[InferenceRequest],
154+
eval_service: BaseEvalService,
155+
) -> list[InferenceResult]:
156+
"""Simple utility method to collect inferences from an eval service.
157+
158+
The method is intentionally kept private to prevent general usage.
159+
"""
160+
inference_results = []
161+
for inference_request in inference_requests:
162+
async for inference_result in eval_service.perform_inference(
163+
inference_request=inference_request
164+
):
165+
inference_results.append(inference_result)
166+
return inference_results
167+
168+
169+
async def _collect_eval_results(
170+
inference_results: list[InferenceResult],
171+
eval_service: BaseEvalService,
172+
eval_metrics: list[EvalMetric],
173+
) -> list[EvalCaseResult]:
174+
"""Simple utility method to collect eval results from an eval service.
175+
176+
The method is intentionally kept private to prevent general usage.
177+
"""
178+
eval_results = []
179+
evaluate_request = EvaluateRequest(
180+
inference_results=inference_results,
181+
evaluate_config=EvaluateConfig(eval_metrics=eval_metrics),
182+
)
183+
async for eval_result in eval_service.evaluate(
184+
evaluate_request=evaluate_request
185+
):
186+
eval_results.append(eval_result)
187+
188+
return eval_results
189+
190+
191+
@deprecated(
192+
"This method is deprecated and will be removed in a future release. Please"
193+
" use LocalEvalService to define your custom evals."
194+
)
133195
async def run_evals(
134196
eval_cases_by_eval_set_id: dict[str, list[EvalCase]],
135197
root_agent: Agent,

src/google/adk/cli/cli_tools_click.py

Lines changed: 133 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def cli_run(
272272
exists=True, dir_okay=True, file_okay=False, resolve_path=True
273273
),
274274
)
275-
@click.argument("eval_set_file_path", nargs=-1)
275+
@click.argument("eval_set_file_path_or_id", nargs=-1)
276276
@click.option("--config_file_path", help="Optional. The path to config file.")
277277
@click.option(
278278
"--print_detailed_results",
@@ -292,7 +292,7 @@ def cli_run(
292292
)
293293
def cli_eval(
294294
agent_module_file_path: str,
295-
eval_set_file_path: list[str],
295+
eval_set_file_path_or_id: list[str],
296296
config_file_path: str,
297297
print_detailed_results: bool,
298298
eval_storage_uri: Optional[str] = None,
@@ -302,40 +302,75 @@ def cli_eval(
302302
AGENT_MODULE_FILE_PATH: The path to the __init__.py file that contains a
303303
module by the name "agent". "agent" module contains a root_agent.
304304
305-
EVAL_SET_FILE_PATH: You can specify one or more eval set file paths.
305+
EVAL_SET_FILE_PATH_OR_ID: You can specify one or more eval set file paths or
306+
eval set ids.
306307
308+
Mixing of eval set file paths with eval set ids is not allowed.
309+
310+
*Eval Set File Path*
307311
For each file, all evals will be run by default.
308312
309313
If you want to run only specific evals from an eval set, first create a comma
310314
separated list of eval names and then add that as a suffix to the eval set
311315
file name, demarcated by a `:`.
312316
313-
For example,
317+
For example, we have a `sample_eval_set_file.json` file that has the following
318+
eval cases:
319+
sample_eval_set_file.json:
320+
|....... eval_1
321+
|....... eval_2
322+
|....... eval_3
323+
|....... eval_4
324+
|....... eval_5
314325
315326
sample_eval_set_file.json:eval_1,eval_2,eval_3
316327
317328
This will only run eval_1, eval_2 and eval_3 from sample_eval_set_file.json.
318329
330+
*Eval Set Id*
331+
For each eval set, all evals will be run by default.
332+
333+
If you want to run only specific evals from an eval set, first create a comma
334+
separated list of eval names and then add that as a suffix to the eval set
335+
id, demarcated by a `:`.
336+
337+
For example, we have `sample_eval_set_id` that has the following eval cases:
338+
sample_eval_set_id:
339+
|....... eval_1
340+
|....... eval_2
341+
|....... eval_3
342+
|....... eval_4
343+
|....... eval_5
344+
345+
If we did:
346+
sample_eval_set_id:eval_1,eval_2,eval_3
347+
348+
This will only run eval_1, eval_2 and eval_3 from sample_eval_set_id.
349+
319350
CONFIG_FILE_PATH: The path to config file.
320351
321352
PRINT_DETAILED_RESULTS: Prints detailed results on the console.
322353
"""
323354
envs.load_dotenv_for_agent(agent_module_file_path, ".")
324355

325356
try:
357+
from ..evaluation.base_eval_service import InferenceConfig
358+
from ..evaluation.base_eval_service import InferenceRequest
359+
from ..evaluation.eval_metrics import EvalMetric
360+
from ..evaluation.eval_result import EvalCaseResult
361+
from ..evaluation.evaluator import EvalStatus
362+
from ..evaluation.in_memory_eval_sets_manager import InMemoryEvalSetsManager
363+
from ..evaluation.local_eval_service import LocalEvalService
326364
from ..evaluation.local_eval_set_results_manager import LocalEvalSetResultsManager
327365
from ..evaluation.local_eval_sets_manager import load_eval_set_from_file
328-
from ..sessions.in_memory_session_service import InMemorySessionService
329-
from .cli_eval import EvalCaseResult
330-
from .cli_eval import EvalMetric
331-
from .cli_eval import EvalStatus
366+
from ..evaluation.local_eval_sets_manager import LocalEvalSetsManager
367+
from .cli_eval import _collect_eval_results
368+
from .cli_eval import _collect_inferences
332369
from .cli_eval import get_evaluation_criteria_or_default
333370
from .cli_eval import get_root_agent
334371
from .cli_eval import parse_and_get_evals_to_run
335-
from .cli_eval import run_evals
336-
from .cli_eval import try_get_reset_func
337-
except ModuleNotFoundError:
338-
raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE)
372+
except ModuleNotFoundError as mnf:
373+
raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
339374

340375
evaluation_criteria = get_evaluation_criteria_or_default(config_file_path)
341376
eval_metrics = []
@@ -347,80 +382,103 @@ def cli_eval(
347382
print(f"Using evaluation criteria: {evaluation_criteria}")
348383

349384
root_agent = get_root_agent(agent_module_file_path)
350-
reset_func = try_get_reset_func(agent_module_file_path)
351-
352-
gcs_eval_sets_manager = None
385+
app_name = os.path.basename(agent_module_file_path)
386+
agents_dir = os.path.dirname(agent_module_file_path)
387+
eval_sets_manager = None
353388
eval_set_results_manager = None
389+
354390
if eval_storage_uri:
355391
gcs_eval_managers = evals.create_gcs_eval_managers_from_uri(
356392
eval_storage_uri
357393
)
358-
gcs_eval_sets_manager = gcs_eval_managers.eval_sets_manager
394+
eval_sets_manager = gcs_eval_managers.eval_sets_manager
359395
eval_set_results_manager = gcs_eval_managers.eval_set_results_manager
360396
else:
361-
eval_set_results_manager = LocalEvalSetResultsManager(
362-
agents_dir=os.path.dirname(agent_module_file_path)
363-
)
364-
eval_set_file_path_to_evals = parse_and_get_evals_to_run(eval_set_file_path)
365-
eval_set_id_to_eval_cases = {}
366-
367-
# Read the eval_set files and get the cases.
368-
for eval_set_file_path, eval_case_ids in eval_set_file_path_to_evals.items():
369-
if gcs_eval_sets_manager:
370-
eval_set = gcs_eval_sets_manager._load_eval_set_from_blob(
371-
eval_set_file_path
372-
)
373-
if not eval_set:
397+
eval_set_results_manager = LocalEvalSetResultsManager(agents_dir=agents_dir)
398+
399+
inference_requests = []
400+
eval_set_file_or_id_to_evals = parse_and_get_evals_to_run(
401+
eval_set_file_path_or_id
402+
)
403+
404+
# Check if the first entry is a file that exists, if it does then we assume
405+
# rest of the entries are also files. We enforce this assumption in the if
406+
# block.
407+
if eval_set_file_or_id_to_evals and os.path.exists(
408+
list(eval_set_file_or_id_to_evals.keys())[0]
409+
):
410+
eval_sets_manager = InMemoryEvalSetsManager()
411+
412+
# Read the eval_set files and get the cases.
413+
for (
414+
eval_set_file_path,
415+
eval_case_ids,
416+
) in eval_set_file_or_id_to_evals.items():
417+
try:
418+
eval_set = load_eval_set_from_file(
419+
eval_set_file_path, eval_set_file_path
420+
)
421+
except FileNotFoundError as fne:
374422
raise click.ClickException(
375-
f"Eval set {eval_set_file_path} not found in GCS."
423+
f"`{eval_set_file_path}` should be a valid eval set file."
424+
) from fne
425+
426+
eval_sets_manager.create_eval_set(
427+
app_name=app_name, eval_set_id=eval_set.eval_set_id
428+
)
429+
for eval_case in eval_set.eval_cases:
430+
eval_sets_manager.add_eval_case(
431+
app_name=app_name,
432+
eval_set_id=eval_set.eval_set_id,
433+
eval_case=eval_case,
376434
)
377-
else:
378-
eval_set = load_eval_set_from_file(eval_set_file_path, eval_set_file_path)
379-
eval_cases = eval_set.eval_cases
380-
381-
if eval_case_ids:
382-
# There are eval_ids that we should select.
383-
eval_cases = [
384-
e for e in eval_set.eval_cases if e.eval_id in eval_case_ids
385-
]
386-
387-
eval_set_id_to_eval_cases[eval_set.eval_set_id] = eval_cases
388-
389-
async def _collect_eval_results() -> list[EvalCaseResult]:
390-
session_service = InMemorySessionService()
391-
eval_case_results = []
392-
async for eval_case_result in run_evals(
393-
eval_set_id_to_eval_cases,
394-
root_agent,
395-
reset_func,
396-
eval_metrics,
397-
session_service=session_service,
398-
):
399-
eval_case_result.session_details = await session_service.get_session(
400-
app_name=os.path.basename(agent_module_file_path),
401-
user_id=eval_case_result.user_id,
402-
session_id=eval_case_result.session_id,
435+
inference_requests.append(
436+
InferenceRequest(
437+
app_name=app_name,
438+
eval_set_id=eval_set.eval_set_id,
439+
eval_case_ids=eval_case_ids,
440+
inference_config=InferenceConfig(),
441+
)
442+
)
443+
else:
444+
# We assume that what we have are eval set ids instead.
445+
eval_sets_manager = (
446+
eval_sets_manager
447+
if eval_storage_uri
448+
else LocalEvalSetsManager(agents_dir=agents_dir)
449+
)
450+
451+
for eval_set_id_key, eval_case_ids in eval_set_file_or_id_to_evals.items():
452+
inference_requests.append(
453+
InferenceRequest(
454+
app_name=app_name,
455+
eval_set_id=eval_set_id_key,
456+
eval_case_ids=eval_case_ids,
457+
inference_config=InferenceConfig(),
458+
)
403459
)
404-
eval_case_results.append(eval_case_result)
405-
return eval_case_results
406460

407461
try:
408-
eval_results = asyncio.run(_collect_eval_results())
409-
except ModuleNotFoundError:
410-
raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE)
411-
412-
# Write eval set results.
413-
eval_set_id_to_eval_results = collections.defaultdict(list)
414-
for eval_case_result in eval_results:
415-
eval_set_id = eval_case_result.eval_set_id
416-
eval_set_id_to_eval_results[eval_set_id].append(eval_case_result)
417-
418-
for eval_set_id, eval_case_results in eval_set_id_to_eval_results.items():
419-
eval_set_results_manager.save_eval_set_result(
420-
app_name=os.path.basename(agent_module_file_path),
421-
eval_set_id=eval_set_id,
422-
eval_case_results=eval_case_results,
462+
eval_service = LocalEvalService(
463+
root_agent=root_agent,
464+
eval_sets_manager=eval_sets_manager,
465+
eval_set_results_manager=eval_set_results_manager,
466+
)
467+
468+
inference_results = asyncio.run(
469+
_collect_inferences(
470+
inference_requests=inference_requests, eval_service=eval_service
471+
)
472+
)
473+
eval_results = asyncio.run(
474+
_collect_eval_results(
475+
inference_results=inference_results,
476+
eval_service=eval_service,
477+
eval_metrics=eval_metrics,
478+
)
423479
)
480+
except ModuleNotFoundError as mnf:
481+
raise click.ClickException(MISSING_EVAL_DEPENDENCIES_MESSAGE) from mnf
424482

425483
print("*********************************************************************")
426484
eval_run_summary = {}
@@ -1023,7 +1081,8 @@ def cli_deploy_agent_engine(
10231081
Example:
10241082
10251083
adk deploy agent_engine --project=[project] --region=[region]
1026-
--staging_bucket=[staging_bucket] --display_name=[app_name] path/to/my_agent
1084+
--staging_bucket=[staging_bucket] --display_name=[app_name]
1085+
path/to/my_agent
10271086
"""
10281087
try:
10291088
cli_deploy.to_agent_engine(

0 commit comments

Comments
 (0)