@@ -15,6 +15,7 @@
 from __future__ import annotations
 
 import importlib.util
+import inspect
 import json
 import logging
 import os
@@ -31,6 +32,7 @@
 from ..evaluation.eval_metrics import EvalMetric
 from ..evaluation.eval_metrics import EvalMetricResult
 from ..evaluation.eval_metrics import EvalMetricResultPerInvocation
+from ..evaluation.eval_metrics import JudgeModelOptions
 from ..evaluation.eval_result import EvalCaseResult
 from ..evaluation.evaluator import EvalStatus
 from ..evaluation.evaluator import Evaluator
@@ -42,6 +44,7 @@
 TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score"
 RESPONSE_MATCH_SCORE_KEY = "response_match_score"
 SAFETY_V1_KEY = "safety_v1"
+FINAL_RESPONSE_MATCH_V2 = "final_response_match_v2"
 # This evaluation is not very stable.
 # This is always optional unless explicitly specified.
 RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score"
@@ -191,10 +194,16 @@ async def run_evals(
       for eval_metric in eval_metrics:
         metric_evaluator = _get_evaluator(eval_metric)
 
-        evaluation_result = metric_evaluator.evaluate_invocations(
-            actual_invocations=inference_result,
-            expected_invocations=eval_case.conversation,
-        )
+        if inspect.iscoroutinefunction(metric_evaluator.evaluate_invocations):
+          evaluation_result = await metric_evaluator.evaluate_invocations(
+              actual_invocations=inference_result,
+              expected_invocations=eval_case.conversation,
+          )
+        else:
+          evaluation_result = metric_evaluator.evaluate_invocations(
+              actual_invocations=inference_result,
+              expected_invocations=eval_case.conversation,
+          )
 
         overall_eval_metric_results.append(
             EvalMetricResult(
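
The new branch lets `run_evals` work with evaluators whose `evaluate_invocations` is either a plain method or a coroutine, awaiting only in the latter case. Below is a minimal, self-contained sketch of the same `inspect.iscoroutinefunction` dispatch; the `SyncEvaluator` and `AsyncEvaluator` classes are made-up stand-ins, not the real ADK evaluators.

```python
import asyncio
import inspect


class SyncEvaluator:
  """Illustrative stand-in for an evaluator with a plain evaluate_invocations."""

  def evaluate_invocations(self, actual_invocations, expected_invocations):
    return {"score": 1.0}


class AsyncEvaluator:
  """Illustrative stand-in for an evaluator that awaits an async judge-model call."""

  async def evaluate_invocations(self, actual_invocations, expected_invocations):
    await asyncio.sleep(0)  # placeholder for an async model call
    return {"score": 0.9}


async def evaluate(evaluator, actual, expected):
  # Same dispatch as in the diff: await only when the bound method is a coroutine function.
  if inspect.iscoroutinefunction(evaluator.evaluate_invocations):
    return await evaluator.evaluate_invocations(
        actual_invocations=actual, expected_invocations=expected
    )
  return evaluator.evaluate_invocations(
      actual_invocations=actual, expected_invocations=expected
  )


async def main():
  print(await evaluate(SyncEvaluator(), [], []))   # {'score': 1.0}
  print(await evaluate(AsyncEvaluator(), [], []))  # {'score': 0.9}


asyncio.run(main())
```

Checking the bound method rather than the call result keeps synchronous evaluators on the fast path and avoids accidentally awaiting a plain return value.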
@@ -260,6 +269,7 @@ async def run_evals(
 
 def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
   try:
+    from ..evaluation.final_response_match_v2 import FinalResponseMatchV2Evaluator
     from ..evaluation.response_evaluator import ResponseEvaluator
     from ..evaluation.safety_evaluator import SafetyEvaluatorV1
     from ..evaluation.trajectory_evaluator import TrajectoryEvaluator
@@ -276,5 +286,8 @@ def _get_evaluator(eval_metric: EvalMetric) -> Evaluator:
     )
   elif eval_metric.metric_name == SAFETY_V1_KEY:
     return SafetyEvaluatorV1(eval_metric)
+  elif eval_metric.metric_name == FINAL_RESPONSE_MATCH_V2:
+    eval_metric.judge_model_options = JudgeModelOptions()
+    return FinalResponseMatchV2Evaluator(eval_metric)
 
   raise ValueError(f"Unsupported eval metric: {eval_metric}")
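
Putting the pieces together, `_get_evaluator` now maps the `final_response_match_v2` key to `FinalResponseMatchV2Evaluator`, attaching default `JudgeModelOptions()` so callers do not have to configure the judge model themselves. A rough usage sketch follows; the `google.adk.cli.cli_eval` module path, the `threshold` value, and the direct use of the private `_get_evaluator` helper are assumptions for illustration only.

```python
from google.adk.cli.cli_eval import _get_evaluator  # module path assumed
from google.adk.evaluation.eval_metrics import EvalMetric

# Request the new LLM-as-judge metric by its key; the threshold is illustrative.
metric = EvalMetric(metric_name="final_response_match_v2", threshold=0.8)

# For this metric, _get_evaluator fills in default JudgeModelOptions() before
# constructing the evaluator, so no judge-model settings are needed here.
evaluator = _get_evaluator(metric)
print(type(evaluator).__name__)  # FinalResponseMatchV2Evaluator
```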