 
 
 class PairwiseMetric:
-    """The Side-by-side(SxS) Pairwise Metric."""
23+ """The Side-by-side(SxS) Pairwise Metric.
24+
25+ A model-based evaluation metric that compares two generative models
26+ side-by-side, and allows users to A/B test their generative models to
27+ determine which model is performing better on the given evaluation task.
28+
29+ For more details on when to use pairwise metrics, see
30+ [Evaluation methods and metrics](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#pointwise_versus_pairwise).
31+
+    Result Details:
+
+      * In `EvalResult.summary_metrics`, win rates are computed for both the
+        baseline and the candidate model, showing the rate at which each model
+        performs better on the given task. The candidate win rate is the number
+        of examples on which the candidate model performs better than the
+        baseline model, divided by the total number of examples, and is a
+        number between 0 and 1.
+
+      * In `EvalResult.metrics_table`, a pairwise metric produces three
+        evaluation results for each row in the dataset:
+          * `pairwise_choice`: an enumeration indicating whether the candidate
+            or the baseline model performed better on that example.
+          * `explanation`: the AutoRater's rationale behind each verdict, given
+            using chain-of-thought reasoning. These explanations help users
+            scrutinize the AutoRater's judgment and build appropriate trust in
+            its decisions.
+          * `confidence`: a score between 0 and 1 that signifies how confident
+            the AutoRater was in its verdict. A score closer to 1 means higher
+            confidence.
+
+    See the [documentation page](https://cloud.google.com/vertex-ai/generative-ai/docs/models/determine-eval#understand-results)
+    for more details on understanding the metric results. The example at the
+    end of the usage section below shows how to access them.
+
+    Usage:
+
+    ```
+    import pandas as pd
+
+    from vertexai.generative_models import GenerativeModel
+    from vertexai.preview.evaluation import EvalTask, PairwiseMetric
+
+    baseline_model = GenerativeModel("gemini-1.0-pro")
+    candidate_model = GenerativeModel("gemini-1.5-pro")
+
+    pairwise_summarization_quality = PairwiseMetric(
+        metric="summarization_quality",
+        baseline_model=baseline_model,
+    )
+
+    eval_task = EvalTask(
+        dataset=pd.DataFrame({
+            "instruction": [...],
+            "context": [...],
+        }),
+        metrics=[pairwise_summarization_quality],
+    )
+
+    pairwise_results = eval_task.evaluate(
+        prompt_template="instruction: {instruction}. context: {context}",
+        model=candidate_model,
+    )
+    ```
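+
+    After evaluation, the results can be inspected, for example (a brief
+    sketch, assuming `evaluate()` returns an `EvalResult` as described in the
+    Result Details section above):
+
+    ```
+    # Aggregate win rates for the baseline and candidate models.
+    print(pairwise_results.summary_metrics)
+
+    # Per-example `pairwise_choice`, `explanation`, and `confidence` results.
+    print(pairwise_results.metrics_table)
+    ```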
83+ """
 
     def __init__(
         self,
@@ -37,8 +97,8 @@ def __init__(
         Args:
             metric: The Side-by-side(SxS) pairwise evaluation metric name.
             baseline_model: The baseline model for the Side-by-side(SxS) comparison.
-            use_reference: Whether to use reference to compute the metric. If specified,
-                the reference column is required in the dataset.
+            use_reference: Whether to use a reference to compute the metric. If
+                set, the `reference` column is required in the dataset (see the
+                example below).
             version: The metric version to use for evaluation.
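+
+        For example, a reference-based variant could be constructed as follows
+        (an illustrative sketch; `baseline_model` is assumed to be defined as
+        in the class-level usage example, and the evaluation dataset must then
+        contain a `reference` column):
+
+        ```
+        referenced_pairwise_metric = PairwiseMetric(
+            metric="summarization_quality",
+            baseline_model=baseline_model,
+            use_reference=True,
+        )
+        ```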
         """
         self._metric = metric
@@ -74,8 +134,8 @@ class CustomMetric:
     Attributes:
         name: The name of the metric.
         metric_function: The evaluation function. Must use the dataset row/instance
-          as the metric_function input. Returns per-instance metric result as a
-          dictionary. The metric score must mapped to the CustomMetric.name as key.
+            as the metric_function input. Returns the per-instance metric result
+            as a dictionary. The metric score must be mapped to CustomMetric.name
+            as its key (see the sketch below).
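+
+    For example, a minimal sketch of a custom metric (the `word_count` name,
+    the `"response"` instance key, and the keyword arguments to the constructor
+    are illustrative assumptions, not a prescribed API):
+
+    ```
+    def word_count_fn(instance: dict) -> dict:
+        # Score each instance by the number of words in the model response,
+        # keyed by the metric name.
+        return {"word_count": len(instance["response"].split())}
+
+    word_count_metric = CustomMetric(
+        name="word_count",
+        metric_function=word_count_fn,
+    )
+    ```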
     """
 
     def __init__(