
Commit 352a2e0

Benchmark simplification (#42408)
* Renames
* Added the timestamps to request
* Better rename for prompt_ids
* Merged the two timing functions
* Style
* Remove the first timestamp for generate timing
* Fix nit in comment
* Re-introduce timestamps
* Now upload two versions of the results: full and summarized
* Make summarized result more summarized
* Fix wrong file name
* Dumb fix
1 parent 7094f1e commit 352a2e0

6 files changed: +178 -157 lines changed

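The central change below is that the separate continuous-batching and standard timing helpers are merged into a single time_generate, which now returns one list of token timestamps per request, normalized against the wall-clock start of generation. A minimal illustrative sketch of just that normalization step (the numeric values are hypothetical stand-ins for time.perf_counter() readings):

import torch

# Hypothetical absolute timestamps for two requests with three generated tokens each
wall_time_0 = 1000.0
timestamps = [
    [1000.1, 1000.2, 1000.3],
    [1000.1, 1000.3, 1000.5],
]

# Same normalization as the merged time_generate below: shift by the generation start time
relative = torch.tensor(timestamps).sub(wall_time_0).tolist()
print(relative)  # roughly [[0.1, 0.2, 0.3], [0.1, 0.3, 0.5]], up to float32 precision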

benchmark_v2/framework/benchmark_runner.py

Lines changed: 77 additions & 93 deletions
@@ -10,6 +10,7 @@
 from queue import Queue
 from typing import Any
 
+import numpy as np
 import torch
 from datasets import Dataset
 from huggingface_hub import HfApi
@@ -208,10 +209,11 @@ def run_benchmark(
         self.logger.info(f"Running benchmark scenario: {config.name}")
 
         # Quick validation: try one measurement first to see if this scenario works
-        generate_fn = self.time_generate_batch if config.continuous_batching else self.time_generate
         flush_memory()
-        e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
-            max_new_tokens=1, gpu_monitor=None
+        e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics = self.time_generate(
+            max_new_tokens=config.num_tokens_to_generate,
+            use_continuous_batching=config.continuous_batching,
+            gpu_monitor=None,
         )
         if e2e_latency < 0:
             self.logger.warning(f"Skipping config {config.name}: {e2e_latency = } (no GPU monitoring)")
@@ -220,18 +222,23 @@ def run_benchmark(
         # Warmup runs
         self.logger.info(f"Warming up with {config.warmup_iterations} iterations...")
         for _ in trange(config.warmup_iterations, desc="Warmup"):
-            _ = generate_fn(max_new_tokens=config.num_tokens_to_generate)
+            _ = self.time_generate(
+                max_new_tokens=config.num_tokens_to_generate,
+                use_continuous_batching=config.continuous_batching,
+                gpu_monitor=None,
+            )
         self.logger.info("Warmup over.")
 
         # Measurement runs
         result = BenchmarkResult()
         self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
         for _ in trange(config.measurement_iterations, desc="Benchmarking"):
-            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
+            e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics = self.time_generate(
                 max_new_tokens=config.num_tokens_to_generate,
+                use_continuous_batching=config.continuous_batching,
                 gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
             )
-            result.accumulate(e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics)
+            result.accumulate(e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics)
         self.logger.info("Benchmarking done. Cleaning up.")
 
         # Profile if needed
@@ -249,75 +256,50 @@ def run_benchmark(
             "config": config,
         }
 
-    # TODO: refactor `generate_batch` to handle streaming so we can use it here
-    def time_generate_batch(
-        self,
-        max_new_tokens: int,
-        gpu_monitor: GPUMonitor | None = None,
-    ) -> tuple[float, list[float], str, GPURawMetrics | None]:
-        if gpu_monitor is not None:
-            gpu_monitor.start()
-        # Prepare inputs
-        inputs = self.inputs["input_ids"].tolist()
-        timestamps = []
-        last_result_generated_tokens = None
-        wall_time_0 = time.perf_counter()
-        # We disable prefix sharing because all prompts are the same
-        with self.model.continuous_batching_context_manager(allow_prefix_sharing=False) as manager:
-            manager.add_requests(inputs, max_new_tokens=max_new_tokens, streaming=True)
-            unfinished_requests = len(inputs)
-            while unfinished_requests > 0:
-                # NOTE: I don't like having the extra if stmt here, but hopefully won't degrade perf too much
-                result = manager.get_result()
-                if result is not None:
-                    timestamps.append(time.perf_counter() - wall_time_0)  # FIXME: the timestamps are wrong
-                    if result.is_finished():
-                        last_result_generated_tokens = result.generated_tokens
-                        unfinished_requests -= 1
-                elif not manager.is_running():
-                    raise RuntimeError("Generation thread exited unexpectedly")
-        # Post-processing
-        wall_time_1 = time.perf_counter()
-        e2e_latency = wall_time_1 - wall_time_0
-        gpu_metrics = gpu_monitor.stop_and_collect() if gpu_monitor is not None else None
-        decoded_output = self.tokenizer.decode(last_result_generated_tokens, skip_special_tokens=True)
-        shape_and_decoded_output = f"{(1, len(last_result_generated_tokens))} | {decoded_output}"
-        return e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics
-
     def time_generate(
         self,
         max_new_tokens: int,
+        use_continuous_batching: bool = False,
         gpu_monitor: GPUMonitor | None = None,
     ) -> tuple[float, list[float], str, GPURawMetrics | None]:
-        """Time the latency of a call to model.generate() with the given (inputs) and (max_new_tokens)."""
         # Prepare gpu monitoring if needed
         if gpu_monitor is not None:
             gpu_monitor.start()
-        # Prepare streamer
-        streamer = BenchmarkStreamer()
+
         # Generate and time
-        wall_time_0 = time.perf_counter()
-        outputs = self.model.generate(
-            **self.inputs,
-            max_new_tokens=max_new_tokens,
-            streamer=streamer,
-        )
+        if use_continuous_batching:
+            inputs = self.inputs["input_ids"].tolist()
+            wall_time_0 = time.perf_counter()
+            results = self.model.generate_batch(inputs, allow_prefix_sharing=False, record_timestamps=True)
+        else:
+            streamer = BenchmarkStreamer()
+            wall_time_0 = time.perf_counter()
+            results = self.model.generate(**self.inputs, streamer=streamer)
+
         wall_time_1 = time.perf_counter()
-        # Stop gpu monitoring if needed
         gpu_metrics = gpu_monitor.stop_and_collect() if gpu_monitor is not None else None
-        # Check if generation had the right number of tokens
+
+        # Retrieve timestamps and results in a way that allows similar post-processing
         input_tokens = self.inputs["input_ids"].size(-1)
-        batch_size, output_tokens = outputs.shape
-        new_tokens = output_tokens - input_tokens
-        if new_tokens != max_new_tokens:
-            raise RuntimeError(f"Generated {new_tokens} tokens, expected {max_new_tokens}")
+        if use_continuous_batching:
+            timestamps = [result.timestamps for result in results.values()]
+            results = torch.tensor([result.generated_tokens for result in results.values()])
+        else:
+            timestamps = [streamer.timestamps[1:]]  # skip the first timestamp because it's the input tokens
+            results = results[:, input_tokens:]
+
+        # Check if generation had the right number of tokens
+        if results.size(-1) != max_new_tokens:
+            raise RuntimeError(f"Generated {results.size(-1)} tokens, expected {max_new_tokens}")
+
         # Decode outputs
-        decoded_output = self.tokenizer.decode(outputs[0, input_tokens:], skip_special_tokens=True)
-        shape_and_decoded_output = f"{tuple(outputs.shape)} | {decoded_output}"
-        # Compute intermediate quantities
+        decoded_output = self.tokenizer.decode(results[0], skip_special_tokens=True)
+        shape_and_decoded_output = f"{tuple(results.shape)} | {decoded_output}"
+
+        # Compute metrics
         e2e_latency = wall_time_1 - wall_time_0
-        token_generation_times = [t - wall_time_0 for t in streamer.timestamps[1:]]
-        return e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics
+        timestamps = torch.tensor(timestamps).sub(wall_time_0).tolist()
+        return e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics
 
     def profile_generate(self, num_tokens_to_profile: int, config_name: str) -> None:
         """Profile the latency of a call to model.generate() with the given (inputs) and (max_new_tokens)."""
@@ -431,36 +413,38 @@ def push_results_to_hub(self, dataset_id: str, results: dict[Any, Any], timestam
                 "PUSH_TO_HUB_TOKEN is not set, cannot push results to the Hub. When setting dataset_id, please also set the PUSH_TO_HUB_TOKEN environment variable."
             )
 
+        api = HfApi()
         n_results = len(results)
-        self.logger.info(f"Pushing {n_results} results to: {dataset_id}")
-        rows = []
-        for cfg_hash, entry in results.items():
-            row = {
-                "benchmark_config_hash": cfg_hash,
-                "config": entry["config"].to_dict(),
-                "measurements": entry["measurements"].to_dict(),
-                "metadata": entry["metadata"].to_dict(),
-            }
-            rows.append(row)
-
-        ds = Dataset.from_list(rows)
-        with tempfile.TemporaryDirectory() as tmp:
-            jsonl_path = os.path.join(tmp, "data.jsonl")
-            with open(jsonl_path, "w") as f:
-                json_lines = []
-                for ex in ds:
-                    json_lines.append(json.dumps(ex, ensure_ascii=False))
-                f.write("\n".join(json_lines))
-
-            api = HfApi()
-            # NOTE: we expect the repository to already exist
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if not timestamp else timestamp
-            file_name = f"benchmark_run_{timestamp}.jsonl"
-            api.upload_file(
-                path_or_fileobj=jsonl_path,
-                path_in_repo=file_name,
-                repo_id=dataset_id,
-                repo_type="dataset",
-                token=PUSH_TO_HUB_TOKEN,
-            )
-        self.logger.info(f"Successfully uploaded results to: {dataset_id}")
+        for summarized in [False, True]:
+            self.logger.info(f"Pushing {n_results} results to: {dataset_id} with {summarized = }")
+            rows = []
+            for cfg_hash, entry in results.items():
+                row = {
+                    "benchmark_config_hash": cfg_hash,
+                    "config": entry["config"].to_dict(),
+                    "measurements": entry["measurements"].to_dict(summarized=summarized),
+                    "metadata": entry["metadata"].to_dict(),
+                }
+                rows.append(row)
+
+            ds = Dataset.from_list(rows)
+            with tempfile.TemporaryDirectory() as tmp:
+                file_name = "summarized_results" if summarized else "full_results"
+                jsonl_path = os.path.join(tmp, f"{file_name}.jsonl")
+                with open(jsonl_path, "w") as f:
+                    json_lines = []
+                    for ex in ds:
+                        json_lines.append(json.dumps(ex, ensure_ascii=False))
+                    f.write("\n".join(json_lines))
+
+                # NOTE: we expect the repository to already exist
+                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if not timestamp else timestamp
+                file_name = file_name + "/" + f"benchmark_run_{timestamp}.jsonl"
+                api.upload_file(
+                    path_or_fileobj=jsonl_path,
+                    path_in_repo=file_name,
+                    repo_id=dataset_id,
+                    repo_type="dataset",
+                    token=PUSH_TO_HUB_TOKEN,
+                )
+            self.logger.info(f"Successfully uploaded results to: {dataset_id} with {summarized = }")

benchmark_v2/framework/data_classes.py

Lines changed: 26 additions & 18 deletions
@@ -89,31 +89,35 @@ class BenchmarkResult:
 
     def __init__(self) -> None:
         self.e2e_latency = []
+        self._timestamps = []
         self.time_to_first_token = []
        self.inter_token_latency = []
         self.shape_and_decoded_outputs = []
         self.gpu_metrics = []
 
-    def compute_itl(self, token_generation_times: list[float]) -> list[float]:
-        return (token_generation_times[-1] - token_generation_times[0]) / len(token_generation_times)
-
     def accumulate(
         self,
         e2e_latency: float,
-        token_generation_times: list[float],
+        timestamps: list[float],
         shape_and_decoded_output: str,
         gpu_metrics: GPURawMetrics | None,
     ) -> None:
         self.e2e_latency.append(e2e_latency)
-        self.time_to_first_token.append(token_generation_times[0])
-        # inter-token latency is already an average in itself
-        self.inter_token_latency.append(self.compute_itl(token_generation_times))
+        self._timestamps.append(timestamps)
+        self._accumulate_ttft_and_itl(timestamps)
         self.shape_and_decoded_outputs.append(shape_and_decoded_output)
         self.gpu_metrics.append(gpu_metrics)
 
-    def to_dict(self) -> dict[str, None | int | float]:
-        # Save GPU metrics as None if it contains only None values
-        if all(gm is None for gm in self.gpu_metrics):
+    def _accumulate_ttft_and_itl(self, timestamps: list[float]) -> None:
+        timestamps = np.array(timestamps)
+        tftt = np.min(timestamps[:, 0])
+        itl = np.mean(timestamps[:, -1] - timestamps[:, 0]) / (timestamps.shape[1] - 1)
+        self.time_to_first_token.append(tftt)
+        self.inter_token_latency.append(itl)
+
+    def to_dict(self, summarized: bool = False) -> dict[str, Any]:
+        # Save GPU metrics as None if it contains only None values or if we are summarizing
+        if summarized or all(gm is None for gm in self.gpu_metrics):
             gpu_metrics = None
         else:
             gpu_metrics = [gm.to_dict() for gm in self.gpu_metrics]
@@ -123,6 +127,7 @@ def to_dict(self) -> dict[str, None | int | float]:
             "inter_token_latency": self.inter_token_latency,
             "shape_and_decoded_outputs": self.shape_and_decoded_outputs,
             "gpu_metrics": gpu_metrics,
+            "timestamps": None if summarized else self._timestamps,
         }
 
     @classmethod
@@ -132,16 +137,19 @@ def from_dict(cls, data: dict[str, None | int | float]) -> "BenchmarkResult":
             gpu_metrics = [None for _ in range(len(data["e2e_latency"]))]
         else:
             gpu_metrics = [GPURawMetrics.from_dict(gm) for gm in data["gpu_metrics"]]
+        # Handle timestamps, which can be saved as None to reduce file size
+        if data["timestamps"] is None:
+            timestamps = [None for _ in range(len(data["e2e_latency"]))]
+        else:
+            timestamps = data["timestamps"]
         # Create a new instance and accumulate the data
         new_instance = cls()
-        for i in range(len(data["e2e_latency"])):
-            new_instance.accumulate(
-                e2e_latency=data["e2e_latency"][i],
-                time_to_first_token=data["time_to_first_token"][i],
-                inter_token_latency=data["inter_token_latency"][i],
-                shape_and_decoded_output=data["shape_and_decoded_outputs"][i],
-                gpu_metrics=gpu_metrics[i],
-            )
+        new_instance.e2e_latency = data["e2e_latency"]
+        new_instance._timestamps = timestamps
+        new_instance.time_to_first_token = data["time_to_first_token"]
+        new_instance.inter_token_latency = data["inter_token_latency"]
+        new_instance.shape_and_decoded_outputs = data["shape_and_decoded_outputs"]
+        new_instance.gpu_metrics = gpu_metrics
         return new_instance
 
     def get_throughput(self, total_generated_tokens: int) -> list[float]:
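For reference, the metric derivation implemented by the new _accumulate_ttft_and_itl above: time-to-first-token is the earliest first-token timestamp across requests, and inter-token latency is the per-request span (last timestamp minus first) averaged over requests and divided by the number of inter-token gaps. A self-contained example with made-up numbers:

import numpy as np

# Two hypothetical requests, four generated tokens each, timestamps in seconds relative to generation start
timestamps = np.array([
    [0.10, 0.15, 0.20, 0.25],
    [0.12, 0.18, 0.24, 0.30],
])

ttft = np.min(timestamps[:, 0])                                                  # 0.10 s
itl = np.mean(timestamps[:, -1] - timestamps[:, 0]) / (timestamps.shape[1] - 1)  # mean(0.15, 0.18) / 3 = 0.055 s
print(ttft, itl)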

src/transformers/generation/continuous_batching/cache.py

Lines changed: 1 addition & 1 deletion
@@ -379,7 +379,7 @@ def mark_blocks_as_complete(self, state: RequestState) -> None:
         self._block_manager.mark_blocks_as_complete(
             num_complete_blocks=num_complete_blocks,
             allocated_blocks=cm.block_table[state.request_id],
-            prompt_ids=(state.full_prompt_ids + state.static_outputs),
+            prompt_ids=(state.initial_tokens + state.generated_tokens),
         )
 
