Commit f7c33ab

Small changes to benchmarking script (#41662)
1 parent 9839d57 commit f7c33ab

4 files changed: +88 -77 lines changed

benchmark_v2/framework/benchmark_config.py

Lines changed: 15 additions & 18 deletions

@@ -104,7 +104,7 @@ def to_dict(self) -> dict[str, Any]:
             "attn_implementation": self.attn_implementation,
             "sdpa_backend": self.sdpa_backend,
             "compile_mode": self.compile_mode,
-            "compile_options": self.compile_options,
+            "compile_options": self.compile_options | {},  # to avoid inplace modification of the original dict
             "kernelize": self.kernelize,
         }

@@ -191,28 +191,25 @@ def generate_all_configs(
     )


-def generate_default_configs(
+def generate_main_configs(
     warmup_iterations: int = 5,
     measurement_iterations: int = 20,
     batch_size: int = 1,
     sequence_length: int = 128,
     num_tokens_to_generate: int = 128,
     gpu_monitoring: bool = False,
 ) -> list[BenchmarkConfig]:
-    all_attn_implementations = [
-        ("flash_attention_2", None),
-        ("eager", None),
-        ("sdpa", "math"),
-        ("sdpa", "flash_attention"),  # note: this one can fail with compile because of attn mask
+    # Create kwargs common to all configs
+    kwargs = {
+        "warmup_iterations": warmup_iterations,
+        "measurement_iterations": measurement_iterations,
+        "batch_size": batch_size,
+        "sequence_length": sequence_length,
+        "num_tokens_to_generate": num_tokens_to_generate,
+        "gpu_monitoring": gpu_monitoring,
+    }
+    return [  # TODO: test max-autotune instead of default
+        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", **kwargs),
+        BenchmarkConfig(attn_implementation="eager", compile_mode="default", **kwargs),
+        BenchmarkConfig(attn_implementation="flash_attention_2", **kwargs),
     ]
-    return cross_generate_configs(
-        attn_impl_and_sdpa_backend=all_attn_implementations,
-        compiled_mode=[None, "max-autotune"],
-        kernelized=[False, KERNELIZATION_AVAILABLE],
-        warmup_iterations=warmup_iterations,
-        measurement_iterations=measurement_iterations,
-        batch_size=batch_size,
-        sequence_length=sequence_length,
-        num_tokens_to_generate=num_tokens_to_generate,
-        gpu_monitoring=gpu_monitoring,
-    )
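Editor's note: the renamed helper now returns three curated configurations instead of a cross-product. A minimal usage sketch, not part of the commit; it assumes the working directory is benchmark_v2/ (where the framework package lives, as in run_benchmarks.py):

```python
from framework.benchmark_config import generate_main_configs

# Three hand-picked configs: compiled flex_attention, compiled eager,
# and uncompiled flash_attention_2, all sharing the same measurement kwargs.
configs = generate_main_configs(
    warmup_iterations=3,
    measurement_iterations=10,
    batch_size=1,
    sequence_length=128,
    num_tokens_to_generate=128,
)
for config in configs:
    # infer_name() is used the same way in benchmark_runner.py's summary output
    print(config.infer_name(compact=False))
```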

benchmark_v2/framework/benchmark_runner.py

Lines changed: 11 additions & 10 deletions

@@ -144,11 +144,11 @@ def __next__(self):
 class BenchmarkRunner:
     """Main benchmark runner that coordinates benchmark execution."""

-    def __init__(
-        self, logger: logging.Logger, output_dir: str = "benchmark_results", commit_id: str | None = None
-    ) -> None:
+    def __init__(self, logger: logging.Logger, output_dir: str | None = None, commit_id: str | None = None) -> None:
         # Those stay constant for the whole run
         self.logger = logger
+        if output_dir is None:
+            output_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "benchmark_results")
         self.output_dir = output_dir
         self.commit_id = get_git_revision() if commit_id is None else commit_id
         os.makedirs(self.output_dir, exist_ok=True)

@@ -214,7 +214,7 @@ def run_one_benchmark(self, model_id: str, config: BenchmarkConfig, num_tokens_t

         # Quick validation: try one measurement first to see if this scenario works
         flush_memory()
-        e2e_latency, token_generation_times, decoded_output, gpu_metrics = self.time_generate(
+        e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
             max_new_tokens=1, gpu_monitor=None
         )
         if e2e_latency < 0:

@@ -231,11 +231,11 @@ def run_one_benchmark(self, model_id: str, config: BenchmarkConfig, num_tokens_t
         result = BenchmarkResult()
         self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
         for _ in trange(config.measurement_iterations):
-            e2e_latency, token_generation_times, decoded_output, gpu_metrics = self.time_generate(
+            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
                 max_new_tokens=config.num_tokens_to_generate,
                 gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
             )
-            result.accumulate(e2e_latency, token_generation_times, decoded_output, gpu_metrics)
+            result.accumulate(e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics)
         self.logger.info("Benchmarking done. Cleaning up.")

         # Profile if needed

@@ -277,10 +277,11 @@ def time_generate(
             raise RuntimeError(f"Generated {new_tokens} tokens, expected {max_new_tokens}")
         # Decode outputs
         decoded_output = self.tokenizer.decode(outputs[0, input_tokens:], skip_special_tokens=True)
+        shape_and_decoded_output = f"{tuple(outputs.shape)} | {decoded_output}"
         # Compute intermediate quantities
         e2e_latency = wall_time_1 - wall_time_0
         token_generation_times = [t - wall_time_0 for t in streamer.timestamps[1:]]
-        return e2e_latency, token_generation_times, decoded_output, gpu_metrics
+        return e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics

     def profile_generate(self, num_tokens_to_profile: int, config_name: str) -> None:
         """Profile the latency of a call to model.generate() with the given (inputs) and (max_new_tokens)."""

@@ -351,10 +352,10 @@ def run_benchmarks(
         first_metadata = all_results[first_key]["metadata"].to_dict()
         hardware_info = first_metadata.pop("hardware_info")
         pretty_print_dict(first_metadata | hardware_info, tabs=1)
-        for value in all_results.values():
+        for result in all_results.values():
             print("=" * 100)
-            print(f"Config: {value['config'].infer_name(compact=False)}\n")
-            value["measurements"].pprint(tabs=1)
+            print(f"Config: {result['config'].infer_name(compact=False)}\n")
+            result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
             print("=" * 100)

         return all_results
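Editor's note: with the new default, a runner created without an explicit output directory writes next to the framework package rather than into the caller's working directory. A minimal sketch of the resulting behaviour, standalone and not part of the commit (it again assumes benchmark_v2/ is the working directory so that framework is importable):

```python
import logging
from framework.benchmark_runner import BenchmarkRunner

logger = logging.getLogger("benchmark_v2")
# output_dir=None now resolves to <...>/benchmark_v2/benchmark_results,
# i.e. one directory above framework/, regardless of the current working directory;
# instantiating the runner also creates that directory (os.makedirs(..., exist_ok=True)).
runner = BenchmarkRunner(logger)
print(runner.output_dir)
```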

benchmark_v2/framework/data_classes.py

Lines changed: 29 additions & 21 deletions

@@ -82,19 +82,19 @@ class BenchmarkResult:
     def __init__(self) -> None:
         self.e2e_latency = []
         self.token_generation_times = []  # time at which each token was generated (relative to start of the generation)
-        self.decoded_outputs = []
+        self.shape_and_decoded_outputs = []
         self.gpu_metrics = []

     def accumulate(
         self,
         e2e_latency: float,
         token_generation_times: list[float],
-        decoded_output: str,
+        shape_and_decoded_output: str,
         gpu_metrics: GPURawMetrics | None,
     ) -> None:
         self.e2e_latency.append(e2e_latency)
         self.token_generation_times.append(token_generation_times)
-        self.decoded_outputs.append(decoded_output)
+        self.shape_and_decoded_outputs.append(shape_and_decoded_output)
         self.gpu_metrics.append(gpu_metrics)

     def to_dict(self) -> dict[str, None | int | float]:

@@ -106,7 +106,7 @@ def to_dict(self) -> dict[str, None | int | float]:
         return {
             "e2e_latency": self.e2e_latency,
             "token_generation_times": self.token_generation_times,
-            "decoded_outputs": self.decoded_outputs,
+            "shape_and_decoded_outputs": self.shape_and_decoded_outputs,
             "gpu_metrics": gpu_metrics,
         }

@@ -123,7 +123,7 @@ def from_dict(cls, data: dict[str, None | int | float]) -> "BenchmarkResult":
             new_instance.accumulate(
                 e2e_latency=data["e2e_latency"][i],
                 token_generation_times=data["token_generation_times"][i],
-                decoded_output=data["decoded_output"][i],
+                shape_and_decoded_output=data["shape_and_decoded_outputs"][i],
                 gpu_metrics=gpu_metrics[i],
             )
         return new_instance

@@ -134,19 +134,27 @@ def get_measured_ttft(self) -> list[float]:
     def get_measured_itl(self) -> list[float]:
         return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]

-    def pprint(self, tabs: int = 0) -> None:
-        collated_stats = equalize_lengths_and_collate(
-            [
-                add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
-                add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
-                add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
-            ]
-        )
-        pretty_print_dict(
-            {
-                "E2E Latency": collated_stats[0],
-                "Time to First Token": collated_stats[1],
-                "Inter-Token Latency": collated_stats[2],
-            },
-            tabs=tabs,
-        )
+    def get_throughput(self, batch_size: int) -> float:
+        return [
+            batch_size * len(dt) / e2e_latency
+            for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
+        ]
+
+    def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
+        stats_to_collate = [
+            add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
+        ]
+        if batch_size > 0:
+            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
+            stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
+        collated_stats = equalize_lengths_and_collate(stats_to_collate)
+        dict_to_pprint = {
+            "E2E Latency": collated_stats[0],
+            "Time to First Token": collated_stats[1],
+            "Inter-Token Latency": collated_stats[2],
+        }
+        if batch_size > 0:
+            dict_to_pprint["Throughput"] = collated_stats[3]
+        pretty_print_dict(dict_to_pprint, tabs=tabs)
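Editor's note: the new throughput column is derived from quantities the class already stores: tokens generated in a measurement times the batch size, divided by the end-to-end latency. A standalone worked example of that formula (the numbers are made up; the "tok/s" formatting mirrors the f-string in pprint above):

```python
# Hypothetical measurement: a batch of 4 sequences, 128 generated tokens each,
# with the last token arriving 2.0 s after generation started.
batch_size = 4
e2e_latency = 2.0
token_generation_times = [0.0156 * i for i in range(1, 129)]  # 128 per-token timestamps

# Same expression as BenchmarkResult.get_throughput, for a single measurement.
throughput = batch_size * len(token_generation_times) / e2e_latency
print(f"{throughput:.2f}tok/s")  # -> "256.00tok/s"
```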

benchmark_v2/run_benchmarks.py

Lines changed: 33 additions & 28 deletions

@@ -20,28 +20,28 @@

 import argparse
 import logging
-import random
 import sys
 import uuid

-from framework.benchmark_config import BenchmarkConfig, generate_all_configs
+from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
 from framework.benchmark_runner import BenchmarkRunner


 if __name__ == "__main__":
     # Parse arguments
     parser = argparse.ArgumentParser()
-    parser.add_argument("--output-dir", type=str, default="benchmark_results", help="Output dir for benchmark results")
+    parser.add_argument("--output-dir", type=str, default=None, help="Output dir for benchmark results")
     parser.add_argument("--log-level", type=str, choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO")
     parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")

-    parser.add_argument("--warmup", type=int, default=5, help="Number of warmup iterations")
-    parser.add_argument("--iterations", type=int, default=20, help="Number of measurement iterations")
+    parser.add_argument("--warmup", type=int, default=3, help="Number of warmup iterations")
+    parser.add_argument("--iterations", type=int, default=10, help="Number of measurement iterations")

     parser.add_argument("--batch-size", "-b", type=int, nargs="+", help="Batch size")
     parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
     parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")

+    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
     parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")

     parser.add_argument("--commit-id", type=str, help="Git commit ID (if not provided, will auto-detect from git)")

@@ -69,42 +69,47 @@

     # If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
     elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
-        benchmark_configs = generate_all_configs(
+        if args.cross_generate:
+            benchmark_configs = generate_all_configs(
+                warmup_iterations=args.warmup,
+                measurement_iterations=args.iterations,
+                batch_size=args.batch_size[0],
+                sequence_length=args.sequence_length[0],
+                num_tokens_to_generate=args.num_tokens_to_generate[0],
+            )
+        else:
+            benchmark_configs = generate_main_configs(
+                warmup_iterations=args.warmup,
+                measurement_iterations=args.iterations,
+                batch_size=args.batch_size[0],
+                sequence_length=args.sequence_length[0],
+                num_tokens_to_generate=args.num_tokens_to_generate[0],
+            )
+
+    # Otherwise, we benchmark across all combinations of dimensions
+    else:
+        main_config = generate_main_configs(
             warmup_iterations=args.warmup,
             measurement_iterations=args.iterations,
             batch_size=args.batch_size[0],
             sequence_length=args.sequence_length[0],
            num_tokens_to_generate=args.num_tokens_to_generate[0],
-        )
-        random.shuffle(benchmark_configs)
-
-    # Otherwise, we benchmark across all combinations of dimensions
-    else:
-        kwargs = {
-            "warmup_iterations": args.warmup,
-            "measurement_iterations": args.iterations,
-            "gpu_monitoring": False,
-            "batch_size": args.batch_size[0],
-            "sequence_length": args.sequence_length[0],
-            "num_tokens_to_generate": args.num_tokens_to_generate[0],
-            "attn_implementation": "flex_attention",
-            "sdpa_backend": None,
-            "compile_mode": "default",
-            "kernelize": False,
-        }
+        )[0]
         benchmark_configs = []
         for num_tokens_to_generate in args.num_tokens_to_generate:
             for sequence_length in args.sequence_length:
                 for batch_size in args.batch_size:
-                    kwargs["batch_size"] = batch_size
-                    kwargs["sequence_length"] = sequence_length
-                    kwargs["num_tokens_to_generate"] = num_tokens_to_generate
-                    benchmark_configs.append(BenchmarkConfig(**kwargs))
+                    cfg_dict = main_config.to_dict()
+                    cfg_dict["batch_size"] = batch_size
+                    cfg_dict["sequence_length"] = sequence_length
+                    cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
+                    cfg_dict.pop("name")
+                    benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))

     runner = BenchmarkRunner(logger, args.output_dir, args.commit_id)
     results = runner.run_benchmarks(
         args.model_id,
-        benchmark_configs[:3],
+        benchmark_configs,
         args.num_tokens_to_profile,
         pretty_print_summary=True,
     )
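Editor's note: when several -b/-s/-n values are passed, the script now clones the first main config through its dict representation instead of rebuilding a hard-coded kwargs dict. A standalone sketch of that cloning pattern, mirroring the sweep branch above (the batch sizes and sequence lengths below are arbitrary examples):

```python
from framework.benchmark_config import BenchmarkConfig, generate_main_configs

# Take the first curated config and fan it out over example dimensions.
main_config = generate_main_configs(warmup_iterations=3, measurement_iterations=10)[0]
benchmark_configs = []
for batch_size in (1, 4):
    for sequence_length in (128, 2048):
        cfg_dict = main_config.to_dict()
        cfg_dict["batch_size"] = batch_size
        cfg_dict["sequence_length"] = sequence_length
        cfg_dict.pop("name")  # dropped before from_dict, as in the script above
        benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
```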
