Files
vllm/benchmarks/kernels/benchmark_mrope.py

329 lines
11 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# This script benchmarks the mrope kernel (mainly for Qwen2VL and Qwen2.5VL models).
# It generates test data, runs benchmarks, and saves results to a CSV file.
#
# The CSV file (named with current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup
#
# == Usage Examples ==
#
# Single model benchmark:
# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \
# --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models benchmark:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models with different TP sizes:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models with different token counts:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384
import csv
import os
import time
from datetime import datetime
from typing import Any
import numpy as np
import torch
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.utils import FlexibleArgumentParser
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def generate_test_data(
num_tokens: int,
num_q_heads: int,
num_kv_heads: int,
head_size: int,
max_position_embeddings: int,
dtype: torch.dtype,
device: torch.device,
):
"""Generate test data for given configuration."""
# Create 2D positions (3, num_tokens) for multimodal case
positions = torch.randint(
0, max_position_embeddings // 4, (3, num_tokens), device=device
)
# Create query and key tensors
query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device)
key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device)
return positions, query, key
def calculate_stats(times: list[float]) -> dict[str, float]:
"""Calculate statistics from a list of times."""
times_array = np.array(times)
return {
"mean": np.mean(times_array),
"median": np.median(times_array),
"p99": np.percentile(times_array, 99),
"min": np.min(times_array),
"max": np.max(times_array),
}
def benchmark_mrope(
model_name: str,
num_tokens: int,
head_dim: int,
tp_size: int,
num_heads: int,
num_kv_heads: int,
max_position: int = 8192,
rope_theta: float = 10000,
is_neox_style: bool = True,
rope_scaling: dict[str, Any] = None,
dtype: torch.dtype = torch.bfloat16,
seed: int = 0,
warmup_iter: int = 10,
benchmark_iter: int = 100,
csv_writer=None,
):
current_platform.seed_everything(seed)
torch.set_default_device(device)
# the parameters to compute the q k v size based on tp_size
mrope_helper_class = get_rope(
head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=rope_scaling,
dtype=dtype,
).to(device=device)
print(80 * "=")
print(
f"Evaluating model: {model_name} "
f"with tp_size: {tp_size} "
f"and num_tokens: {num_tokens}, "
f"dtype: {dtype}"
)
# create q k v input tensors
# create rotary pos emb input tensors
positions, query, key = generate_test_data(
num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
)
# Warm up
for _ in range(warmup_iter):
mrope_helper_class.forward_native(
positions,
query.clone(),
key.clone(),
)
mrope_helper_class.forward_cuda(
positions,
query.clone(),
key.clone(),
)
torch.cuda.synchronize()
# Time reference implementation
torch_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_native(
positions,
query_clone,
key_clone,
)
torch.cuda.synchronize()
torch_times.append(time.time() - start_time)
# Time triton kernel implementation
triton_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_cuda(
positions,
query_clone,
key_clone,
)
torch.cuda.synchronize()
triton_times.append(time.time() - start_time)
# Calculate statistics
torch_stats = calculate_stats(torch_times)
triton_stats = calculate_stats(triton_times)
print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):")
print(
f"Torch implementation: "
f"mean={torch_stats['mean']:.8f}s, "
f"median={torch_stats['median']:.8f}s, "
f"p99={torch_stats['p99']:.8f}s"
)
print(
f"Triton implementation: "
f"mean={triton_stats['mean']:.8f}s, "
f"median={triton_stats['median']:.8f}s, "
f"p99={triton_stats['p99']:.8f}s"
)
print(
f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x"
)
# Write to CSV
if csv_writer:
row = [
model_name,
tp_size,
num_tokens,
num_heads,
num_kv_heads,
head_dim,
max_position,
rope_theta,
is_neox_style,
str(rope_scaling),
str(dtype).split(".")[-1],
torch_stats["mean"],
torch_stats["median"],
torch_stats["p99"],
torch_stats["min"],
torch_stats["max"],
triton_stats["mean"],
triton_stats["median"],
triton_stats["p99"],
triton_stats["min"],
triton_stats["max"],
torch_stats["mean"] / triton_stats["mean"], # speedup
]
csv_writer.writerow(row)
return torch_stats, triton_stats
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark the rotary embedding kernels."
)
parser.add_argument("--model-name", type=str, default="")
parser.add_argument("--tp-size", type=int, default=1)
parser.add_argument("--warmup-iter", type=int, default=10)
parser.add_argument("--benchmark-iter", type=int, default=100)
parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--num-tokens", type=int, nargs="+", required=False)
parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv")
args = parser.parse_args()
print(args)
# Create CSV file for results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv"
with open(csv_filename, "w", newline="") as csvfile:
csv_writer = csv.writer(csvfile)
# Write header
header = [
"model_name",
"tp_size",
"num_tokens",
"num_heads",
"num_kv_heads",
"head_dim",
"max_position",
"rope_theta",
"is_neox_style",
"rope_scaling",
"dtype",
"torch_mean",
"torch_median",
"torch_p99",
"torch_min",
"torch_max",
"triton_mean",
"triton_median",
"triton_p99",
"triton_min",
"triton_max",
"speedup",
]
csv_writer.writerow(header)
model_tp_dict = {}
if args.model_name == "":
model_tp_dict = {
"Qwen/Qwen2-VL-2B-Instruct": [1],
"Qwen/Qwen2-VL-7B-Instruct": [1],
"Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8],
"Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8],
"Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8],
"Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8],
}
else:
model_tp_dict[args.model_name] = [args.tp_size]
if args.num_tokens is None:
num_tokens_list = [2**i for i in range(0, 18)]
else:
num_tokens_list = args.num_tokens
for model_name, tp_list in model_tp_dict.items():
config = get_config(model_name, trust_remote_code=args.trust_remote_code)
for tp_size in tp_list:
# get the model config
total_num_kv_heads = config.num_key_value_heads
total_num_heads = config.num_attention_heads
num_heads = total_num_heads // tp_size
num_kv_heads = max(1, total_num_kv_heads // tp_size)
head_dim = config.hidden_size // total_num_heads
q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim
is_neox_style = True
rope_theta = config.rope_theta
max_position = config.max_position_embeddings
for num_tokens in num_tokens_list:
benchmark_mrope(
model_name=model_name,
num_tokens=num_tokens,
head_dim=head_dim,
tp_size=tp_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
max_position=max_position,
rope_theta=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
dtype=getattr(torch, args.dtype),
seed=args.seed,
warmup_iter=args.warmup_iter,
benchmark_iter=args.benchmark_iter,
csv_writer=csv_writer,
)
print(f"Benchmark results saved to {csv_filename}")