# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# This script benchmarks the mrope kernel (mainly for Qwen2-VL and Qwen2.5-VL
# models). It generates test data, runs benchmarks, and saves results to a CSV file.
#
# The CSV file (named with the current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup
#
# == Usage Examples ==
#
# Single model benchmark:
# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \
#     --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models benchmark:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
#     --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models with different TP sizes:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \
#     --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models with different token counts:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
#     --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384
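#
# To inspect results afterwards, something like the following works (a minimal
# sketch, assuming pandas is installed; substitute the timestamped filename this
# script prints on completion):
#
#   import pandas as pd
#   df = pd.read_csv("mrope_benchmark_results_<timestamp>.csv")
#   print(df.groupby("model_name")["speedup"].describe())
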
import csv
import os
import time
from datetime import datetime
from typing import Any, Optional

import numpy as np
import torch

from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.utils import FlexibleArgumentParser

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def generate_test_data(
    num_tokens: int,
    num_q_heads: int,
    num_kv_heads: int,
    head_size: int,
    max_position_embeddings: int,
    dtype: torch.dtype,
    device: torch.device,
):
    """Generate test data for a given configuration."""
    # Create 2D positions (3, num_tokens) for the multimodal case
    positions = torch.randint(
        0, max_position_embeddings // 4, (3, num_tokens), device=device
    )
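    # For Qwen2-VL-style mrope, the three position rows carry the temporal,
    # height, and width components of the multimodal position ids.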

    # Create query and key tensors
    query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device)
    key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device)

    return positions, query, key


def calculate_stats(times: list[float]) -> dict[str, float]:
    """Calculate statistics from a list of times."""
    times_array = np.array(times)
    return {
        "mean": np.mean(times_array),
        "median": np.median(times_array),
        "p99": np.percentile(times_array, 99),
        "min": np.min(times_array),
        "max": np.max(times_array),
    }


def benchmark_mrope(
    model_name: str,
    num_tokens: int,
    head_dim: int,
    tp_size: int,
    num_heads: int,
    num_kv_heads: int,
    max_position: int = 8192,
    rope_theta: float = 10000,
    is_neox_style: bool = True,
    rope_scaling: Optional[dict[str, Any]] = None,
    dtype: torch.dtype = torch.bfloat16,
    seed: int = 0,
    warmup_iter: int = 10,
    benchmark_iter: int = 100,
    csv_writer=None,
):
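    """Benchmark the native and Triton mrope paths for one configuration.

    Returns (torch_stats, triton_stats); if csv_writer is given, also appends
    one row of results to the CSV file.
    """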
    current_platform.seed_everything(seed)
    torch.set_default_device(device)
    # num_heads and num_kv_heads are already the per-rank values for tp_size,
    # so the q/k widths below come out to num_heads * head_dim and
    # num_kv_heads * head_dim
    mrope_helper_class = get_rope(
        head_size=head_dim,
        rotary_dim=head_dim,
        max_position=max_position,
        base=rope_theta,
        is_neox_style=is_neox_style,
        rope_scaling=rope_scaling,
        dtype=dtype,
    ).to(device=device)

    print(80 * "=")
    print(
        f"Evaluating model: {model_name} "
        f"with tp_size: {tp_size} "
        f"and num_tokens: {num_tokens}, "
        f"dtype: {dtype}"
    )

    # Create rotary position and query/key input tensors
    positions, query, key = generate_test_data(
        num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
    )

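    # forward_native is the pure-PyTorch reference path, while forward_cuda
    # dispatches to the Triton mrope kernel; warming up both keeps one-time
    # compilation costs out of the timed runs.
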
    # Warm up
    for _ in range(warmup_iter):
        mrope_helper_class.forward_native(
            positions,
            query.clone(),
            key.clone(),
        )

        mrope_helper_class.forward_cuda(
            positions,
            query.clone(),
            key.clone(),
        )

    torch.cuda.synchronize()

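    # CUDA launches are asynchronous, so each timed region below is bracketed
    # with torch.cuda.synchronize() to capture actual device execution time.
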
    # Time reference implementation
    torch_times = []
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
        torch.cuda.synchronize()
        # perf_counter() is monotonic and higher-resolution than time.time()
        start_time = time.perf_counter()
        mrope_helper_class.forward_native(
            positions,
            query_clone,
            key_clone,
        )
        torch.cuda.synchronize()
        torch_times.append(time.perf_counter() - start_time)

    # Time triton kernel implementation
    triton_times = []
    for _ in range(benchmark_iter):
        query_clone = query.clone()
        key_clone = key.clone()
        torch.cuda.synchronize()
        start_time = time.perf_counter()
        mrope_helper_class.forward_cuda(
            positions,
            query_clone,
            key_clone,
        )
        torch.cuda.synchronize()
        triton_times.append(time.perf_counter() - start_time)

    # Calculate statistics
    torch_stats = calculate_stats(torch_times)
    triton_stats = calculate_stats(triton_times)
    print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):")
    print(
        f"Torch implementation: "
        f"mean={torch_stats['mean']:.8f}s, "
        f"median={torch_stats['median']:.8f}s, "
        f"p99={torch_stats['p99']:.8f}s"
    )
    print(
        f"Triton implementation: "
        f"mean={triton_stats['mean']:.8f}s, "
        f"median={triton_stats['median']:.8f}s, "
        f"p99={triton_stats['p99']:.8f}s"
    )
    print(
        f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x"
    )

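    # speedup = torch_mean / triton_mean, so values above 1.0 mean the Triton
    # kernel outperformed the native implementation.
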
    # Write to CSV
    if csv_writer:
        row = [
            model_name,
            tp_size,
            num_tokens,
            num_heads,
            num_kv_heads,
            head_dim,
            max_position,
            rope_theta,
            is_neox_style,
            str(rope_scaling),
            str(dtype).split(".")[-1],
            torch_stats["mean"],
            torch_stats["median"],
            torch_stats["p99"],
            torch_stats["min"],
            torch_stats["max"],
            triton_stats["mean"],
            triton_stats["median"],
            triton_stats["p99"],
            triton_stats["min"],
            triton_stats["max"],
            torch_stats["mean"] / triton_stats["mean"],  # speedup
        ]
        csv_writer.writerow(row)

    return torch_stats, triton_stats


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark the rotary embedding kernels."
    )
    parser.add_argument("--model-name", type=str, default="")
    # Accept one or more TP sizes so a single invocation can sweep them,
    # matching the usage examples at the top of this file
    parser.add_argument("--tp-size", type=int, nargs="+", default=[1])
    parser.add_argument("--warmup-iter", type=int, default=10)
    parser.add_argument("--benchmark-iter", type=int, default=100)
    parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--num-tokens", type=int, nargs="+", required=False)
    parser.add_argument("--trust-remote-code", action="store_true")
    parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv")
    args = parser.parse_args()
    print(args)

    # Create CSV file for results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv"

    with open(csv_filename, "w", newline="") as csvfile:
        csv_writer = csv.writer(csvfile)
        # Write header
        header = [
            "model_name",
            "tp_size",
            "num_tokens",
            "num_heads",
            "num_kv_heads",
            "head_dim",
            "max_position",
            "rope_theta",
            "is_neox_style",
            "rope_scaling",
            "dtype",
            "torch_mean",
            "torch_median",
            "torch_p99",
            "torch_min",
            "torch_max",
            "triton_mean",
            "triton_median",
            "triton_p99",
            "triton_min",
            "triton_max",
            "speedup",
        ]
        csv_writer.writerow(header)

        model_tp_dict = {}
        if args.model_name == "":
            # No model specified: sweep a default set of Qwen VL models with
            # hard-coded TP sizes (--tp-size is ignored in this mode)
            model_tp_dict = {
                "Qwen/Qwen2-VL-2B-Instruct": [1],
                "Qwen/Qwen2-VL-7B-Instruct": [1],
                "Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8],
                "Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8],
                "Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8],
                "Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8],
            }
        else:
            model_tp_dict[args.model_name] = args.tp_size

        if args.num_tokens is None:
            # Default sweep: token counts from 1 to 131072 in powers of two
            num_tokens_list = [2**i for i in range(0, 18)]
        else:
            num_tokens_list = args.num_tokens

        for model_name, tp_list in model_tp_dict.items():
            # Get the model config once per model
            config = get_config(model_name, trust_remote_code=args.trust_remote_code)
            for tp_size in tp_list:
                # Derive per-rank head counts from the config and TP size
                total_num_kv_heads = config.num_key_value_heads
                total_num_heads = config.num_attention_heads
                num_heads = total_num_heads // tp_size
                num_kv_heads = max(1, total_num_kv_heads // tp_size)
                head_dim = config.hidden_size // total_num_heads
                is_neox_style = True
                rope_theta = config.rope_theta
                max_position = config.max_position_embeddings

                for num_tokens in num_tokens_list:
                    benchmark_mrope(
                        model_name=model_name,
                        num_tokens=num_tokens,
                        head_dim=head_dim,
                        tp_size=tp_size,
                        num_heads=num_heads,
                        num_kv_heads=num_kv_heads,
                        max_position=max_position,
                        rope_theta=rope_theta,
                        is_neox_style=is_neox_style,
                        rope_scaling=config.rope_scaling,
                        dtype=getattr(torch, args.dtype),
                        seed=args.seed,
                        warmup_iter=args.warmup_iter,
                        benchmark_iter=args.benchmark_iter,
                        csv_writer=csv_writer,
                    )

    print(f"Benchmark results saved to {csv_filename}")