mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 23:03:52 +08:00
75 lines
2.1 KiB
Python
75 lines
2.1 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
import gc
|
|
|
|
from tabulate import tabulate
|
|
|
|
from benchmark_utils import TimeCollector
|
|
from vllm.utils import FlexibleArgumentParser
|
|
from vllm.v1.core.block_pool import BlockPool
|
|
|
|
|
|
def main(args):
    """Benchmark ``BlockPool`` block allocation and release, then print a table.

    For each value in ``args.allocate_blocks`` a fresh ``BlockPool`` holding
    ``args.num_gpu_blocks`` blocks (with caching enabled) is created. Then
    ``get_new_blocks`` and ``free_blocks`` are each run
    ``args.num_iteration`` times inside their own ``TimeCollector`` context.
    One table row is emitted per allocation size.

    Args:
        args: Parsed command-line namespace providing ``allocate_blocks``,
            ``num_gpu_blocks`` and ``num_iteration``.
    """
    column_titles = [
        "Iterations",
        "Total\nBlocks",
        "Allocated\nBlocks",
        "Get Blocks\nAvg (us)",
        "Get Blocks\nMax (us)",
        "Free Blocks\nAvg (us)",
        "Free Blocks\nMax (us)",
    ]

    table = []
    for n_alloc in args.allocate_blocks:
        # Force a garbage collection up front so leftovers from the previous
        # configuration disturb this run's measurements as little as possible.
        gc.collect()

        pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
        alloc_timer = TimeCollector(TimeCollector.US)
        release_timer = TimeCollector(TimeCollector.US)

        for _ in range(args.num_iteration):
            # Only the pool call itself sits inside each collector's context.
            with alloc_timer:
                taken = pool.get_new_blocks(n_alloc)
            with release_timer:
                pool.free_blocks(taken)

        row = [alloc_timer.cnt, args.num_gpu_blocks, n_alloc]
        row.extend(alloc_timer.dump_avg_max())
        row.extend(release_timer.dump_avg_max())
        table.append(row)

    rendered = tabulate(
        table,
        headers=column_titles,
        tablefmt="grid",
        floatfmt=".3f",
    )
    print(rendered)
|
|
|
|
|
|
def invoke_main() -> None:
    """Parse the command-line options and run the benchmark through ``main``.

    Options:
        --num-gpu-blocks: size of the ``BlockPool`` built for every run.
        --num-iteration: how many allocate/free rounds are executed per
            allocation size.
        --allocate-blocks: one or more allocation sizes; each produces one
            row in the printed table.
    """
    parser = FlexibleArgumentParser(
        description="Benchmark the performance of BlockPool for KV Cache."
    )
    parser.add_argument(
        "--num-gpu-blocks",
        type=int,
        default=100000,
        # Previously undocumented in --help output.
        help="Total number of GPU blocks managed by the BlockPool",
    )
    parser.add_argument(
        "--num-iteration",
        type=int,
        default=1000,
        help="Number of iterations to run to stabilize final data readings",
    )
    parser.add_argument(
        "--allocate-blocks",
        type=int,
        nargs="*",
        default=[10, 50, 100, 500, 1000],
        help="Number of blocks to allocate",
    )
    args = parser.parse_args()
    main(args)
|
|
|
|
|
|
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    invoke_main()  # pragma: no cover
|