vllm/benchmarks/kv_cache/benchmark_block_pool.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from typing import Optional

from tabulate import tabulate

from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool


class Metric:
    """Running count, sum and maximum over a stream of integer samples.

    The class is unit-agnostic; it simply aggregates whatever values are
    passed to :meth:`update`.
    """

    def __init__(self) -> None:
        # Number of samples recorded so far.
        self.cnt: int = 0
        # Sum of every recorded sample.
        self.sum_v: int = 0
        # Largest sample seen; remains None until the first update().
        self.max_v: Optional[int] = None

    def update(self, v: int) -> None:
        """Record one sample, updating the count, sum and maximum."""
        self.cnt += 1
        self.sum_v += v
        self.max_v = v if self.max_v is None else max(self.max_v, v)

    def avg_v(self) -> float:
        """Return the mean of the recorded samples.

        Returns 0.0 when no sample has been recorded, instead of raising
        ``ZeroDivisionError``.
        """
        if self.cnt == 0:
            return 0.0
        return self.sum_v / self.cnt


def main(args):
    """Time ``BlockPool`` allocation and release, then print a summary table.

    For every size in ``args.allocate_blocks`` a new ``BlockPool`` with
    ``args.num_gpu_blocks`` blocks is created, and ``get_new_blocks`` /
    ``free_blocks`` are timed ``args.num_iteration`` times each. Each size
    contributes one table row with the average and maximum latency of both
    calls, converted from nanoseconds to milliseconds.
    """
    ns_per_ms = 1000000.0
    column_titles = [
        "Iterations",
        "Total\nBlocks",
        "Allocated\nBlocks",
        "Get Blocks\nAvg (ms)",
        "Get Blocks\nMax (ms)",
        "Free Blocks\nAvg (ms)",
        "Free Blocks\nMax (ms)",
    ]

    rows = []
    for num_to_allocate in args.allocate_blocks:
        # Collect garbage up front so leftovers from the previous run are
        # less likely to disturb the timings of this one.
        gc.collect()
        pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)

        # NOTE: these two names appear verbatim in the diagnostic message
        # below (f-string "=" specifier), so they must not be renamed.
        get_blocks_metric = Metric()
        free_blocks_metric = Metric()
        for _ in range(args.num_iteration):
            started_ns = time.monotonic_ns()
            allocated = pool.get_new_blocks(num_to_allocate)
            allocated_ns = time.monotonic_ns()
            pool.free_blocks(allocated)
            freed_ns = time.monotonic_ns()
            # Bookkeeping happens after the last timestamp so it is not
            # included in either measured interval.
            get_blocks_metric.update(allocated_ns - started_ns)
            free_blocks_metric.update(freed_ns - allocated_ns)

        get_max_ns = get_blocks_metric.max_v
        free_max_ns = free_blocks_metric.max_v
        if get_max_ns is None or free_max_ns is None:
            # No sample was recorded (e.g. zero iterations): report and skip.
            print(
                "No valid metrics found."
                f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
            )
            continue

        rows.append(
            [
                get_blocks_metric.cnt,
                args.num_gpu_blocks,
                num_to_allocate,
                get_blocks_metric.avg_v() / ns_per_ms,
                get_max_ns / ns_per_ms,
                free_blocks_metric.avg_v() / ns_per_ms,
                free_max_ns / ns_per_ms,
            ]
        )

    report = tabulate(
        rows,
        headers=column_titles,
        tablefmt="grid",
        floatfmt=".6f",
    )
    print(report)


def invoke_main() -> None:
    """Parse the command-line options and run the BlockPool benchmark.

    Options:
        --num-gpu-blocks: number of blocks the benchmarked pool is built with.
        --num-iteration: how many allocate/free rounds are timed per size.
        --allocate-blocks: one or more allocation sizes to benchmark.
    """
    parser = FlexibleArgumentParser(
        description="Benchmark the performance of BlockPool for KV Cache."
    )
    parser.add_argument(
        "--num-gpu-blocks",
        type=int,
        default=100000,
        help="Total number of GPU blocks in the BlockPool",
    )
    parser.add_argument(
        "--num-iteration",
        type=int,
        default=1000,
        help="Number of iterations to run to stabilize final data readings",
    )
    parser.add_argument(
        "--allocate-blocks",
        type=int,
        nargs="*",
        default=[10, 50, 100, 500, 1000],
        help="Number of blocks to allocate",
    )
    args = parser.parse_args()
    main(args)


# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    invoke_main()  # pragma: no cover