Mirror of https://github.com/vllm-project/vllm.git (synced 2025-11-04 17:34:34 +08:00)

Compare commits: v0.11.0rc2 ... pd_schedul (1 commit)

| Author | SHA1 | Date |
|---|---|---|
|  | 161010c384 |  |
@@ -1,20 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import sys
import zipfile

# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
# Note that we have 800 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/6326 .
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


def print_top_10_largest_files(zip_file):
    """Print the top 10 largest files in the given zip file."""
    with zipfile.ZipFile(zip_file, "r") as z:
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:

@@ -29,18 +28,14 @@ def check_wheel_size(directory):
                wheel_path = os.path.join(root, file_name)
                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                if wheel_size_mb > VLLM_MAX_SIZE_MB:
                    print(
                        f"Not allowed: Wheel {wheel_path} is larger "
                        f"({wheel_size_mb:.2f} MB) than the limit "
                        f"({VLLM_MAX_SIZE_MB} MB)."
                    )
                    print(f"Not allowed: Wheel {wheel_path} is larger "
                          f"({wheel_size_mb:.2f} MB) than the limit "
                          f"({VLLM_MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(
                        f"Wheel {wheel_path} is within the allowed size "
                        f"({wheel_size_mb:.2f} MB)."
                    )
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb:.2f} MB).")
    return 0
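For orientation, here is a minimal sketch (not part of the diff) of the check both versions perform: the limit comes from the VLLM_MAX_SIZE_MB environment variable with a hard-coded fallback, and sizes are divided by 1024 * 1024, so the limit is effectively measured in MiB even though the messages say MB. The wheel path below is hypothetical.

```python
import os

# Hypothetical wheel path, for illustration only.
wheel_path = "dist/vllm-example-py3-none-any.whl"

# Same fallback logic as check-wheel-size.py: the env var wins, else the default.
limit_mb = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))

if os.path.exists(wheel_path):
    # 1024 * 1024 divisor, matching the script.
    wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    print(f"{wheel_path}: {wheel_size_mb:.2f} MB (limit {limit_mb} MB)")
```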
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import os

@@ -8,8 +7,7 @@ template = """<!DOCTYPE html>
<html>
    <body>
    <h1>Links for vLLM</h1/>
        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
    </body>
</html>
"""

@@ -22,25 +20,7 @@ filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )
        template.format(wheel=filename,
                        wheel_html_escaped=filename.replace("+", "%2B")))
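As a quick illustration of the string transformations in this script (a sketch only; the wheel filename below is hypothetical), both versions percent-encode '+' because CloudFront requires escaping it, and the newer variant additionally derives the aarch64 wheel name by swapping the architecture and manylinux tags:

```python
# Hypothetical wheel filename, used only to illustrate the transformations.
filename = "vllm-0.11.0+cu121-cp38-abi3-manylinux1_x86_64.whl"

x86_wheel = filename
arm_wheel = filename.replace("x86_64", "aarch64").replace("manylinux1", "manylinux2014")

# CloudFront requires escaping the '+' character in the href.
print(x86_wheel.replace("+", "%2B"))  # ...%2Bcu121-...-manylinux1_x86_64.whl
print(arm_wheel.replace("+", "%2B"))  # ...%2Bcu121-...-manylinux2014_aarch64.whl
```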
@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:

@@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:

@@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"

@@ -1,11 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.54
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.59
limit: 1319
    value: 0.416
limit: 1000
num_fewshot: 5
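All of these config files share the same shape: a model_name, a list of tasks with expected metric values, a sample limit and a few-shot count. As a rough sketch of how the correctness test later in this diff consumes one of them (the inline YAML simply mirrors the HandH1998/QQQ-Llama-3-8b-g128 values shown above):

```python
import yaml

config_text = """
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
limit: 1000
num_fewshot: 5
"""

eval_config = yaml.safe_load(config_text)
task_names = [task["name"] for task in eval_config["tasks"]]
print(eval_config["model_name"], task_names,
      eval_config["limit"], eval_config["num_fewshot"])
```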
@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.335
  - name: "exact_match,flexible-extract"
    value: 0.323
limit: 1319
num_fewshot: 5

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:

@@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"

@@ -1,12 +1,11 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.30
    value: 0.31
  - name: "exact_match,flexible-extract"
    value: 0.465
    value: 0.47
limit: 1319
num_fewshot: 5

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.47
  - name: "exact_match,flexible-extract"
    value: 0.64
limit: 1319
num_fewshot: 5

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:
@@ -1,6 +1,10 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
@@ -1,44 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )


@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    rel_path = pytestconfig.getoption("--config-list-file")
    return config_dir / rel_path


@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")


def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        rel_path = metafunc.config.getoption("--config-list-file")
        config_list_file = Path(rel_path).resolve()
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip()
                for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
@@ -2,7 +2,7 @@
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
#   pip install lm-eval==0.4.4

usage() {
    echo``

@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
#   pip install lm-eval==0.4.4

usage() {
    echo``

@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done

lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"

.buildkite/lm-eval-harness/run-tests.sh (new file, 59 lines)
@@ -0,0 +1,59 @@
#!/bin/bash

usage() {
    echo``
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
    echo "  -t    - tensor parallel size"
    echo
}

SUCCESS=0

while getopts "c:t:" OPT; do
  case ${OPT} in
    c )
        CONFIG="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
    LOCAL_SUCCESS=0

    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="

    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
    export LM_EVAL_TP_SIZE=$TP_SIZE
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi

    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
else
    exit 1
fi
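In effect, each iteration of the loop above exports the config path and the tensor parallel size, then runs the correctness test once. A rough Python equivalent of a single iteration (a sketch only, assuming pytest is on PATH and the config file exists under configs/) would be:

```python
import os
import subprocess

env = dict(os.environ)
# Mirrors the exports done by run-tests.sh for one entry of the config list.
env["LM_EVAL_TEST_DATA_FILE"] = os.path.join(
    os.getcwd(), "configs", "Meta-Llama-3-8B-Instruct.yaml")
env["LM_EVAL_TP_SIZE"] = "1"

result = subprocess.run(["pytest", "-s", "test_lm_eval_correctness.py"], env=env)
print("PASSED" if result.returncode == 0 else "FAILED")
```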
@@ -1,57 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""

import os
from pathlib import Path

import lm_eval
import numpy as np
import numpy
import pytest
import yaml

RTOL = 0.08
RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len}"
    )
def launch_lm_eval(eval_config):
    trust_remote_code = eval_config.get('trust_remote_code', False)

    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}," \
                 f"add_bos_token=true," \
                 f"trust_remote_code={trust_remote_code}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
        batch_size="auto")

    return results


def test_lm_eval_correctness_param(config_filename, tp_size):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

    results = launch_lm_eval(eval_config, tp_size)
    if eval_config[
            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
        pytest.skip("FBGEMM is currently failing on main.")

    # Launch eval requests.
    results = launch_lm_eval(eval_config)

    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
                f"ground_truth={ground_truth} | measured={measured_value}"
            )
            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
            success = success and numpy.isclose(
                ground_truth, measured_value, rtol=RTOL)

    # Assert at the end, print all scores even on failure for debugging.
    assert success
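The pass/fail criterion on both sides of this diff is a relative tolerance check against the baseline value from the config file. A tiny sketch of what that tolerance means for a GSM8K score (using the 0.05 value; the other side of the diff relaxes it to 0.08, and the measured score below is hypothetical):

```python
import numpy as np

ground_truth = 0.419  # expected exact_match score from a config file
measured = 0.43       # hypothetical score returned by lm_eval

# numpy checks |a - b| <= atol + rtol * |b|, i.e. relative to the measured value here.
print(np.isclose(ground_truth, measured, rtol=0.05))   # True  (allowed deviation ~0.0215)
print(np.isclose(ground_truth, measured, rtol=0.025))  # False (allowed deviation ~0.0108)
```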
@@ -7,11 +7,11 @@ This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.

See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.

**Benchmarking Duration**: about 1hr.

@@ -28,34 +28,16 @@ See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName
## Trigger the benchmark

Performance benchmark will be triggered when:

- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.

Manually Trigger the benchmark

```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```

Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughout tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

Nightly benchmark will be triggered when:

- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
>

### Latency test

Here is an example of one test inside `latency-tests.json`:
@@ -78,7 +60,7 @@ Here is an example of one test inside `latency-tests.json`:
In this example:

- The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
- The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`

Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
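To make the underscore-to-dash convention concrete, here is a small sketch of the conversion both README variants describe (the actual conversion lives in `run-performance-benchmarks.sh`; this Python snippet only mirrors the documented behavior, using the parameter values from the example above):

```python
# Parameters as they would appear in latency-tests.json (underscores, not dashes).
params = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}

# Underscores become dashes when the arguments are handed to the benchmark command.
args = " ".join(f"--{key.replace('_', '-')} {value}" for key, value in params.items())
print(args)
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15
```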
@@ -86,13 +68,13 @@ WARNING: The benchmarking script will save json results by itself, so please do

### Throughput test

The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`.
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.

The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.

### Serving test

We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

```json
[
@@ -104,6 +86,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@@ -121,8 +104,8 @@ Inside this example:

- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for vLLM server.
- The `client-parameters` includes the command line arguments for `vllm bench serve`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve`
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`

The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly.

@@ -130,29 +113,12 @@ WARNING: The benchmarking script will save json results by itself, so please do

### Visualizing the results

The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.

The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.

Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

|   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|---|---|---|---|---|---|---|---|---|---|
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf | 241.620334 | 294.018783 | 1.216863 |

A comparison diagram will be generated below the table.
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
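The `perf_ratio` column above is just the second file's metric divided by the first file's metric for the matching row (the script documents the ratio as "fileN / file1"). A minimal sketch using the Output Tput values from the example table:

```python
# Output throughput values copied from the example table above.
results_a = [142.633982, 241.620334]  # results_a/benchmark_results.json
results_b = [156.526018, 294.018783]  # results_b/benchmark_results.json

for a, b in zip(results_a, results_b):
    print(f"perf_ratio = {b / a:.4f}")
# roughly 1.0974 and 1.2169, matching the perf_ratio column
```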
## Nightly test details

See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
@@ -160,9 +126,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
### Workflow

- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.

### Nightly tests

@@ -172,6 +138,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a

The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.

WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.

WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
@@ -1,4 +1,3 @@
# Nightly benchmark annotation

## Description

@@ -14,15 +13,15 @@ Please download the visualization scripts in the post

- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
    - Download `nightly-benchmarks.zip`.
    - In the same folder, run the following code:
  - Download `nightly-benchmarks.zip`.
  - In the same folder, run the following code:

    ```bash
    export HF_TOKEN=<your HF token>
    apt update
    apt install -y git
    unzip nightly-benchmarks.zip
    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
    ```
  ```console
  export HF_TOKEN=<your HF token>
  apt update
  apt install -y git
  unzip nightly-benchmarks.zip
  VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  ```

And the results will be inside `./benchmarks/results`.
@@ -8,30 +8,30 @@ This benchmark aims to:

Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.

Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

## Setup

- Docker images:
    - vLLM: `vllm/vllm-openai:v0.6.2`
    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
  - vLLM: `vllm/vllm-openai:v0.6.2`
  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
    - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
    - 8x Nvidia A100 GPUs
  - 8x Nvidia A100 GPUs
- Workload:
    - Dataset
        - ShareGPT dataset
        - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
        - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
        - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
    - Models: llama-3 8B, llama-3 70B.
        - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
    - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
        - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
    - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
  - Dataset
    - ShareGPT dataset
    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
  - Models: llama-3 8B, llama-3 70B.
    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).

## Known issues
@@ -1,12 +1,10 @@
# Performance benchmarks descriptions

## Latency tests

- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).

{latency_tests_markdown_table}
@@ -16,8 +14,7 @@
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.

{throughput_tests_markdown_table}
@@ -28,18 +25,12 @@
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B, under QPS 2
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.

{serving_tests_markdown_table}

## Platform Information

{platform_markdown_table}

## json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
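The `{latency_tests_markdown_table}`, `{throughput_tests_markdown_table}`, `{serving_tests_markdown_table}` and `{platform_markdown_table}` placeholders above are filled in by `convert-results-json-to-markdown.py`, which the README describes as formatting this descriptions file with real benchmarking results. A minimal sketch of that substitution (the table string here is a made-up stand-in, not real results):

```python
# A fragment of the descriptions template shown above.
description_template = "## Latency tests\n\n{latency_tests_markdown_table}\n"

# Hypothetical pre-rendered markdown table standing in for real benchmark output.
latency_table = "| Test name | Mean latency (ms) |\n|---|---|\n| latency_example_tp1 | ... |"

print(description_template.format(latency_tests_markdown_table=latency_table))
```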
@ -1,307 +0,0 @@
 | 
			
		||||
# SPDX-License-Identifier: Apache-2.0
 | 
			
		||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 | 
			
		||||
import argparse
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
from importlib import util
 | 
			
		||||
 | 
			
		||||
import pandas as pd
 | 
			
		||||
 | 
			
		||||
plotly_found = util.find_spec("plotly.express") is not None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def compare_data_columns(
 | 
			
		||||
    files, name_column, data_column, info_cols, drop_column, debug=False
 | 
			
		||||
):
 | 
			
		||||
    """
 | 
			
		||||
    Align concatenation by keys derived from info_cols instead of row order.
 | 
			
		||||
    - Pick one canonical key list: subset of info_cols present in ALL files.
 | 
			
		||||
    - For each file: set index to those keys, aggregate duplicates
 | 
			
		||||
    - (mean for metric, first for names).
 | 
			
		||||
    - Concat along axis=1 (indexes align), then reset_index so callers can
 | 
			
		||||
    - group by columns.
 | 
			
		||||
    - If --debug, add a <file_label>_name column per file.
 | 
			
		||||
    """
 | 
			
		||||
    print("\ncompare_data_column:", data_column)
 | 
			
		||||
 | 
			
		||||
    frames = []
 | 
			
		||||
    raw_data_cols = []
 | 
			
		||||
    compare_frames = []
 | 
			
		||||
 | 
			
		||||
    # 1) choose a canonical key list from info_cols that exists in ALL files
 | 
			
		||||
    cols_per_file = []
 | 
			
		||||
    for f in files:
 | 
			
		||||
        try:
 | 
			
		||||
            df_tmp = pd.read_json(f, orient="records")
 | 
			
		||||
        except Exception as err:
 | 
			
		||||
            raise ValueError(f"Failed to read {f}") from err
 | 
			
		||||
        cols_per_file.append(set(df_tmp.columns))
 | 
			
		||||
 | 
			
		||||
    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
 | 
			
		||||
    if not key_cols:
 | 
			
		||||
        # soft fallback: use any info_cols present in the first file
 | 
			
		||||
        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
 | 
			
		||||
    if not key_cols:
 | 
			
		||||
        raise ValueError(
 | 
			
		||||
            "No common key columns found from info_cols across the input files."
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
 | 
			
		||||
    meta_added = False
 | 
			
		||||
 | 
			
		||||
    for file in files:
 | 
			
		||||
        df = pd.read_json(file, orient="records")
 | 
			
		||||
 | 
			
		||||
        # Keep rows that actually have the compared metric (same as original behavior)
 | 
			
		||||
        if drop_column in df.columns:
 | 
			
		||||
            df = df.dropna(subset=[drop_column], ignore_index=True)
 | 
			
		||||
 | 
			
		||||
        # Stabilize numeric key columns (harmless if missing)
 | 
			
		||||
        for c in (
 | 
			
		||||
            "Input Len",
 | 
			
		||||
            "Output Len",
 | 
			
		||||
            "TP Size",
 | 
			
		||||
            "PP Size",
 | 
			
		||||
            "# of max concurrency.",
 | 
			
		||||
            "qps",
 | 
			
		||||
        ):
 | 
			
		||||
            if c in df.columns:
 | 
			
		||||
                df[c] = pd.to_numeric(df[c], errors="coerce")
 | 
			
		||||
 | 
			
		||||
        # Ensure all key columns exist
 | 
			
		||||
        for c in key_cols:
 | 
			
		||||
            if c not in df.columns:
 | 
			
		||||
                df[c] = pd.NA
 | 
			
		||||
 | 
			
		||||
        # Set index = key_cols and aggregate duplicates → unique MultiIndex
 | 
			
		||||
        df_idx = df.set_index(key_cols, drop=False)
 | 
			
		||||
 | 
			
		||||
        # meta (key columns), unique per key
 | 
			
		||||
        meta = df_idx[key_cols]
 | 
			
		||||
        if not meta.index.is_unique:
 | 
			
		||||
            meta = meta.groupby(level=key_cols, dropna=False).first()
 | 
			
		||||
 | 
			
		||||
        # metric series for this file, aggregated to one row per key
 | 
			
		||||
        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
 | 
			
		||||
        s = df_idx[data_column]
 | 
			
		||||
        if not s.index.is_unique:
 | 
			
		||||
            s = s.groupby(level=key_cols, dropna=False).mean()
 | 
			
		||||
        s.name = file_label  # column label like original
 | 
			
		||||
 | 
			
		||||
        # add meta once (from first file) so keys are the leftmost columns
 | 
			
		||||
        if not meta_added:
 | 
			
		||||
            frames.append(meta)
 | 
			
		||||
            meta_added = True
 | 
			
		||||
 | 
			
		||||
        # (NEW) debug: aligned test-name column per file
 | 
			
		||||
        if debug and name_column in df_idx.columns:
 | 
			
		||||
            name_s = df_idx[name_column]
 | 
			
		||||
            if not name_s.index.is_unique:
 | 
			
		||||
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
 | 
			
		||||
            name_s.name = f"{file_label}_name"
 | 
			
		||||
            frames.append(name_s)
 | 
			
		||||
 | 
			
		||||
        frames.append(s)
 | 
			
		||||
        raw_data_cols.append(file_label)
 | 
			
		||||
        compare_frames.append(s)
 | 
			
		||||
 | 
			
		||||
        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
 | 
			
		||||
        if len(compare_frames) >= 2:
 | 
			
		||||
            base = compare_frames[0]
 | 
			
		||||
            current = compare_frames[-1]
 | 
			
		||||
            ratio = current / base
 | 
			
		||||
            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
 | 
			
		||||
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
 | 
			
		||||
            frames.append(ratio)
 | 
			
		||||
 | 
			
		||||
    # 4) concat on columns with aligned MultiIndex;
 | 
			
		||||
    # then reset_index to return keys as columns
 | 
			
		||||
    concat_df = pd.concat(frames, axis=1)
 | 
			
		||||
    concat_df = concat_df.reset_index(drop=True).reset_index()
 | 
			
		||||
    if "index" in concat_df.columns:
 | 
			
		||||
        concat_df = concat_df.drop(columns=["index"])
 | 
			
		||||
 | 
			
		||||
    # Ensure key/info columns appear first (in your info_cols order)
 | 
			
		||||
    front = [c for c in info_cols if c in concat_df.columns]
 | 
			
		||||
    rest = [c for c in concat_df.columns if c not in front]
 | 
			
		||||
    concat_df = concat_df[front + rest]
 | 
			
		||||
 | 
			
		||||
    print(raw_data_cols)
 | 
			
		||||
    return concat_df, raw_data_cols
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def split_json_by_tp_pp(
    input_file: str = "benchmark_results.json", output_root: str = "."
) -> list[str]:
    """
    Split a benchmark JSON into separate folders by (TP Size, PP Size).

    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
    Returns: list of file paths written.
    """
    # Load JSON data into DataFrame
    with open(input_file, encoding="utf-8") as f:
        data = json.load(f)

    # If the JSON is a dict with a list under common keys, use that list
    if isinstance(data, dict):
        for key in ("results", "serving_results", "benchmarks", "data"):
            if isinstance(data.get(key), list):
                data = data[key]
                break

    df = pd.DataFrame(data)

    # Keep only "serving" tests
    name_col = next(
        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
    )
    if name_col:
        df = df[
            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
        ].copy()

    # Handle alias column names
    rename_map = {
        "tp_size": "TP Size",
        "tensor_parallel_size": "TP Size",
        "pp_size": "PP Size",
        "pipeline_parallel_size": "PP Size",
    }
    df.rename(
        columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
    )

    # Ensure TP/PP columns exist (default to 1 if missing)
    if "TP Size" not in df.columns:
        df["TP Size"] = 1
    if "PP Size" not in df.columns:
        df["PP Size"] = 1

    # make sure TP/PP are numeric ints with no NaN
    df["TP Size"] = (
        pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
    )
    df["PP Size"] = (
        pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
    )

    # Split into separate folders
    saved_paths: list[str] = []
    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
        os.makedirs(folder_name, exist_ok=True)
        filepath = os.path.join(folder_name, "benchmark_results.json")
        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
        print(f"Saved: {filepath}")
        saved_paths.append(filepath)

    return saved_paths


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", action="append", type=str, help="input file name"
    )
    parser.add_argument(
        "--debug", action="store_true", help="show all information for debugging"
    )
    parser.add_argument(
        "--plot",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="plot perf diagrams or not --no-plot --plot",
    )
    parser.add_argument(
        "-x",
        "--xaxis",
        type=str,
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
    args = parser.parse_args()

    drop_column = "P99"
    name_column = "Test name"
    info_cols = [
        "Model",
        "Dataset Name",
        "Input Len",
        "Output Len",
        "TP Size",
        "PP Size",
        "# of max concurrency.",
        "qps",
    ]
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]

    if len(args.file) == 1:
        files = split_json_by_tp_pp(args.file[0], output_root="splits")
        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
    else:
        files = args.file
    print("comparing : " + ", ".join(files))
    debug = args.debug
    plot = args.plot
    # For Plot feature, assign y axis from one of info_cols
    y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
    with open("perf_comparison.html", "w") as text_file:
        for i in range(len(data_cols_to_compare)):
            output_df, raw_data_cols = compare_data_columns(
                files,
                name_column,
                data_cols_to_compare[i],
                info_cols,
                drop_column,
                debug=debug,
            )

            # For Plot feature, insert y axis from one of info_cols
            raw_data_cols.insert(0, info_cols[y_axis_index])

            filtered_info_cols = info_cols[:-2]
            existing_group_cols = [
                c for c in filtered_info_cols if c in output_df.columns
            ]
            if not existing_group_cols:
                raise ValueError(
                    f"No valid group-by columns  "
                    f"Expected subset: {filtered_info_cols}, "
                    f"but DataFrame has: {list(output_df.columns)}"
                )
            output_df_sorted = output_df.sort_values(by=existing_group_cols)
            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
            for name, group in output_groups:
                html = group.to_html()
                text_file.write(html_msgs_for_data_cols[i])
                text_file.write(html)

                if plot and plotly_found:
                    import plotly.express as px

                    df = group[raw_data_cols]
                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
                    # Melt DataFrame for plotting
                    df_melted = df_sorted.melt(
                        id_vars=info_cols[y_axis_index],
                        var_name="Configuration",
                        value_name=data_cols_to_compare[i],
                    )
                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
                    # Create Plotly line chart
                    fig = px.line(
                        df_melted,
                        x=info_cols[y_axis_index],
                        y=data_cols_to_compare[i],
                        color="Configuration",
                        title=title,
                        markers=True,
                    )
                    # Export to HTML
                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
@ -1,19 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
import os
import shlex
from importlib import util
from pathlib import Path
from typing import Any

import pandas as pd
import psutil
import regex as re
from tabulate import tabulate

results_folder = Path("results/")

# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
@ -33,39 +28,28 @@ throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "num_requests": "# of req.",
    "total_num_tokens": "Total # of tokens",
    "elapsed_time": "Elapsed time (s)",
    # "num_requests": "# of req.",
    # "total_num_tokens": "Total # of tokens",
    # "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    "tokens_per_second": "Tput (tok/s)",
    # "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "model_id": "Model",
    "dataset_name": "Dataset Name",
    "input_len": "Input Len",
    "output_len": "Output Len",
    "tp_size": "TP Size",
    "pp_size": "PP Size",
    "dtype": "dtype",
    "gpu_type": "GPU",
    "completed": "# of req.",
    "qps": "qps",
    "max_concurrency": "# of max concurrency.",
    # "completed": "# of req.",
    "request_throughput": "Tput (req/s)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    # "total_input_tokens": "Total input tokens",
    # "total_output_tokens": "Total output tokens",
    # "input_throughput": "Input Tput (tok/s)",
    # "output_throughput": "Output Tput (tok/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    # "mean_tpot_ms": "Mean TPOT (ms)",
    # "median_tpot_ms": "Median",
    # "p99_tpot_ms": "P99",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
@ -81,134 +65,24 @@ def read_markdown(file):


def results_to_json(latency, throughput, serving):
    return json.dumps(
        {
            "latency": latency.to_dict(),
            "throughput": throughput.to_dict(),
            "serving": serving.to_dict(),
        }
    )


def get_size_with_unit(bytes, suffix="B"):
    """
    Scale bytes to its proper format
    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor


def _coerce(val: str) -> Any:
    """Best-effort type coercion from string to Python types."""
    low = val.lower()
    if low == "null":
        return None
    if low == "true":
        return True
    if low == "false":
        return False
    # integers
    if re.fullmatch(r"[+-]?\d+", val):
        try:
            return int(val)
        except ValueError:
            pass
    # floats (keep 'inf'/'-inf'/'nan' as strings)
    if re.fullmatch(r"[+-]?\d*\.\d+", val):
        try:
            return float(val)
        except ValueError:
            pass
    return val


def parse_client_command(cmd: str) -> dict[str, Any]:
    """Parse the client_command shell string into {executable, script, args}."""
    toks = shlex.split(cmd)
    if len(toks) < 2:
        raise ValueError("client_command must include an executable and a script")
    executable, script = toks[0], toks[1]
    args: dict[str, Any] = {}

    i = 2
    while i < len(toks):
        t = toks[i]
        if t.startswith("--"):
            # --key=value or --key (value) or boolean flag
            if "=" in t:
                key, val = t.split("=", 1)
                if key == "--metadata":
                    md = {}
                    if val:
                        if "=" in val:
                            k, v = val.split("=", 1)
                            md[k] = _coerce(v)
                        else:
                            md[val] = True
                    args[key] = md
                else:
                    args[key] = _coerce(val)
                i += 1
                continue

            key = t

            # Special: consume metadata k=v pairs until next --flag
            if key == "--metadata":
                i += 1
                md = {}
                while i < len(toks) and not toks[i].startswith("--"):
                    pair = toks[i]
                    if "=" in pair:
                        k, v = pair.split("=", 1)
                        md[k] = _coerce(v)
                    else:
                        md[pair] = True
                    i += 1
                args[key] = md
                continue

            # Standard: check if next token is a value (not a flag)
            if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
                args[key] = _coerce(toks[i + 1])
                i += 2
            else:
                # lone flag -> True
                args[key] = True
                i += 1
        else:
            # unexpected positional; skip
            i += 1

    return {"executable": executable, "script": script, "args": args}
    return json.dumps({
        'latency': latency.to_dict(),
        'throughput': throughput.to_dict(),
        'serving': serving.to_dict()
    })


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--result",
        type=str,
        default="results",
        help="Folder name for benchmark output results.",
    )
    args = parser.parse_args()
    results_folder = Path(args.result)
    if not results_folder.exists():
        raise FileNotFoundError(f"results folder does not exist: {results_folder}")

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file) as f:
            raw_result = json.loads(f.read())

        if "serving" in str(test_file):
            # this result is generated via `vllm bench serve` command
            # this result is generated via `benchmark_serving.py`

            # attach the benchmarking command to raw_result
            try:
                with open(test_file.with_suffix(".commands")) as f:
@ -216,50 +90,18 @@ if __name__ == "__main__":
            except OSError as e:
                print(e)
                continue
            # Parse Server Command Arg
            out: dict[str, Any] = {
                "server_command": parse_client_command(command["server_command"])
            }
            parse_args = [
                "--tensor-parallel-size",
                "--pipeline-parallel-size",
                "--dtype",
            ]
            col_mapping = ["tp_size", "pp_size", "dtype"]
            for index, arg in enumerate(parse_args):
                if arg in out["server_command"]["args"]:
                    raw_result.update(
                        {col_mapping[index]: out["server_command"]["args"][arg]}
                    )

            # Parse Client Command Arg
            out: dict[str, Any] = {
                "client_command": parse_client_command(command["client_command"])
            }
            parse_args = [
                "--dataset-name",
                "--random-input-len",
                "--random-output-len",
                "--request-rate",
            ]
            col_mapping = ["dataset_name", "input_len", "output_len", "qps"]

            for index, arg in enumerate(parse_args):
                if arg in out["client_command"]["args"]:
                    raw_result.update(
                        {col_mapping[index]: out["client_command"]["args"][arg]}
                    )
            # Add Server, Client command
            raw_result.update(command)

            # update the test name of this result
            raw_result.update({"test_name": test_file.stem})

            # add the result to raw_result
            serving_results.append(raw_result)
            continue

        elif "latency" in f.name:
            # this result is generated via `vllm bench latency` command
            # this result is generated via `benchmark_latency.py`

            # attach the benchmarking command to raw_result
            try:
@ -278,8 +120,7 @@ if __name__ == "__main__":
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
                )
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to raw_result
@ -287,7 +128,7 @@ if __name__ == "__main__":
            continue

        elif "throughput" in f.name:
            # this result is generated via `vllm bench throughput` command
            # this result is generated via `benchmark_throughput.py`

            # attach the benchmarking command to raw_result
            try:
@ -312,51 +153,26 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    svmem = psutil.virtual_memory()
    platform_data = {
        "Physical cores": [psutil.cpu_count(logical=False)],
        "Total cores": [psutil.cpu_count(logical=True)],
        "Total Memory": [get_size_with_unit(svmem.total)],
    }

    if util.find_spec("numa") is not None:
        from numa import info

        platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]

    if util.find_spec("cpuinfo") is not None:
        from cpuinfo import get_cpu_info

        platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]

    platform_results = pd.DataFrame.from_dict(
        platform_data, orient="index", columns=["Platform Info"]
    )

    raw_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )
    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # remapping the key, for visualization purpose
    if not latency_results.empty:
        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
            columns=latency_column_mapping
        )
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        valid_columns = [
            col for col in serving_column_mapping if col in serving_results.columns
        ]
        serving_results = serving_results[valid_columns].rename(
            columns=serving_column_mapping
        )
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[
            list(throughput_results_column_mapping.keys())
        ].rename(columns=throughput_results_column_mapping)
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )
    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
@ -368,45 +184,38 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
        )
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")

    # get markdown tables
    latency_md_table = tabulate(
        latency_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    throughput_md_table = tabulate(
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
    )
    platform_md_table = tabulate(
        platform_results, headers="keys", tablefmt="pipe", showindex=True
    )
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # document the result
    md_file = "benchmark_results.md"
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )
    with open(results_folder / "benchmark_results.md", "w") as f:

        results = read_markdown("../.buildkite/nightly-benchmarks/" +
                                "performance-benchmarks-descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            platform_markdown_table=platform_md_table,
            benchmarking_results_in_json_string=processed_results_json,
        )
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / json_file, "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
            + serving_results.to_dict(orient="records")
        )
    with open(results_folder / "benchmark_results.json", "w") as f:

        results = latency_results.to_dict(
            orient='records') + throughput_results.to_dict(
                orient='records') + serving_results.to_dict(orient='records')
        f.write(json.dumps(results))

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

@ -15,12 +14,15 @@ def main(model, cachedir):

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )
        description="Download and save Hugging Face tokenizer")
    parser.add_argument("--model",
                        type=str,
                        required=True,
                        help="Name of the model")
    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")

    args = parser.parse_args()
    main(args.model, args.cachedir)

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
@ -12,33 +11,33 @@ from tabulate import tabulate

def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Parse command line arguments for summary-nightly-results script."
    )
    parser.add_argument(
        "--results-folder",
        type=str,
        required=True,
        help="The folder where the results are stored.",
    )
    parser.add_argument(
        "--description", type=str, required=True, help="Description of the results."
    )
        description=
        'Parse command line arguments for summary-nightly-results script.')
    parser.add_argument('--results-folder',
                        type=str,
                        required=True,
                        help='The folder where the results are stored.')
    parser.add_argument('--description',
                        type=str,
                        required=True,
                        help='Description of the results.')

    args = parser.parse_args()
    return args


def get_perf(df, method, model, metric):

    means = []

    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        target = df['Test name'].str.contains(model)
        target = target & df['Engine'].str.contains(method)
        target = target & df['Test name'].str.contains("qps_" + str(qps))
        filtered_df = df[target]

        if filtered_df.empty:
            means.append(0.0)
            means.append(0.)
        else:
            means.append(filtered_df[metric].values[0])

@ -46,6 +45,7 @@ def get_perf(df, method, model, metric):


def get_perf_w_std(df, method, model, metric):

    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
@ -60,8 +60,7 @@ def get_perf_w_std(df, method, model, metric):
    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)"
        )
            df, method, model, "Output Tput (tok/s)")
        mean = mean.tolist()
        std = None

@ -81,17 +80,18 @@ def main(args):
    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)

    with open(args.description) as f:
        description = f.read()

    description = description.format(nightly_results_benchmarking_table=md_table)
    description = description.format(
        nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)


if __name__ == "__main__":
if __name__ == '__main__':
    args = parse_arguments()
    main(args)

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from lmdeploy.serve.openai.api_client import APIClient


@ -95,14 +95,12 @@ json2args() {
}

kill_gpu_processes() {
  pkill -f '[p]ython'
  pkill -f '[p]ython3'
  pkill -f '[t]ritonserver'
  pkill -f '[p]t_main_thread'
  pkill -f '[t]ext-generation'
  pkill -f '[l]mdeploy'
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pkill -f '[V]LLM'
  pkill -f python
  pkill -f python3
  pkill -f tritonserver
  pkill -f pt_main_thread
  pkill -f text-generation
  pkill -f lmdeploy

  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
@ -127,7 +125,7 @@ ensure_installed() {
}

run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
@ -227,7 +225,7 @@ run_serving_tests() {

      if [[ "$dataset_name" = "sharegpt" ]]; then

        client_command="vllm bench serve \
        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
@ -248,7 +246,7 @@ run_serving_tests() {
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

        client_command="vllm bench serve \
        client_command="python3 benchmark_serving.py \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
@ -382,7 +380,7 @@ run_genai_perf_tests() {
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
        --backend "$backend" \
        --backend vllm \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \

@ -31,20 +31,6 @@ check_gpus() {
  echo "GPU type is $gpu_type"
}

check_cpus() {
  # check the number of CPUs and NUMA Node and GPU type.
  declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
    echo $numa_count
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="cpu"
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
@ -83,22 +69,6 @@ json2args() {
  echo "$args"
}

json2envs() {
  # transforms the JSON string to environment variables.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map((.key ) + "=" + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
@ -126,8 +96,7 @@ kill_gpu_processes() {
  ps -aux
  lsof -t -i:8000 | xargs -r kill -9
  pgrep python3 | xargs -r kill -9
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pgrep VLLM | xargs -r kill -9


  # wait until GPU memory usage smaller than 1GB
  if command -v nvidia-smi; then
@ -165,7 +134,7 @@ upload_to_buildkite() {
}

run_latency_tests() {
  # run latency tests using `vllm bench latency` command
  # run latency tests using `benchmark_latency.py`
  # $1: a json file specifying latency test cases

  local latency_test_file
@ -189,26 +158,15 @@ run_latency_tests() {
    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")
    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    latency_envs=$(json2envs "$latency_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    latency_command=" $latency_envs vllm bench latency \
    latency_command="python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"

@ -234,7 +192,7 @@ run_latency_tests() {
}

run_throughput_tests() {
  # run throughput tests using `vllm bench throughput`
  # run throughput tests using `benchmark_throughput.py`
  # $1: a json file specifying throughput test cases

  local throughput_test_file
@ -258,26 +216,15 @@ run_throughput_tests() {
    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")
    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    throughput_envs=$(json2envs "$throughput_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    throughput_command=" $throughput_envs vllm bench throughput \
    throughput_command="python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"

@ -302,7 +249,7 @@ run_throughput_tests() {
}

run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # run serving tests using `benchmark_serving.py`
  # $1: a json file specifying serving test cases

  local serving_test_file
@ -325,36 +272,18 @@ run_serving_tests() {

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"
    max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
    if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
        num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
        max_concurrency_list="[$num_prompts]"
    fi
    max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
    echo "Running over max concurrency list $max_concurrency_list"

    # check if there is enough resources to run the test
    # check if there is enough GPU to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ]; then
      pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
      world_size=$(($tp*$pp))
      if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then
        echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    # check if server model and client model is aligned
@ -365,33 +294,23 @@ run_serving_tests() {
      continue
    fi

    server_command="$server_envs python3 \
    server_command="python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
    if [[ -z "${REMOTE_HOST}" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
      if wait_for_server; then
        echo ""
        echo "vLLM server is up and running."
      else
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
    bash -c "$server_command" &
    server_pid=$!

    # wait until the server is alive
    if wait_for_server; then
      echo ""
      echo "vllm server is up and running."
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
      else
        client_remote_args=" --host=$REMOTE_HOST "
      fi
      echo ""
      echo "vllm failed to start within the timeout period."
    fi

    # iterate over different QPS
@ -403,39 +322,35 @@ run_serving_tests() {
        echo "now qps is $qps"
      fi

      # iterate over different max_concurrency
      for max_concurrency in $max_concurrency_list; do
        new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
        echo " new test name $new_test_name"
        # pass the tensor parallel size to the client so that it can be displayed
        # on the benchmark dashboard
        client_command="vllm bench serve \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --max-concurrency $max_concurrency \
          --metadata "tensor_parallel_size=$tp" \
          $client_args $client_remote_args "
      new_test_name=$test_name"_qps_"$qps

        echo "Running test case $test_name with qps $qps"
        echo "Client command: $client_command"
      # pass the tensor parallel size to the client so that it can be displayed
      # on the benchmark dashboard
      client_command="python3 benchmark_serving.py \
        --save-result \
        --result-dir $RESULTS_FOLDER \
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        --metadata "tensor_parallel_size=$tp" \
        $client_args"

        bash -c "$client_command"
      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

        # record the benchmarking commands
        jq_output=$(jq -n \
          --arg server "$server_command" \
          --arg client "$client_command" \
          --arg gpu "$gpu_type" \
          '{
            server_command: $server,
            client_command: $client,
            gpu_type: $gpu
          }')
        echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
      bash -c "$client_command"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

      done
    done

    # clean up
@ -445,14 +360,7 @@ run_serving_tests() {
}

main() {
  local ARCH
  ARCH=''
  if [ "$ON_CPU" == "1" ];then
     check_cpus
     ARCH='-cpu'
  else
     check_gpus
  fi
  check_gpus
  check_hf_token

  # Set to v1 to run v1 benchmark
@ -465,7 +373,7 @@ main() {
  (which jq) || (apt-get update && apt-get -y install jq)
  (which lsof) || (apt-get update && apt-get install -y lsof)

  # get the current IP address, required by `vllm bench serve` command
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"
@ -478,9 +386,9 @@ main() {
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json

  # postprocess benchmarking results
  pip install tabulate pandas

@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import datetime
import json
@ -35,8 +34,10 @@ serving_column_mapping = {
}

if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file) as f:
            raw_result = json.loads(f.read())

@ -55,16 +56,17 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)

    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table_with_headers = tabulate(serving_results,
                                             headers='keys',
                                             tablefmt='pipe',
                                             showindex=False)
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
    serving_md_table_lines = serving_md_table_with_headers.split('\n')
    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@ -74,9 +76,10 @@ if __name__ == "__main__":
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")
        f.write('\n')

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")

        results = serving_results.to_dict(orient='records')
        f.write(json.dumps(results))

@ -11,7 +11,9 @@
        },
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },

@ -1,30 +0,0 @@
[
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
]
@ -35,7 +35,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
@ -88,7 +90,9 @@
        }, 
        "vllm_server_parameters": {
            "disable_log_stats": "",
            "disable_log_requests": "",
            "gpu_memory_utilization": 0.9,
            "num_scheduler_steps": 10,
            "max_num_seqs": 512,
            "dtype": "bfloat16"
        },
        },
 | 
			
		||||
@ -141,7 +145,9 @@
 | 
			
		||||
        }, 
 | 
			
		||||
        "vllm_server_parameters": {
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "disable_log_requests": "",
 | 
			
		||||
            "gpu_memory_utilization": 0.9,
 | 
			
		||||
            "num_scheduler_steps": 10,
 | 
			
		||||
            "max_num_seqs": 512,
 | 
			
		||||
            "dtype": "bfloat16"
 | 
			
		||||
        },
 | 
			
		||||
@ -191,7 +197,9 @@
 | 
			
		||||
        }, 
 | 
			
		||||
        "vllm_server_parameters": {
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "disable_log_requests": "",
 | 
			
		||||
            "gpu_memory_utilization": 0.9,
 | 
			
		||||
            "num_scheduler_steps": 10,
 | 
			
		||||
            "max_num_seqs": 512,
 | 
			
		||||
            "dtype": "bfloat16"
 | 
			
		||||
        },
 | 
			
		||||
@ -243,7 +251,9 @@
 | 
			
		||||
        }, 
 | 
			
		||||
        "vllm_server_parameters": {
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "disable_log_requests": "",
 | 
			
		||||
            "gpu_memory_utilization": 0.9,
 | 
			
		||||
            "num_scheduler_steps": 10,
 | 
			
		||||
            "max_num_seqs": 512,
 | 
			
		||||
            "dtype": "bfloat16"
 | 
			
		||||
        },
 | 
			
		||||
@ -295,7 +305,9 @@
 | 
			
		||||
        }, 
 | 
			
		||||
        "vllm_server_parameters": {
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "disable_log_requests": "",
 | 
			
		||||
            "gpu_memory_utilization": 0.9,
 | 
			
		||||
            "num_scheduler_steps": 10,
 | 
			
		||||
            "max_num_seqs": 512,
 | 
			
		||||
            "dtype": "bfloat16"
 | 
			
		||||
        },
 | 
			
		||||
 | 
			
		||||
@ -1,610 +0,0 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp1_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp2_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp4_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp1_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp2_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp4_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp1_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp2_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp4_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp1_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp2_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp4_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
@ -1,820 +0,0 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_pp1_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "pipeline_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_pp3_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_pp1_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "pipeline_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_pp3_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_pp1_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "pipeline_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp2_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_pp3_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_pp1_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "pipeline_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp2_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_pp3_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_pp1_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "pipeline_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp2_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_pp3_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_pp1_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "pipeline_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp2_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_pp3_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "pipeline_parallel_size": 3,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
@@ -1,168 +0,0 @@
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
        "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
        "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
        "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
        "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
    },
    {
        "test_name": "serving_llama8B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
        "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
        "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
    },
    {
        "test_name": "serving_llama8B_tp4_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
        "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
        "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", "num_prompts": 100}
    },
    {
        "test_name": "serving_llama8B_pp6_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
        "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "pipeline_parallel_size": 6, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
        "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 1024, "random-output-len": 128, "ignore-eos": "", "num_prompts": 100}
    }
]

@@ -7,6 +7,7 @@
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@@ -25,6 +26,7 @@
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@@ -43,6 +45,7 @@
            "tensor_parallel_size": 2,
            "swap_space": 16,
            "disable_log_stats": "",
            "disable_log_requests": "",
            "load_format": "dummy"
        },
        "client_parameters": {
@@ -58,6 +61,7 @@
        "qps_list": [2],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "disable_log_requests": "",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "speculative_config": {

@@ -1,32 +0,0 @@
[
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
        "parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 1, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm"}
    },
    {
        "test_name": "throughput_llama8B_tp4",
        "environment_variables": {"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
        "parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 4, "load_format": "dummy", "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm"}
    }
]

@@ -1,46 +0,0 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed

[tool.ruff]
line-length = 88

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.ruff.format]
docstring-code-format = true

@@ -1,130 +1,76 @@
steps:
  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build arm64 wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-arm64-cuda-12-9
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.8"
    depends_on: ~
    id: build-wheel-cuda-12-8
  - label: "Build wheel - CUDA 12.4"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.6"
    depends_on: ~
    id: build-wheel-cuda-12-6
  - label: "Build wheel - CUDA 12.1"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.9"
    depends_on: ~
    id: build-wheel-cuda-12-9
  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
  # However, this block can be uncommented to save some compute hours.
  # - block: "Build CUDA 11.8 wheel"
  #   key: block-build-cu118-wheel

  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build release image (x86)"
  - block: "Build release image"
    depends_on: ~
    id: build-release-image-x86
    key: block-release-image-build

  - label: "Build release image"
    depends_on: block-release-image-build
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build release image (arm64)"
    depends_on: ~
    id: build-release-image-arm64
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

  # Add job to create multi-arch manifest
  - label: "Create multi-arch manifest"
    depends_on:
      - build-release-image-x86
      - build-release-image-arm64
    id: create-multi-arch-manifest
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Annotate release workflow"
    depends_on:
      - create-multi-arch-manifest
      - build-wheel-cuda-12-8
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "bash .buildkite/scripts/annotate-release.sh"

  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: tpu_queue_postmerge
    commands:
      - "yes | docker system prune -a"
      - "git fetch --all"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
      - docker-login#v3.0.0:
          username: vllmbot
          username: vllm
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"

  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
        key: release-version
        key: "release-version"

  - block: "Build CPU release image"
    key: block-cpu-release-image-build
@@ -136,30 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build and publish nightly multi-arch image to DockerHub"
    depends_on:
      - create-multi-arch-manifest
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      - "docker push vllm/vllm-openai:nightly"
      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      # Clean up old nightly builds (keep only last 14)
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
    plugins:
      - docker-login#v3.0.0:
          username: vllmbot
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"

@@ -1,46 +0,0 @@
#!/bin/bash

set -ex

# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')

if [ -z "$RELEASE_VERSION" ]; then
  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
  exit 1
fi

buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .

aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`

To download and upload the image:

\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64

docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64

docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64

docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

@@ -1,17 +0,0 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.

# Check if argument is given
if [ $# -lt 1 ]; then
    echo "Usage: $0 ci.log"
    exit 1
fi

INPUT_FILE="$1"

# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"

# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"

@@ -1,97 +0,0 @@
#!/bin/bash

set -ex

# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix

# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

# Get DockerHub token from environment
if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
fi

# Function to get all tags from DockerHub
get_all_tags() {
    local page=1
    local all_tags=""

    while true; do
        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")

        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')

        if [ -z "$tags" ]; then
            break
        fi

        all_tags="$all_tags$tags"$'\n'
        page=$((page + 1))
    done

    # Sort by timestamp (newest first) and extract just the tag names
    echo "$all_tags" | sort -r | cut -d'|' -f2
}

delete_tag() {
    local tag_name="$1"
    echo "Deleting tag: $tag_name"

    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")

    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
    else
        echo "Successfully deleted tag: $tag_name"
    fi
}

# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
echo "Fetching all tags from DockerHub..."
all_tags=$(get_all_tags)

if [ -z "$all_tags" ]; then
    echo "No tags found to clean up"
    exit 0
fi

# Count total tags
total_tags=$(echo "$all_tags" | wc -l)
echo "Found $total_tags tags"

# Keep only the last 14 builds (including the current one)
tags_to_keep=14
tags_to_delete=$((total_tags - tags_to_keep))

if [ $tags_to_delete -le 0 ]; then
    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
    exit 0
fi

echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"

# Get tags to delete (skip the first $tags_to_keep tags)
tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))

if [ -z "$tags_to_delete_list" ]; then
    echo "No tags to delete"
    exit 0
fi

# Delete old tags
echo "Deleting old tags..."
while IFS= read -r tag; do
    if [ -n "$tag" ]; then
        delete_tag "$tag"
        # Add a small delay to avoid rate limiting
        sleep 1
    fi
done <<< "$tags_to_delete_list"

echo "Cleanup completed successfully"

@ -3,9 +3,6 @@
 | 
			
		||||
# This script runs test inside the corresponding ROCm docker container.
 | 
			
		||||
set -o pipefail
 | 
			
		||||
 | 
			
		||||
# Export Python path
 | 
			
		||||
export PYTHONPATH=".."
 | 
			
		||||
 | 
			
		||||
# Print ROCm version
 | 
			
		||||
echo "--- Confirming Clean Initial State"
 | 
			
		||||
while true; do
 | 
			
		||||
@ -77,66 +74,31 @@ HF_MOUNT="/root/.cache/huggingface"
 | 
			
		||||
 | 
			
		||||
commands=$@
 | 
			
		||||
echo "Commands:$commands"
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#ignore certain kernels tests
 | 
			
		||||
if [[ $commands == *" kernels/core"* ]]; then
 | 
			
		||||
if [[ $commands == *" kernels "* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/core/test_fused_quant_layernorm.py \
 | 
			
		||||
  --ignore=kernels/core/test_permute_cols.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/attention"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/attention/test_attention_selector.py \
 | 
			
		||||
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_flashinfer.py \
 | 
			
		||||
  --ignore=kernels/attention/test_prefix_prefill.py \
 | 
			
		||||
  --ignore=kernels/attention/test_cascade_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_mha_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_lightning_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_attention.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/quantization"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/quantization/test_int8_quant.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_machete_mm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_block_fp8.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_block_int8.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_marlin_gemm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_int8_kernel.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/mamba"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/mamba/test_mamba_mixer2.py \
 | 
			
		||||
  --ignore=kernels/mamba/test_causal_conv1d.py \
 | 
			
		||||
  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/moe"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/moe/test_moe.py \
 | 
			
		||||
  --ignore=kernels/moe/test_cutlass_moe.py \
 | 
			
		||||
  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
 | 
			
		||||
  --ignore=kernels/test_attention_selector.py \
 | 
			
		||||
  --ignore=kernels/test_blocksparse_attention.py \
 | 
			
		||||
  --ignore=kernels/test_causal_conv1d.py \
 | 
			
		||||
  --ignore=kernels/test_cutlass.py \
 | 
			
		||||
  --ignore=kernels/test_encoder_decoder_attn.py \
 | 
			
		||||
  --ignore=kernels/test_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/test_flashinfer.py \
 | 
			
		||||
  --ignore=kernels/test_int8_quant.py \
 | 
			
		||||
  --ignore=kernels/test_machete_gemm.py \
 | 
			
		||||
  --ignore=kernels/test_mamba_ssm.py \
 | 
			
		||||
  --ignore=kernels/test_marlin_gemm.py \
 | 
			
		||||
  --ignore=kernels/test_moe.py \
 | 
			
		||||
  --ignore=kernels/test_prefix_prefill.py \
 | 
			
		||||
  --ignore=kernels/test_rand.py \
 | 
			
		||||
  --ignore=kernels/test_sampler.py \
 | 
			
		||||
  --ignore=kernels/test_cascade_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/test_mamba_mixer2.py \
 | 
			
		||||
  --ignore=kernels/test_aqlm.py \
 | 
			
		||||
  --ignore=kernels/test_machete_mm.py \
 | 
			
		||||
  --ignore=kernels/test_mha_attn.py \
 | 
			
		||||
  --ignore=kernels/test_block_fp8.py \
 | 
			
		||||
  --ignore=kernels/test_permute_cols.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# Ignore certain entrypoints/openai tests
 | 
			
		||||
@ -160,9 +122,16 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
 | 
			
		||||
  --ignore=entrypoints/llm/test_chat.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_accuracy.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_init.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#Obsolete currently
 | 
			
		||||
##ignore certain Entrypoints/llm tests
 | 
			
		||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
 | 
			
		||||
#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 | 
			
		||||
#fi
 | 
			
		||||
 | 
			
		||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
 | 
			
		||||
# --ignore=entrypoints/openai/test_embedding.py \
 | 
			
		||||
# --ignore=entrypoints/openai/test_oot_registration.py
 | 
			
		||||
@ -171,8 +140,6 @@ fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
PARALLEL_JOB_COUNT=8
 | 
			
		||||
MYPYTHONPATH=".."
 | 
			
		||||
 | 
			
		||||
# Check if the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs.
 | 
			
		||||
if [[ $commands == *"--shard-id="* ]]; then
 | 
			
		||||
  # Assign the job count to the number of shards used
 | 
			
		||||
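  # (Hypothetical sketch only, not the elided script body: one way to derive the
  # parallel job count from an assumed "--num-shards=N" flag and fan the work
  # out as one shard per GPU. The flag name and echo placeholder are assumptions.)
  # if [[ $commands =~ --num-shards=([0-9]+) ]]; then
  #   PARALLEL_JOB_COUNT=${BASH_REMATCH[1]}
  # fi
  # for GPU in $(seq 0 $((PARALLEL_JOB_COUNT - 1))); do
  #   commands_gpu="${commands} --shard-id=${GPU}"   # assumed per-shard flag
  #   echo "launch shard ${GPU}: ${commands_gpu}"    # placeholder for the docker run
  # done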
@ -193,7 +160,6 @@ if [[ $commands == *"--shard-id="* ]]; then
 | 
			
		||||
        -e AWS_SECRET_ACCESS_KEY \
 | 
			
		||||
        -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
        -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
        -e "PYTHONPATH=${MYPYTHONPATH}" \
 | 
			
		||||
        --name "${container_name}_${GPU}" \
 | 
			
		||||
        "${image_name}" \
 | 
			
		||||
        /bin/bash -c "${commands_gpu}" \
 | 
			
		||||
@ -224,7 +190,6 @@ else
 | 
			
		||||
          -e AWS_SECRET_ACCESS_KEY \
 | 
			
		||||
          -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
          -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
          -e "PYTHONPATH=${MYPYTHONPATH}" \
 | 
			
		||||
          --name "${container_name}" \
 | 
			
		||||
          "${image_name}" \
 | 
			
		||||
          /bin/bash -c "${commands}"
 | 
			
		||||
 | 
			
		||||
@ -5,13 +5,7 @@
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() {
 | 
			
		||||
  if [[ -n "$container_id" ]]; then
 | 
			
		||||
      podman stop --all -t0
 | 
			
		||||
      podman rm -f "$container_id" || true
 | 
			
		||||
  fi
 | 
			
		||||
  podman system prune -f
 | 
			
		||||
}
 | 
			
		||||
remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
@ -19,31 +13,26 @@ remove_docker_container
 | 
			
		||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
 | 
			
		||||
 | 
			
		||||
# Run the image
 | 
			
		||||
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
 | 
			
		||||
podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
 | 
			
		||||
 | 
			
		||||
function cpu_tests() {
 | 
			
		||||
 | 
			
		||||
  # offline inference
 | 
			
		||||
  podman exec -it "$container_id" bash -c "
 | 
			
		||||
  podman exec cpu-test-ubi9-ppc bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run basic model test
 | 
			
		||||
  podman exec -it "$container_id" bash -c "
 | 
			
		||||
  podman exec cpu-test-ubi9-ppc bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
 | 
			
		||||
    pip install sentence-transformers datamodel_code_generator
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
 | 
			
		||||
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
 | 
			
		||||
    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 | 
			
		||||
    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
 | 
			
		||||
    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
 | 
			
		||||
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# All CPU tests are expected to finish in less than 40 minutes.
 | 
			
		||||
 | 
			
		||||
export container_id
 | 
			
		||||
export -f cpu_tests
 | 
			
		||||
timeout 40m bash -c cpu_tests
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -6,114 +6,89 @@ set -ex
 | 
			
		||||
 | 
			
		||||
# allow to bind to different cores
 | 
			
		||||
CORE_RANGE=${CORE_RANGE:-48-95}
 | 
			
		||||
# used for TP/PP E2E test
 | 
			
		||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 | 
			
		||||
NUMA_NODE=${NUMA_NODE:-1}
 | 
			
		||||
 | 
			
		||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    set -e; 
 | 
			
		||||
    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
 | 
			
		||||
    docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; 
 | 
			
		||||
    docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true; 
 | 
			
		||||
}
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 | 
			
		||||
 | 
			
		||||
# Run the image, setting --shm-size=4g for tensor parallel.
 | 
			
		||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
 | 
			
		||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 | 
			
		||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
 | 
			
		||||
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 | 
			
		||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
 | 
			
		||||
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 | 
			
		||||
 | 
			
		||||
function cpu_tests() {
 | 
			
		||||
  set -e
 | 
			
		||||
  export NUMA_NODE=$2
 | 
			
		||||
 | 
			
		||||
  # list packages
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip list"
 | 
			
		||||
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip list"
 | 
			
		||||
  export BUILDKITE_BUILD_NUMBER=$3
 | 
			
		||||
 | 
			
		||||
  # offline inference
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run kernel tests
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -x -v -s tests/kernels/test_onednn.py"
 | 
			
		||||
 | 
			
		||||
  # Run basic model test
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    # Note: disabled until V1 is supported
 | 
			
		||||
    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
 | 
			
		||||
    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
 | 
			
		||||
 | 
			
		||||
    pytest -x -v -s tests/models/language/generation -m cpu_model
 | 
			
		||||
    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
 | 
			
		||||
 | 
			
		||||
    pytest -x -v -s tests/models/language/pooling -m cpu_model
 | 
			
		||||
    pytest -x -v -s tests/models/multimodal/generation \
 | 
			
		||||
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
 | 
			
		||||
                -m cpu_model"
 | 
			
		||||
    pytest -v -s tests/kernels/test_cache.py -m cpu_model
 | 
			
		||||
    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/decoder_only/language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/embedding/language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 | 
			
		||||
 | 
			
		||||
  # Run compressed-tensor test
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -x -s -v \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 | 
			
		||||
 | 
			
		||||
  # Note: disabled until V1 is supported
 | 
			
		||||
  # Run AWQ test
 | 
			
		||||
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
  #   set -e
 | 
			
		||||
  #   VLLM_USE_V1=0 pytest -x -s -v \
 | 
			
		||||
  #   tests/quantization/test_ipex_quant.py"
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/quantization/test_ipex_quant.py"
 | 
			
		||||
 | 
			
		||||
  # Run chunked-prefill and prefix-cache test
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v -k cpu_model \
 | 
			
		||||
    tests/basic_correctness/test_chunked_prefill.py"  
 | 
			
		||||
 | 
			
		||||
  # online serving
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    export VLLM_CPU_KVCACHE_SPACE=10 
 | 
			
		||||
    export VLLM_CPU_OMP_THREADS_BIND=$1
 | 
			
		||||
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
 | 
			
		||||
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 | 
			
		||||
    python3 benchmarks/benchmark_serving.py \
 | 
			
		||||
      --backend vllm \
 | 
			
		||||
      --dataset-name random \
 | 
			
		||||
      --model facebook/opt-125m \
 | 
			
		||||
      --num-prompts 20 \
 | 
			
		||||
      --endpoint /v1/completions \
 | 
			
		||||
      --tokenizer facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run multi-lora tests
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -x -s -v \
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/lora/test_qwen2vl.py"
 | 
			
		||||
 | 
			
		||||
  # online serving: tp+pp
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c '
 | 
			
		||||
    set -e
 | 
			
		||||
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
 | 
			
		||||
    server_pid=$!
 | 
			
		||||
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
 | 
			
		||||
    vllm bench serve \
 | 
			
		||||
      --backend vllm \
 | 
			
		||||
      --dataset-name random \
 | 
			
		||||
      --model meta-llama/Llama-3.2-3B-Instruct \
 | 
			
		||||
      --num-prompts 20 \
 | 
			
		||||
      --endpoint /v1/completions
 | 
			
		||||
    kill -s SIGTERM $server_pid &'
 | 
			
		||||
 | 
			
		||||
  # online serving: tp+dp
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c '
 | 
			
		||||
    set -e
 | 
			
		||||
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
 | 
			
		||||
    server_pid=$!
 | 
			
		||||
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
 | 
			
		||||
    vllm bench serve \
 | 
			
		||||
      --backend vllm \
 | 
			
		||||
      --dataset-name random \
 | 
			
		||||
      --model meta-llama/Llama-3.2-3B-Instruct \
 | 
			
		||||
      --num-prompts 20 \
 | 
			
		||||
      --endpoint /v1/completions
 | 
			
		||||
    kill -s SIGTERM $server_pid &'
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# All CPU tests are expected to finish in less than 40 minutes.
 | 
			
		||||
export -f cpu_tests
 | 
			
		||||
timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
 | 
			
		||||
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
 | 
			
		||||
 | 
			
		||||
@ -16,7 +16,8 @@ DOCKER_BUILDKIT=1 docker build . \
 | 
			
		||||
  --build-arg max_jobs=66 \
 | 
			
		||||
  --build-arg nvcc_threads=2 \
 | 
			
		||||
  --build-arg RUN_WHEEL_CHECK=false \
 | 
			
		||||
  --build-arg torch_cuda_arch_list="9.0+PTX"
 | 
			
		||||
  --build-arg torch_cuda_arch_list="9.0+PTX" \
 | 
			
		||||
  --build-arg vllm_fa_cmake_gpu_arches="90-real"
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() { docker rm -f gh200-test || true; }
 | 
			
		||||
 | 
			
		||||
@ -2,55 +2,23 @@
 | 
			
		||||
 | 
			
		||||
# This script builds the CPU docker image and runs the offline inference inside the container.
 | 
			
		||||
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -exuo pipefail
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
 | 
			
		||||
FROM gaudi-base-image:latest
 | 
			
		||||
 | 
			
		||||
COPY ./ /workspace/vllm
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace/vllm
 | 
			
		||||
 | 
			
		||||
ENV no_proxy=localhost,127.0.0.1
 | 
			
		||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
 | 
			
		||||
 | 
			
		||||
RUN VLLM_TARGET_DEVICE=empty pip install .
 | 
			
		||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
 | 
			
		||||
 | 
			
		||||
# install development dependencies (for testing)
 | 
			
		||||
RUN python3 -m pip install -e tests/vllm_test_utils
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace/
 | 
			
		||||
 | 
			
		||||
RUN git clone https://github.com/vllm-project/vllm-gaudi.git
 | 
			
		||||
 | 
			
		||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 | 
			
		||||
 | 
			
		||||
EOF
 | 
			
		||||
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
# certain versions of HPU software stack have a bug that can
 | 
			
		||||
# override the exit code of the script, so we need to use
 | 
			
		||||
# separate remove_docker_containers and remove_docker_containers_and_exit
 | 
			
		||||
# separate remove_docker_container and remove_docker_container_and_exit
 | 
			
		||||
# functions, while other platforms only need one remove_docker_container
 | 
			
		||||
# function.
 | 
			
		||||
EXITCODE=1
 | 
			
		||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
 | 
			
		||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 | 
			
		||||
remove_docker_containers
 | 
			
		||||
 | 
			
		||||
echo "Running HPU plugin v1 test"
 | 
			
		||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
 | 
			
		||||
  -e HABANA_VISIBLE_DEVICES=all \
 | 
			
		||||
  hpu-plugin-v1-test-env \
 | 
			
		||||
  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
 | 
			
		||||
remove_docker_container() { docker rm -f hpu-test || true; }
 | 
			
		||||
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
 | 
			
		||||
trap remove_docker_container_and_exit EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Run the image and launch offline inference
 | 
			
		||||
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 | 
			
		||||
EXITCODE=$?
 | 
			
		||||
if [ $EXITCODE -eq 0 ]; then
 | 
			
		||||
  echo "Test with basic model passed"
 | 
			
		||||
else
 | 
			
		||||
  echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# The trap will handle the container removal and final exit.
 | 
			
		||||
							
								
								
									
.buildkite/scripts/hardware_ci/run-neuron-test.sh (new file, 54 lines)
@ -0,0 +1,54 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the Neuron docker image and runs the API server inside the container.
 | 
			
		||||
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -e
 | 
			
		||||
set -v
 | 
			
		||||
 | 
			
		||||
image_name="neuron/vllm-ci"
 | 
			
		||||
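# tr strips /dev/urandom down to alphanumerics and head keeps the first 10
# characters, so each CI run gets a unique container name suffix.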
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 | 
			
		||||
 | 
			
		||||
HF_CACHE="$(realpath ~)/huggingface"
 | 
			
		||||
mkdir -p "${HF_CACHE}"
 | 
			
		||||
HF_MOUNT="/root/.cache/huggingface"
 | 
			
		||||
 | 
			
		||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 | 
			
		||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 | 
			
		||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
 | 
			
		||||
 | 
			
		||||
# Prune old images and containers to save disk space, but only once a day
 | 
			
		||||
# by using a timestamp file in tmp.
 | 
			
		||||
if [ -f /tmp/neuron-docker-build-timestamp ]; then
 | 
			
		||||
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
 | 
			
		||||
    current_time=$(date +%s)
 | 
			
		||||
    if [ $((current_time - last_build)) -gt 86400 ]; then
 | 
			
		||||
        # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
        docker image prune -f
 | 
			
		||||
        # Remove unused volumes / force the system prune for old images as well.
 | 
			
		||||
        docker volume prune -f && docker system prune -f
 | 
			
		||||
        echo "$current_time" > /tmp/neuron-docker-build-timestamp
 | 
			
		||||
    fi
 | 
			
		||||
else
 | 
			
		||||
    date "+%s" > /tmp/neuron-docker-build-timestamp
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() {
 | 
			
		||||
    docker image rm -f "${image_name}" || true;
 | 
			
		||||
}
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Run the image
 | 
			
		||||
docker run --rm -it --device=/dev/neuron0 --network bridge \
 | 
			
		||||
       -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
       -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
 | 
			
		||||
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
 | 
			
		||||
       --name "${container_name}" \
 | 
			
		||||
       ${image_name} \
 | 
			
		||||
       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
 | 
			
		||||
@ -1,167 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -xu
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    docker rm -f tpu-test || true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Build the docker image.
 | 
			
		||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 | 
			
		||||
 | 
			
		||||
# Set up cleanup.
 | 
			
		||||
cleanup_docker() {
 | 
			
		||||
  # Get Docker's root directory
 | 
			
		||||
  docker_root=$(docker info -f '{{.DockerRootDir}}')
 | 
			
		||||
  if [ -z "$docker_root" ]; then
 | 
			
		||||
    echo "Failed to determine Docker root directory."
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
  echo "Docker root directory: $docker_root"
 | 
			
		||||
  # Check disk usage of the filesystem where Docker's root directory is located
 | 
			
		||||
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
 | 
			
		||||
  # Define the threshold
 | 
			
		||||
  threshold=70
 | 
			
		||||
  if [ "$disk_usage" -gt "$threshold" ]; then
 | 
			
		||||
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
 | 
			
		||||
    # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
    docker image prune -f
 | 
			
		||||
    # Remove unused volumes / force the system prune for old images as well.
 | 
			
		||||
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
 | 
			
		||||
    echo "Docker images and volumes cleanup completed."
 | 
			
		||||
  else
 | 
			
		||||
    echo "Disk usage is below $threshold%. No cleanup needed."
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
cleanup_docker
 | 
			
		||||
 | 
			
		||||
# For HF_TOKEN.
 | 
			
		||||
source /etc/environment
 | 
			
		||||
 | 
			
		||||
docker run --privileged --net host --shm-size=16G -it \
 | 
			
		||||
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
 | 
			
		||||
    vllm-tpu /bin/bash -c '
 | 
			
		||||
set -e # Exit immediately if a command exits with a non-zero status.
 | 
			
		||||
set -u # Treat unset variables as an error.
 | 
			
		||||
 | 
			
		||||
echo "--- Starting script inside Docker container ---"
 | 
			
		||||
 | 
			
		||||
# Create results directory
 | 
			
		||||
RESULTS_DIR=$(mktemp -d)
 | 
			
		||||
# If mktemp fails, set -e will cause the script to exit.
 | 
			
		||||
echo "Results will be stored in: $RESULTS_DIR"
 | 
			
		||||
 | 
			
		||||
# Install dependencies
 | 
			
		||||
echo "--- Installing Python dependencies ---"
 | 
			
		||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
 | 
			
		||||
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
 | 
			
		||||
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
 | 
			
		||||
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 | 
			
		||||
echo "--- Python dependencies installed ---"
 | 
			
		||||
export VLLM_USE_V1=1
 | 
			
		||||
export VLLM_XLA_CHECK_RECOMPILATION=1
 | 
			
		||||
export VLLM_XLA_CACHE_PATH=
 | 
			
		||||
echo "Using VLLM V1"
 | 
			
		||||
 | 
			
		||||
echo "--- Hardware Information ---"
 | 
			
		||||
# tpu-info
 | 
			
		||||
echo "--- Starting Tests ---"
 | 
			
		||||
set +e
 | 
			
		||||
overall_script_exit_code=0
 | 
			
		||||
 | 
			
		||||
# --- Test Definitions ---
 | 
			
		||||
# If a test fails, this function will print logs and will not cause the main script to exit.
 | 
			
		||||
run_test() {
 | 
			
		||||
    local test_num=$1
 | 
			
		||||
    local test_name=$2
 | 
			
		||||
    local test_command=$3
 | 
			
		||||
    local log_file="$RESULTS_DIR/test_${test_num}.log"
 | 
			
		||||
    local actual_exit_code
 | 
			
		||||
 | 
			
		||||
    echo "--- TEST_$test_num: Running $test_name ---"
 | 
			
		||||
    
 | 
			
		||||
    # Execute the test command.
 | 
			
		||||
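    # Process substitution: stdout and stderr are each teed into the per-test
    # log while still reaching their original streams, so the live CI output and
    # the saved log stay in sync.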
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
 | 
			
		||||
    actual_exit_code=$?
 | 
			
		||||
 | 
			
		||||
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
 | 
			
		||||
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
 | 
			
		||||
 | 
			
		||||
    if [ "$actual_exit_code" -ne 0 ]; then
 | 
			
		||||
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
 | 
			
		||||
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
 | 
			
		||||
        if [ -f "$log_file" ]; then
 | 
			
		||||
            cat "$log_file" >&2
 | 
			
		||||
        else
 | 
			
		||||
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
 | 
			
		||||
        fi
 | 
			
		||||
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
 | 
			
		||||
        return "$actual_exit_code" # Return the failure code
 | 
			
		||||
    else
 | 
			
		||||
        echo "TEST_$test_num ($test_name) PASSED."
 | 
			
		||||
        return 0 # Return success
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Helper function to call run_test and update the overall script exit code
 | 
			
		||||
run_and_track_test() {
 | 
			
		||||
    local test_num_arg="$1"
 | 
			
		||||
    local test_name_arg="$2"
 | 
			
		||||
    local test_command_arg="$3"
 | 
			
		||||
 | 
			
		||||
    # Run the test
 | 
			
		||||
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
 | 
			
		||||
    local test_specific_exit_code=$?
 | 
			
		||||
 | 
			
		||||
    # If the test failed, set the overall script exit code to 1
 | 
			
		||||
    if [ "$test_specific_exit_code" -ne 0 ]; then
 | 
			
		||||
        # No need for extra echo here, run_test already logged the failure.
 | 
			
		||||
        overall_script_exit_code=1
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# --- Actual Test Execution ---
 | 
			
		||||
run_and_track_test 1 "test_struct_output_generate.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 | 
			
		||||
run_and_track_test 2 "test_moe_pallas.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 | 
			
		||||
run_and_track_test 3 "test_lora.py" \
 | 
			
		||||
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
 | 
			
		||||
run_and_track_test 4 "test_tpu_qkv_linear.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 | 
			
		||||
run_and_track_test 5 "test_spmd_model_weight_loading.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 | 
			
		||||
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
 | 
			
		||||
run_and_track_test 7 "test_tpu_int8.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
 | 
			
		||||
 | 
			
		||||
# After all tests have been attempted, exit with the overall status.
 | 
			
		||||
if [ "$overall_script_exit_code" -ne 0 ]; then
 | 
			
		||||
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
 | 
			
		||||
else
 | 
			
		||||
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
 | 
			
		||||
fi
 | 
			
		||||
exit "$overall_script_exit_code"
 | 
			
		||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
 | 
			
		||||
 | 
			
		||||
# Capture the exit code of the docker run command
 | 
			
		||||
DOCKER_RUN_EXIT_CODE=$?
 | 
			
		||||
 | 
			
		||||
# The trap will run for cleanup.
 | 
			
		||||
# Exit the main script with the Docker run command's exit code.
 | 
			
		||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
 | 
			
		||||
    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
 | 
			
		||||
    exit "$DOCKER_RUN_EXIT_CODE"
 | 
			
		||||
else
 | 
			
		||||
    echo "Docker run command completed successfully."
 | 
			
		||||
    exit 0
 | 
			
		||||
fi
 | 
			
		||||
# TODO: This test fails because it uses RANDOM_SEED sampling
 | 
			
		||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 | 
			
		||||
@ -1,175 +1,49 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -xu
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    docker rm -f tpu-test || true; 
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
set -xue
 | 
			
		||||
 | 
			
		||||
# Build the docker image.
 | 
			
		||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 | 
			
		||||
 | 
			
		||||
# Set up cleanup.
 | 
			
		||||
cleanup_docker() {
 | 
			
		||||
  # Get Docker's root directory
 | 
			
		||||
  docker_root=$(docker info -f '{{.DockerRootDir}}')
 | 
			
		||||
  if [ -z "$docker_root" ]; then
 | 
			
		||||
    echo "Failed to determine Docker root directory."
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
  echo "Docker root directory: $docker_root"
 | 
			
		||||
  # Check disk usage of the filesystem where Docker's root directory is located
 | 
			
		||||
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
 | 
			
		||||
  # Define the threshold
 | 
			
		||||
  threshold=70
 | 
			
		||||
  if [ "$disk_usage" -gt "$threshold" ]; then
 | 
			
		||||
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
 | 
			
		||||
    # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
    docker image prune -f
 | 
			
		||||
    # Remove unused volumes / force the system prune for old images as well.
 | 
			
		||||
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
 | 
			
		||||
    echo "Docker images and volumes cleanup completed."
 | 
			
		||||
  else
 | 
			
		||||
    echo "Disk usage is below $threshold%. No cleanup needed."
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
cleanup_docker
 | 
			
		||||
remove_docker_container() { docker rm -f tpu-test || true; }
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# For HF_TOKEN.
 | 
			
		||||
source /etc/environment
 | 
			
		||||
 | 
			
		||||
# Run a simple end-to-end example.
 | 
			
		||||
docker run --privileged --net host --shm-size=16G -it \
 | 
			
		||||
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
 | 
			
		||||
    vllm-tpu /bin/bash -c '
 | 
			
		||||
set -e # Exit immediately if a command exits with a non-zero status.
 | 
			
		||||
set -u # Treat unset variables as an error.
 | 
			
		||||
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
 | 
			
		||||
    && python3 -m pip install pytest tpu-info \
 | 
			
		||||
    && python3 -m pip install lm_eval[api]==0.4.4 \
 | 
			
		||||
    && export VLLM_USE_V1=1 \
 | 
			
		||||
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
 | 
			
		||||
    && echo HARDWARE \
 | 
			
		||||
    && tpu-info \
 | 
			
		||||
    && echo TEST_0 \
 | 
			
		||||
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
 | 
			
		||||
    && echo TEST_1 \
 | 
			
		||||
    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
 | 
			
		||||
    && echo TEST_2 \
 | 
			
		||||
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
 | 
			
		||||
    && echo TEST_3 \
 | 
			
		||||
    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
 | 
			
		||||
    && echo TEST_4 \
 | 
			
		||||
    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
 | 
			
		||||
    && echo TEST_5 \
 | 
			
		||||
    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
 | 
			
		||||
    && echo TEST_6 \
 | 
			
		||||
    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
 | 
			
		||||
    && echo TEST_7 \
 | 
			
		||||
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
 | 
			
		||||
    && echo TEST_8 \
 | 
			
		||||
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
 | 
			
		||||
    && echo TEST_9 \
 | 
			
		||||
    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
 | 
			
		||||
 | 
			
		||||
echo "--- Starting script inside Docker container ---"
 | 
			
		||||
 | 
			
		||||
# Create results directory
 | 
			
		||||
RESULTS_DIR=$(mktemp -d)
 | 
			
		||||
# If mktemp fails, set -e will cause the script to exit.
 | 
			
		||||
echo "Results will be stored in: $RESULTS_DIR"
 | 
			
		||||
 | 
			
		||||
# Install dependencies
 | 
			
		||||
echo "--- Installing Python dependencies ---"
 | 
			
		||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
 | 
			
		||||
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
 | 
			
		||||
    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
 | 
			
		||||
    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 | 
			
		||||
echo "--- Python dependencies installed ---"
 | 
			
		||||
export VLLM_USE_V1=1
 | 
			
		||||
export VLLM_XLA_CHECK_RECOMPILATION=1
 | 
			
		||||
export VLLM_XLA_CACHE_PATH=
 | 
			
		||||
echo "Using VLLM V1"
 | 
			
		||||
 | 
			
		||||
echo "--- Hardware Information ---"
 | 
			
		||||
# tpu-info
 | 
			
		||||
echo "--- Starting Tests ---"
 | 
			
		||||
set +e
 | 
			
		||||
overall_script_exit_code=0
 | 
			
		||||
 | 
			
		||||
# --- Test Definitions ---
 | 
			
		||||
# If a test fails, this function will print logs and will not cause the main script to exit.
 | 
			
		||||
run_test() {
 | 
			
		||||
    local test_num=$1
 | 
			
		||||
    local test_name=$2
 | 
			
		||||
    local test_command=$3
 | 
			
		||||
    local log_file="$RESULTS_DIR/test_${test_num}.log"
 | 
			
		||||
    local actual_exit_code
 | 
			
		||||
 | 
			
		||||
    echo "--- TEST_$test_num: Running $test_name ---"
 | 
			
		||||
    
 | 
			
		||||
    # Execute the test command.
 | 
			
		||||
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
 | 
			
		||||
    actual_exit_code=$?
 | 
			
		||||
 | 
			
		||||
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
 | 
			
		||||
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
 | 
			
		||||
 | 
			
		||||
    if [ "$actual_exit_code" -ne 0 ]; then
 | 
			
		||||
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
 | 
			
		||||
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
 | 
			
		||||
        if [ -f "$log_file" ]; then
 | 
			
		||||
            cat "$log_file" >&2
 | 
			
		||||
        else
 | 
			
		||||
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
 | 
			
		||||
        fi
 | 
			
		||||
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
 | 
			
		||||
        return "$actual_exit_code" # Return the failure code
 | 
			
		||||
    else
 | 
			
		||||
        echo "TEST_$test_num ($test_name) PASSED."
 | 
			
		||||
        return 0 # Return success
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Helper function to call run_test and update the overall script exit code
 | 
			
		||||
run_and_track_test() {
 | 
			
		||||
    local test_num_arg="$1"
 | 
			
		||||
    local test_name_arg="$2"
 | 
			
		||||
    local test_command_arg="$3"
 | 
			
		||||
 | 
			
		||||
    # Run the test
 | 
			
		||||
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
 | 
			
		||||
    local test_specific_exit_code=$?
 | 
			
		||||
 | 
			
		||||
    # If the test failed, set the overall script exit code to 1
 | 
			
		||||
    if [ "$test_specific_exit_code" -ne 0 ]; then
 | 
			
		||||
        # No need for extra echo here, run_test already logged the failure.
 | 
			
		||||
        overall_script_exit_code=1
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# --- Actual Test Execution ---
 | 
			
		||||
run_and_track_test 0 "test_perf.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
 | 
			
		||||
run_and_track_test 1 "test_compilation.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
 | 
			
		||||
run_and_track_test 2 "test_basic.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
 | 
			
		||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
 | 
			
		||||
run_and_track_test 4 "test_quantization_accuracy.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
 | 
			
		||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
 | 
			
		||||
    "python3 /workspace/vllm/examples/offline_inference/tpu.py"
 | 
			
		||||
run_and_track_test 6 "test_tpu_model_runner.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
 | 
			
		||||
run_and_track_test 7 "test_sampler.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
 | 
			
		||||
run_and_track_test 8 "test_topk_topp_sampler.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
 | 
			
		||||
run_and_track_test 9 "test_multimodal.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
 | 
			
		||||
run_and_track_test 10 "test_pallas.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 | 
			
		||||
 | 
			
		||||
# After all tests have been attempted, exit with the overall status.
 | 
			
		||||
if [ "$overall_script_exit_code" -ne 0 ]; then
 | 
			
		||||
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
 | 
			
		||||
else
 | 
			
		||||
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
 | 
			
		||||
fi
 | 
			
		||||
exit "$overall_script_exit_code"
 | 
			
		||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
 | 
			
		||||
 | 
			
		||||
# Capture the exit code of the docker run command
 | 
			
		||||
DOCKER_RUN_EXIT_CODE=$?
 | 
			
		||||
 | 
			
		||||
# The trap will run for cleanup.
 | 
			
		||||
# Exit the main script with the Docker run command's exit code.
 | 
			
		||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
 | 
			
		||||
    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
 | 
			
		||||
    exit "$DOCKER_RUN_EXIT_CODE"
 | 
			
		||||
else
 | 
			
		||||
    echo "Docker run command completed successfully."
 | 
			
		||||
    exit 0
 | 
			
		||||
fi
 | 
			
		||||
# TODO: This test fails because it uses RANDOM_SEED sampling
 | 
			
		||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 | 
			
		||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 | 
			
		||||
 | 
			
		||||
@ -23,28 +23,9 @@ docker run \
 | 
			
		||||
    --device /dev/dri \
 | 
			
		||||
    -v /dev/dri/by-path:/dev/dri/by-path \
 | 
			
		||||
    --entrypoint="" \
 | 
			
		||||
    -e "HF_TOKEN=${HF_TOKEN}" \
 | 
			
		||||
    -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
 | 
			
		||||
    --name "${container_name}" \
 | 
			
		||||
    "${image_name}" \
 | 
			
		||||
    bash -c '
 | 
			
		||||
    set -e
 | 
			
		||||
    echo $ZE_AFFINITY_MASK
 | 
			
		||||
    pip install tblib==3.1.0
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
 | 
			
		||||
    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
 | 
			
		||||
    cd tests
 | 
			
		||||
    pytest -v -s v1/core
 | 
			
		||||
    pytest -v -s v1/engine
 | 
			
		||||
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
 | 
			
		||||
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
 | 
			
		||||
    pytest -v -s v1/structured_output
 | 
			
		||||
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
 | 
			
		||||
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
 | 
			
		||||
    pytest -v -s v1/test_serial_utils.py
 | 
			
		||||
    pytest -v -s v1/test_utils.py
 | 
			
		||||
    pytest -v -s v1/test_metrics_reader.py
 | 
			
		||||
    sh -c '
 | 
			
		||||
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 | 
			
		||||
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 | 
			
		||||
'
 | 
			
		||||
 | 
			
		||||
@ -1,18 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# Usage: ./rerun_test.sh path/to/test.py::test_name
 | 
			
		||||
 | 
			
		||||
# Check if argument is given
 | 
			
		||||
if [ $# -lt 1 ]; then
 | 
			
		||||
    echo "Usage: $0 path/to/test.py::test_name"
 | 
			
		||||
    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
TEST=$1
 | 
			
		||||
COUNT=1
 | 
			
		||||
 | 
			
		||||
while pytest -sv "$TEST"; do
 | 
			
		||||
    COUNT=$((COUNT + 1))
 | 
			
		||||
    echo "RUN NUMBER ${COUNT}"
 | 
			
		||||
done
 | 
			
		||||
@ -11,10 +11,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 | 
			
		||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 | 
			
		||||
 | 
			
		||||
# run python-based benchmarks and upload the result to buildkite
 | 
			
		||||
vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 | 
			
		||||
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 | 
			
		||||
bench_latency_exit_code=$?
 | 
			
		||||
 | 
			
		||||
vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 | 
			
		||||
python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 | 
			
		||||
bench_throughput_exit_code=$?
 | 
			
		||||
 | 
			
		||||
# run server-based benchmarks and upload the result to buildkite
 | 
			
		||||
@ -24,7 +24,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 | 
			
		||||
 | 
			
		||||
# wait for server to start, timeout after 600 seconds
 | 
			
		||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 | 
			
		||||
vllm bench serve \
 | 
			
		||||
python3 benchmarks/benchmark_serving.py \
 | 
			
		||||
    --backend vllm \
 | 
			
		||||
    --dataset-name sharegpt \
 | 
			
		||||
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
 | 
			
		||||
 | 
			
		||||
@ -1,59 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
# SPDX-License-Identifier: Apache-2.0
 | 
			
		||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 | 
			
		||||
 | 
			
		||||
# Setup script for Prime-RL integration tests
 | 
			
		||||
# This script prepares the environment for running Prime-RL tests with nightly vLLM
 | 
			
		||||
 | 
			
		||||
set -euo pipefail
 | 
			
		||||
 | 
			
		||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 | 
			
		||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
 | 
			
		||||
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
 | 
			
		||||
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"
 | 
			
		||||
 | 
			
		||||
echo "Setting up Prime-RL integration test environment..."
 | 
			
		||||
 | 
			
		||||
# Clean up any existing Prime-RL directory
 | 
			
		||||
if [ -d "${PRIME_RL_DIR}" ]; then
 | 
			
		||||
    echo "Removing existing Prime-RL directory..."
 | 
			
		||||
    rm -rf "${PRIME_RL_DIR}"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# Install UV if not available
 | 
			
		||||
if ! command -v uv &> /dev/null; then
 | 
			
		||||
    echo "Installing UV package manager..."
 | 
			
		||||
    curl -LsSf https://astral.sh/uv/install.sh | sh
 | 
			
		||||
    source $HOME/.local/bin/env
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# Clone Prime-RL repository at specific branch for reproducible tests
 | 
			
		||||
PRIME_RL_BRANCH="integ-vllm-main"
 | 
			
		||||
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
 | 
			
		||||
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
 | 
			
		||||
cd "${PRIME_RL_DIR}"
 | 
			
		||||
 | 
			
		||||
echo "Setting up UV project environment..."
 | 
			
		||||
export UV_PROJECT_ENVIRONMENT=/usr/local
 | 
			
		||||
ln -s /usr/bin/python3 /usr/local/bin/python
 | 
			
		||||
 | 
			
		||||
# Remove vllm pin from pyproject.toml
 | 
			
		||||
echo "Removing vllm pin from pyproject.toml..."
 | 
			
		||||
sed -i '/vllm==/d' pyproject.toml
 | 
			
		||||
 | 
			
		||||
# Sync Prime-RL dependencies
 | 
			
		||||
echo "Installing Prime-RL dependencies..."
 | 
			
		||||
uv sync --inexact && uv sync --inexact --all-extras
 | 
			
		||||
 | 
			
		||||
# Verify installation
 | 
			
		||||
echo "Verifying installations..."
 | 
			
		||||
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
 | 
			
		||||
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"
 | 
			
		||||
 | 
			
		||||
echo "Prime-RL integration test environment setup complete!"
 | 
			
		||||
 | 
			
		||||
echo "Running Prime-RL integration tests..."
 | 
			
		||||
export WANDB_MODE=offline # offline mode means this test does not require a WANDB_API_KEY
 | 
			
		||||
uv run pytest -vs tests/integration/test_rl.py -m gpu
 | 
			
		||||
 | 
			
		||||
echo "Prime-RL integration tests completed!"
 | 
			
		||||
@ -1,24 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -euo pipefail
 | 
			
		||||
 | 
			
		||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
 | 
			
		||||
if [ -z "$docker_root" ]; then
 | 
			
		||||
  echo "Failed to determine Docker root directory."
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
echo "Docker root directory: $docker_root"
 | 
			
		||||
# Check disk usage of the filesystem where Docker's root directory is located
 | 
			
		||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
 | 
			
		||||
# Define the threshold
 | 
			
		||||
threshold=70
 | 
			
		||||
if [ "$disk_usage" -gt "$threshold" ]; then
 | 
			
		||||
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
 | 
			
		||||
  # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
  docker image prune -f
 | 
			
		||||
  # Remove unused volumes / force the system prune for old images as well.
 | 
			
		||||
  docker volume prune -f && docker system prune --force --filter "until=24h" --all
 | 
			
		||||
  echo "Docker images and volumes cleanup completed."
 | 
			
		||||
else
 | 
			
		||||
  echo "Disk usage is below $threshold%. No cleanup needed."
 | 
			
		||||
fi
 | 
			
		||||
@ -1,14 +0,0 @@
 | 
			
		||||
# Environment config
 | 
			
		||||
TEST_NAME=llama8b
 | 
			
		||||
CONTAINER_NAME=tpu-test
 | 
			
		||||
 | 
			
		||||
# vllm config
 | 
			
		||||
MODEL=meta-llama/Llama-3.1-8B-Instruct
 | 
			
		||||
MAX_NUM_SEQS=256
 | 
			
		||||
MAX_NUM_BATCHED_TOKENS=1024
 | 
			
		||||
TENSOR_PARALLEL_SIZE=1
 | 
			
		||||
MAX_MODEL_LEN=2048
 | 
			
		||||
DOWNLOAD_DIR=/mnt/disks/persist
 | 
			
		||||
EXPECTED_THROUGHPUT=8.0
 | 
			
		||||
INPUT_LEN=1800
 | 
			
		||||
OUTPUT_LEN=128
 | 
			
		||||
@ -1,90 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
if [ ! -f "$1" ]; then
 | 
			
		||||
  echo "Error: The env file '$1' does not exist."
 | 
			
		||||
  exit 1  # Exit the script with a non-zero status to indicate an error
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
ENV_FILE=$1
 | 
			
		||||
 | 
			
		||||
# For testing on a local VM, use `set -a` to export all variables
 | 
			
		||||
source /etc/environment
 | 
			
		||||
source $ENV_FILE
 | 
			
		||||
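# Hedged sketch of the local-VM variant mentioned above: with `set -a`, every
# variable assigned by the sourced files is automatically exported, so child
# processes such as `docker run` inherit them without listing each one.
#   set -a
#   source /etc/environment
#   source "$ENV_FILE"
#   set +a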
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    docker rm -f $CONTAINER_NAME || true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
LOG_ROOT=$(mktemp -d)
 | 
			
		||||
# If mktemp fails, set -e will cause the script to exit.
 | 
			
		||||
echo "Results will be stored in: $LOG_ROOT"
 | 
			
		||||
 | 
			
		||||
if [ -z "$HF_TOKEN" ]; then
 | 
			
		||||
  echo "Error: HF_TOKEN is not set or is empty."  
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# Make sure mounted disk or dir exists
 | 
			
		||||
if [ ! -d "$DOWNLOAD_DIR" ]; then
 | 
			
		||||
    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
echo "Run model $MODEL"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
echo "starting docker...$CONTAINER_NAME"
 | 
			
		||||
echo    
 | 
			
		||||
docker run \
 | 
			
		||||
 -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
 | 
			
		||||
 --env-file $ENV_FILE \
 | 
			
		||||
 -e HF_TOKEN="$HF_TOKEN" \
 | 
			
		||||
 -e TARGET_COMMIT=$BUILDKITE_COMMIT \
 | 
			
		||||
 -e MODEL=$MODEL \
 | 
			
		||||
 -e WORKSPACE=/workspace \
 | 
			
		||||
 --name $CONTAINER_NAME \
 | 
			
		||||
 -d \
 | 
			
		||||
 --privileged \
 | 
			
		||||
 --network host \
 | 
			
		||||
 -v /dev/shm:/dev/shm \
 | 
			
		||||
 vllm/vllm-tpu-bm tail -f /dev/null
 | 
			
		||||
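A brief aside on the launch pattern above: the container is started detached (`-d`) with `tail -f /dev/null` as the command, so it simply idles until the benchmark is kicked off with `docker exec` below. The same pattern in isolation (image and container names are hypothetical):

  docker run -d --name demo-idle some-image tail -f /dev/null
  docker exec demo-idle /bin/bash -c 'echo "container is still up"'
  docker rm -f demo-idle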
 | 
			
		||||
echo "run script..."
 | 
			
		||||
echo
 | 
			
		||||
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
 | 
			
		||||
 | 
			
		||||
echo "copy result back..."
 | 
			
		||||
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
 | 
			
		||||
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
 | 
			
		||||
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" 
 | 
			
		||||
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
 | 
			
		||||
 | 
			
		||||
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
 | 
			
		||||
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
 | 
			
		||||
 | 
			
		||||
if [ "$BUILDKITE" = "true" ]; then
 | 
			
		||||
  echo "Running inside Buildkite"
 | 
			
		||||
  buildkite-agent artifact upload "$VLLM_LOG" 
 | 
			
		||||
  buildkite-agent artifact upload "$BM_LOG"
 | 
			
		||||
else
 | 
			
		||||
  echo "Not running inside Buildkite"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# compare the throughput with EXPECTED_THROUGHPUT 
 | 
			
		||||
# and assert meeting the expectation
 | 
			
		||||
# 
 | 
			
		||||
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
 | 
			
		||||
  echo "Failed to get the throughput"
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
 | 
			
		||||
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
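A note on the final check above: `bc -l` prints `1` when the inequality holds and `0` otherwise, and bash's arithmetic context `(( ... ))` treats a non-zero result as true, so the error branch runs exactly when the measured throughput falls below the target. A minimal sketch with hypothetical values:

  throughput=7.4
  EXPECTED_THROUGHPUT=8.0
  if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
    echo "below target"   # taken, since 7.4 < 8.0
  fi

The preceding regex guard (`^[0-9]+([.][0-9]+)?$`) matters because an empty or non-numeric value would make the `bc` comparison meaningless.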
@ -1,14 +0,0 @@
# Environment config
TEST_NAME=llama8bw8a8
CONTAINER_NAME=tpu-test

# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
MAX_NUM_SEQS=128
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=10.0
INPUT_LEN=1800
OUTPUT_LEN=128
@ -1,93 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -euo pipefail
 | 
			
		||||
 | 
			
		||||
VLLM_LOG="$WORKSPACE/vllm_log.txt"
 | 
			
		||||
BM_LOG="$WORKSPACE/bm_log.txt"
 | 
			
		||||
 | 
			
		||||
if [ -n "$TARGET_COMMIT" ]; then
 | 
			
		||||
  head_hash=$(git rev-parse HEAD)
 | 
			
		||||
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
 | 
			
		||||
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
echo "model: $MODEL"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# create a log folder
 | 
			
		||||
#
 | 
			
		||||
mkdir "$WORKSPACE/log"
 | 
			
		||||
 | 
			
		||||
# TODO: Move to image building.
 | 
			
		||||
pip install pandas
 | 
			
		||||
pip install datasets
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# create sonnet_4x
 | 
			
		||||
#
 | 
			
		||||
echo "Create sonnet_4x.txt"
 | 
			
		||||
echo "" > benchmarks/sonnet_4x.txt
 | 
			
		||||
for _ in {1..4}
 | 
			
		||||
 do
 | 
			
		||||
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# start vllm service in backend
 | 
			
		||||
#
 | 
			
		||||
echo "lanching vllm..."
 | 
			
		||||
echo "logging to $VLLM_LOG"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
VLLM_USE_V1=1 vllm serve $MODEL \
 | 
			
		||||
 --seed 42 \
 | 
			
		||||
 --max-num-seqs $MAX_NUM_SEQS \
 | 
			
		||||
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
 | 
			
		||||
 --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
 | 
			
		||||
 --no-enable-prefix-caching \
 | 
			
		||||
 --download_dir $DOWNLOAD_DIR \
 | 
			
		||||
 --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
echo "wait for 20 minutes.."
 | 
			
		||||
echo
 | 
			
		||||
# Instead of a fixed sleep 1200, poll the log for up to 20 minutes (120 x 10 s).
 | 
			
		||||
for i in {1..120}; do
 | 
			
		||||
    # TODO: detect other type of errors.
 | 
			
		||||
    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
 | 
			
		||||
        echo "Detected RuntimeError, exiting."
 | 
			
		||||
        exit 1
 | 
			
		||||
    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
 | 
			
		||||
        echo "Application started"
 | 
			
		||||
        break
 | 
			
		||||
    else
 | 
			
		||||
        echo "wait for 10 seconds..."
 | 
			
		||||
        sleep 10
 | 
			
		||||
    fi
 | 
			
		||||
done
 | 
			
		||||
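For clarity on the readiness loop above: `grep -F` treats the pattern as a fixed string and `-q` suppresses output, so only the exit status drives the branch. A small illustration against a throwaway log file (path is hypothetical):

  printf 'INFO:     Application startup complete.\n' > /tmp/demo_vllm_log.txt
  if grep -Fq "Application startup complete" /tmp/demo_vllm_log.txt; then
    echo "server is ready"
  fi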
 | 
			
		||||
#
 | 
			
		||||
# run test
 | 
			
		||||
#
 | 
			
		||||
echo "run benchmark test..."
 | 
			
		||||
echo "logging to $BM_LOG"
 | 
			
		||||
echo
 | 
			
		||||
vllm bench serve \
 | 
			
		||||
    --backend vllm \
 | 
			
		||||
    --model $MODEL  \
 | 
			
		||||
    --dataset-name sonnet \
 | 
			
		||||
    --dataset-path benchmarks/sonnet_4x.txt \
 | 
			
		||||
    --sonnet-input-len $INPUT_LEN \
 | 
			
		||||
    --sonnet-output-len $OUTPUT_LEN \
 | 
			
		||||
    --ignore-eos > "$BM_LOG"
 | 
			
		||||
 | 
			
		||||
echo "completed..."
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
 | 
			
		||||
echo "throughput: $throughput"
 | 
			
		||||
echo
 | 
			
		||||
@ -14,19 +14,8 @@ fi
 | 
			
		||||
# Get the single wheel file
 | 
			
		||||
wheel="${wheel_files[0]}"
 | 
			
		||||
 | 
			
		||||
# Detect architecture and rename 'linux' to appropriate manylinux version
 | 
			
		||||
arch=$(uname -m)
 | 
			
		||||
if [[ $arch == "x86_64" ]]; then
 | 
			
		||||
    manylinux_version="manylinux1"
 | 
			
		||||
elif [[ $arch == "aarch64" ]]; then
 | 
			
		||||
    manylinux_version="manylinux2014"
 | 
			
		||||
else
 | 
			
		||||
    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
 | 
			
		||||
    manylinux_version="manylinux1"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# Rename 'linux' to the appropriate manylinux version in the wheel filename
 | 
			
		||||
new_wheel="${wheel/linux/$manylinux_version}"
 | 
			
		||||
# Rename 'linux' to 'manylinux1' in the wheel filename
 | 
			
		||||
new_wheel="${wheel/linux/manylinux1}"
 | 
			
		||||
mv -- "$wheel" "$new_wheel"
 | 
			
		||||
wheel="$new_wheel"
 | 
			
		||||
 | 
			
		||||
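The rename above uses bash pattern substitution: `${wheel/linux/$manylinux_version}` replaces the first occurrence of `linux` in the filename with the chosen manylinux tag. A hypothetical example (the wheel name is illustrative only):

  wheel="vllm-0.11.0-cp38-abi3-linux_x86_64.whl"
  echo "${wheel/linux/manylinux1}"
  # vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl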
@ -58,15 +47,14 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 | 
			
		||||
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 | 
			
		||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 | 
			
		||||
 | 
			
		||||
if [[ $normal_wheel == *"cu126"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu126, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu126 wheels"
 | 
			
		||||
elif [[ $normal_wheel == *"cu128"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu128, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu128 wheels"
 | 
			
		||||
if [[ $normal_wheel == *"cu118"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu118, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu118 wheels"
 | 
			
		||||
elif [[ $normal_wheel == *"cu121"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu121, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu121 wheels"
 | 
			
		||||
else
 | 
			
		||||
    # only upload index.html for cu129 wheels (default wheels) as it
 | 
			
		||||
    # is available on both x86 and arm64
 | 
			
		||||
    # only upload index.html for cu124 wheels (default wheels)
 | 
			
		||||
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
 | 
			
		||||
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 | 
			
		||||
fi
 | 
			
		||||
@ -75,17 +63,15 @@ fi
 | 
			
		||||
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 | 
			
		||||
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 | 
			
		||||
 | 
			
		||||
if [[ $normal_wheel == *"cu126"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu126, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu126 wheels"
 | 
			
		||||
elif [[ $normal_wheel == *"cu128"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu128, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu128 wheels"
 | 
			
		||||
if [[ $normal_wheel == *"cu118"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu118, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu118 wheels"
 | 
			
		||||
elif [[ $normal_wheel == *"cu121"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu121, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu121 wheels"
 | 
			
		||||
else
 | 
			
		||||
    # only upload index.html for cu129 wheels (default wheels) as it
 | 
			
		||||
    # is available on both x86 and arm64
 | 
			
		||||
    # only upload index.html for cu124 wheels (default wheels)
 | 
			
		||||
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
 | 
			
		||||
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
 | 
			
		||||
 | 
			
		||||
										
											
(File diff suppressed because it is too large.)

.coveragerc (32 lines changed)
							@ -1,32 +0,0 @@
 | 
			
		||||
[run]
 | 
			
		||||
source = vllm
 | 
			
		||||
omit =
 | 
			
		||||
    */tests/*
 | 
			
		||||
    */test_*
 | 
			
		||||
    */__pycache__/*
 | 
			
		||||
    */build/*
 | 
			
		||||
    */dist/*
 | 
			
		||||
    */vllm.egg-info/*
 | 
			
		||||
    */third_party/*
 | 
			
		||||
    */examples/*
 | 
			
		||||
    */benchmarks/*
 | 
			
		||||
    */docs/*
 | 
			
		||||
 | 
			
		||||
[report]
 | 
			
		||||
exclude_lines =
 | 
			
		||||
    pragma: no cover
 | 
			
		||||
    def __repr__
 | 
			
		||||
    if self.debug:
 | 
			
		||||
    if settings.DEBUG
 | 
			
		||||
    raise AssertionError
 | 
			
		||||
    raise NotImplementedError
 | 
			
		||||
    if 0:
 | 
			
		||||
    if __name__ == .__main__.:
 | 
			
		||||
    class .*\bProtocol\):
 | 
			
		||||
    @(abc\.)?abstractmethod
 | 
			
		||||
 | 
			
		||||
[html]
 | 
			
		||||
directory = htmlcov
 | 
			
		||||
 | 
			
		||||
[xml]
 | 
			
		||||
output = coverage.xml
 | 
			
		||||
@ -1,6 +0,0 @@
 | 
			
		||||
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
 | 
			
		||||
have_fun: false  # Just review the code
 | 
			
		||||
code_review:
 | 
			
		||||
  comment_severity_threshold: HIGH  # Reduce quantity of comments
 | 
			
		||||
  pull_request_opened:
 | 
			
		||||
    summary: false  # Don't summarize the PR in a separate comment
 | 
			
		||||
							
								
								
									
.github/.bc-linter.yml (24 lines changed)
							@ -1,24 +0,0 @@
 | 
			
		||||
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
 | 
			
		||||
version: 1
 | 
			
		||||
paths:
 | 
			
		||||
# We temporarily disable globally, and will only enable with `annotations.include`
 | 
			
		||||
# include:
 | 
			
		||||
#   - "vllm/v1/attetion/*.py"
 | 
			
		||||
#   - "vllm/v1/core/*.py"
 | 
			
		||||
exclude:
 | 
			
		||||
  - "**/*.py"
 | 
			
		||||
 | 
			
		||||
scan:
 | 
			
		||||
  functions: true        # check free functions and methods
 | 
			
		||||
  classes: true          # check classes/dataclasses
 | 
			
		||||
  public_only: true      # ignore names starting with "_" at any level
 | 
			
		||||
 | 
			
		||||
annotations:
 | 
			
		||||
  include:               # decorators that force‑include a symbol
 | 
			
		||||
    - name: "bc_linter_include"  # matched by simple name or dotted suffix
 | 
			
		||||
      propagate_to_members: false # for classes, include methods/inner classes
 | 
			
		||||
  exclude:               # decorators that force‑exclude a symbol
 | 
			
		||||
    - name: "bc_linter_skip"     # matched by simple name or dotted suffix
 | 
			
		||||
      propagate_to_members: true  # for classes, exclude methods/inner classes
 | 
			
		||||
 | 
			
		||||
excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
 | 
			
		||||
							
								
								
									
.github/CODEOWNERS (131 lines changed)
							@ -2,121 +2,40 @@
 | 
			
		||||
# for more info about CODEOWNERS file
 | 
			
		||||
 | 
			
		||||
# This list covers the "core" components of vLLM that require careful review
 | 
			
		||||
/vllm/attention @LucasWilkinson
 | 
			
		||||
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 | 
			
		||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 | 
			
		||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
 | 
			
		||||
/vllm/model_executor/layers/fused_moe @mgoin
 | 
			
		||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
 | 
			
		||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 | 
			
		||||
/vllm/model_executor/layers/mamba @tdoublep
 | 
			
		||||
/vllm/model_executor/model_loader @22quinn
 | 
			
		||||
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
 | 
			
		||||
/vllm/v1/attention @LucasWilkinson
 | 
			
		||||
/vllm/v1/sample @22quinn @houseroad
 | 
			
		||||
/vllm/vllm_flash_attn @LucasWilkinson
 | 
			
		||||
/vllm/lora @jeejeelee
 | 
			
		||||
/vllm/reasoning @aarnphm @chaunceyjiang
 | 
			
		||||
/vllm/entrypoints @aarnphm @chaunceyjiang
 | 
			
		||||
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
 | 
			
		||||
/vllm/distributed/kv_transfer @NickLucche @ApostaC
 | 
			
		||||
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 | 
			
		||||
 | 
			
		||||
# Any change to the VllmConfig changes can have a large user-facing impact,
 | 
			
		||||
# so spam a lot of people
 | 
			
		||||
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
 | 
			
		||||
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 | 
			
		||||
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 | 
			
		||||
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 | 
			
		||||
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 | 
			
		||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 | 
			
		||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 | 
			
		||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
 | 
			
		||||
/vllm/model_executor/guided_decoding @mgoin @russellb
 | 
			
		||||
/vllm/multimodal @DarkLight1337 @ywang96
 | 
			
		||||
CMakeLists.txt @tlrmchlsmth
 | 
			
		||||
 | 
			
		||||
# vLLM V1
 | 
			
		||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 | 
			
		||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 | 
			
		||||
/vllm/v1/spec_decode @benchislett @luccafong
 | 
			
		||||
/vllm/v1/attention/backends/flashinfer.py @mgoin
 | 
			
		||||
/vllm/v1/attention/backends/triton_attn.py @tdoublep
 | 
			
		||||
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 | 
			
		||||
/vllm/v1/kv_cache_interface.py @heheda12345
 | 
			
		||||
/vllm/v1/offloading @ApostaC
 | 
			
		||||
/vllm/v1/structured_output @mgoin @russellb
 | 
			
		||||
 | 
			
		||||
# Test ownership
 | 
			
		||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
 | 
			
		||||
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 | 
			
		||||
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 | 
			
		||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
 | 
			
		||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
 | 
			
		||||
/tests/distributed/test_same_node.py @youkaichao
 | 
			
		||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
 | 
			
		||||
/tests/evals @mgoin
 | 
			
		||||
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 | 
			
		||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 | 
			
		||||
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
 | 
			
		||||
/tests/kernels @tlrmchlsmth @WoosukKwon
 | 
			
		||||
/tests/model_executor/test_guided_processors.py @mgoin @russellb
 | 
			
		||||
/tests/models @DarkLight1337 @ywang96
 | 
			
		||||
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 | 
			
		||||
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 | 
			
		||||
/tests/multi_step @alexm-redhat @comaniac
 | 
			
		||||
/tests/multimodal @DarkLight1337 @ywang96
 | 
			
		||||
/tests/prefix_caching @comaniac @KuntaiDu
 | 
			
		||||
/tests/quantization @mgoin @robertgshaw2-redhat
 | 
			
		||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
 | 
			
		||||
/tests/test_inputs.py @DarkLight1337 @ywang96
 | 
			
		||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 | 
			
		||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
 | 
			
		||||
/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 | 
			
		||||
/tests/weight_loading @mgoin @youkaichao @yewentao256
 | 
			
		||||
/tests/lora @jeejeelee
 | 
			
		||||
/tests/models/language/generation/test_hybrid.py @tdoublep
 | 
			
		||||
/tests/v1/kv_connector/nixl_integration @NickLucche 
 | 
			
		||||
/tests/v1/kv_connector @ApostaC
 | 
			
		||||
/tests/v1/offloading @ApostaC
 | 
			
		||||
 | 
			
		||||
# Transformers backend
 | 
			
		||||
/vllm/model_executor/models/transformers.py @hmellor
 | 
			
		||||
/tests/models/test_transformers.py @hmellor
 | 
			
		||||
 | 
			
		||||
# Docs
 | 
			
		||||
/docs/mkdocs @hmellor
 | 
			
		||||
/docs/**/*.yml @hmellor
 | 
			
		||||
/requirements/docs.txt @hmellor
 | 
			
		||||
.readthedocs.yaml @hmellor
 | 
			
		||||
mkdocs.yaml @hmellor
 | 
			
		||||
 | 
			
		||||
# Linting
 | 
			
		||||
.markdownlint.yaml @hmellor
 | 
			
		||||
.pre-commit-config.yaml @hmellor
 | 
			
		||||
/tools/pre_commit @hmellor
 | 
			
		||||
 | 
			
		||||
# CPU
 | 
			
		||||
/vllm/v1/worker/cpu* @bigPYJ1151
 | 
			
		||||
/csrc/cpu @bigPYJ1151
 | 
			
		||||
/vllm/platforms/cpu.py @bigPYJ1151
 | 
			
		||||
/cmake/cpu_extension.cmake @bigPYJ1151
 | 
			
		||||
/docker/Dockerfile.cpu @bigPYJ1151
 | 
			
		||||
 | 
			
		||||
# Intel GPU
 | 
			
		||||
/vllm/v1/worker/xpu* @jikunshang
 | 
			
		||||
/vllm/platforms/xpu.py @jikunshang
 | 
			
		||||
/docker/Dockerfile.xpu @jikunshang
 | 
			
		||||
 | 
			
		||||
# Qwen-specific files
 | 
			
		||||
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
 | 
			
		||||
/vllm/model_executor/models/qwen* @sighingnow
 | 
			
		||||
 | 
			
		||||
# MTP-specific files
 | 
			
		||||
/vllm/model_executor/models/deepseek_mtp.py @luccafong
 | 
			
		||||
 | 
			
		||||
# Mistral-specific files
 | 
			
		||||
/vllm/model_executor/models/mistral*.py @patrickvonplaten
 | 
			
		||||
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
 | 
			
		||||
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
 | 
			
		||||
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
 | 
			
		||||
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
 | 
			
		||||
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten
 | 
			
		||||
 | 
			
		||||
# Kernels
 | 
			
		||||
/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
 | 
			
		||||
/vllm/attention/ops/triton_unified_attention.py @tdoublep
 | 
			
		||||
 | 
			
		||||
# ROCm related: specify owner with write access to notify AMD folks for careful code review
 | 
			
		||||
/docker/Dockerfile.rocm* @gshtras
 | 
			
		||||
/vllm/v1/attention/backends/rocm*.py @gshtras
 | 
			
		||||
/vllm/v1/attention/backends/mla/rocm*.py @gshtras
 | 
			
		||||
/vllm/attention/ops/rocm*.py @gshtras
 | 
			
		||||
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
 | 
			
		||||
 | 
			
		||||
# TPU
 | 
			
		||||
/vllm/v1/worker/tpu* @NickLucche
 | 
			
		||||
/vllm/platforms/tpu.py @NickLucche
 | 
			
		||||
/vllm/v1/sample/tpu @NickLucche
 | 
			
		||||
/vllm/tests/v1/tpu @NickLucche
 | 
			
		||||
 | 
			
		||||
# KVConnector installation files
 | 
			
		||||
/requirements/kv_connectors.txt @NickLucche
 | 
			
		||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
 | 
			
		||||
/tests/v1/structured_output @mgoin @russellb
 | 
			
		||||
/tests/weight_loading @mgoin @youkaichao
 | 
			
		||||
 | 
			
		||||
							
								
								
									
.github/ISSUE_TEMPLATE/400-bug-report.yml (20 lines changed)
							@ -8,16 +8,6 @@ body:
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: >
 | 
			
		||||
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: |
 | 
			
		||||
      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
 | 
			
		||||
      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
 | 
			
		||||
      - Passwords or authentication credentials
 | 
			
		||||
      - Private URLs or endpoints
 | 
			
		||||
      - Personal or confidential data
 | 
			
		||||
      
 | 
			
		||||
      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: Your current environment
 | 
			
		||||
@ -31,7 +21,7 @@ body:
 | 
			
		||||
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
 | 
			
		||||
    value: |
 | 
			
		||||
      <details>
 | 
			
		||||
      <summary>The output of <code>python collect_env.py</code></summary>
 | 
			
		||||
      <summary>The output of `python collect_env.py`</summary>
 | 
			
		||||
 | 
			
		||||
      ```text
 | 
			
		||||
      Your output of `python collect_env.py` here
 | 
			
		||||
@ -85,20 +75,20 @@ body:
 | 
			
		||||
      ```
 | 
			
		||||
 | 
			
		||||
      ```
 | 
			
		||||
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
 | 
			
		||||
      The error message you got, with the full traceback.
 | 
			
		||||
      ```
 | 
			
		||||
  validations:
 | 
			
		||||
    required: true
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: |
 | 
			
		||||
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
 | 
			
		||||
    value: >
 | 
			
		||||
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
 | 
			
		||||
 | 
			
		||||
      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
 | 
			
		||||
 | 
			
		||||
      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
 | 
			
		||||
 | 
			
		||||
      Thanks for reporting 🙏!
 | 
			
		||||
      Thanks for contributing 🎉!
 | 
			
		||||
- type: checkboxes
 | 
			
		||||
  id: askllm
 | 
			
		||||
  attributes:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
.github/ISSUE_TEMPLATE/450-ci-failure.yml (69 lines changed)
							@ -1,69 +0,0 @@
 | 
			
		||||
name: 🧪 CI failure report
 | 
			
		||||
description: Report a failing test.
 | 
			
		||||
title: "[CI Failure]: "
 | 
			
		||||
labels: ["ci-failure"]
 | 
			
		||||
 | 
			
		||||
body:
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: >
 | 
			
		||||
      #### Include the name of the failing Buildkite step and test file in the title.
 | 
			
		||||
- type: input
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: Name of failing test
 | 
			
		||||
    description: |
 | 
			
		||||
      Paste in the fully-qualified name of the failing test from the logs.
 | 
			
		||||
    placeholder: |
 | 
			
		||||
      `path/to/test_file.py::test_name[params]`
 | 
			
		||||
  validations:
 | 
			
		||||
    required: true
 | 
			
		||||
- type: checkboxes
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: Basic information
 | 
			
		||||
    description: Select all items that apply to the failing test.
 | 
			
		||||
    options:
 | 
			
		||||
      - label: Flaky test
 | 
			
		||||
      - label: Can reproduce locally
 | 
			
		||||
      - label: Caused by external libraries (e.g. bug in `transformers`)
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: 🧪 Describe the failing test
 | 
			
		||||
    description: |
 | 
			
		||||
      Please provide a clear and concise description of the failing test.
 | 
			
		||||
    placeholder: |
 | 
			
		||||
      A clear and concise description of the failing test.
 | 
			
		||||
  
 | 
			
		||||
      ```
 | 
			
		||||
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
 | 
			
		||||
      ```
 | 
			
		||||
  validations:
 | 
			
		||||
    required: true
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: 📝 History of failing test
 | 
			
		||||
    description: |
 | 
			
		||||
      Since when did the test start to fail?
 | 
			
		||||
      You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
 | 
			
		||||
 | 
			
		||||
      If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
 | 
			
		||||
 | 
			
		||||
      - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
 | 
			
		||||
 | 
			
		||||
      - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
 | 
			
		||||
 | 
			
		||||
      - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
 | 
			
		||||
    placeholder: |
 | 
			
		||||
      Approximate timeline and/or problematic PRs
 | 
			
		||||
 | 
			
		||||
      A link to the Buildkite analytics of the failing test (if available)
 | 
			
		||||
  validations:
 | 
			
		||||
    required: true
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: CC List.
 | 
			
		||||
    description: >
 | 
			
		||||
      The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: >
 | 
			
		||||
      Thanks for reporting 🙏!
 | 
			
		||||
							
								
								
									
.github/ISSUE_TEMPLATE/750-RFC.yml (4 lines changed)
							@ -43,6 +43,10 @@ body:
 | 
			
		||||
      Any other things you would like to mention.
 | 
			
		||||
  validations:
 | 
			
		||||
    required: false
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: >
 | 
			
		||||
      Thanks for contributing 🎉!
 | 
			
		||||
- type: checkboxes
 | 
			
		||||
  id: askllm
 | 
			
		||||
  attributes:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
.github/PULL_REQUEST_TEMPLATE.md (23 lines changed)
							@ -1,21 +1,6 @@
 | 
			
		||||
<!-- markdownlint-disable -->
 | 
			
		||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
 | 
			
		||||
FILL IN THE PR DESCRIPTION HERE
 | 
			
		||||
 | 
			
		||||
## Purpose
 | 
			
		||||
FIX #xxxx (*link existing issues this PR will resolve*)
 | 
			
		||||
 | 
			
		||||
## Test Plan
 | 
			
		||||
 | 
			
		||||
## Test Result
 | 
			
		||||
 | 
			
		||||
---
 | 
			
		||||
<details>
 | 
			
		||||
<summary> Essential Elements of an Effective PR Description Checklist </summary>
 | 
			
		||||
 | 
			
		||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
 | 
			
		||||
- [ ] The test plan, such as providing test command.
 | 
			
		||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
 | 
			
		||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
 | 
			
		||||
- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
 | 
			
		||||
</details>
 | 
			
		||||
 | 
			
		||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
 | 
			
		||||
<!--- pyml disable-next-line no-emphasis-as-heading -->
 | 
			
		||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
 | 
			
		||||
 | 
			
		||||
							
								
								
									
.github/mergify.yml (198 lines changed)
							@ -27,22 +27,6 @@ pull_request_rules:
 | 
			
		||||
      add:
 | 
			
		||||
        - ci/build
 | 
			
		||||
 | 
			
		||||
- name: label-deepseek
 | 
			
		||||
  description: Automatically apply deepseek label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^examples/.*deepseek.*\.py
 | 
			
		||||
      - files~=^tests/.*deepseek.*\.py
 | 
			
		||||
      - files~=^vllm/entrypoints/openai/tool_parsers/.*deepseek.*\.py
 | 
			
		||||
      - files~=^vllm/model_executor/models/.*deepseek.*\.py
 | 
			
		||||
      - files~=^vllm/reasoning/.*deepseek.*\.py
 | 
			
		||||
      - files~=^vllm/transformers_utils/.*deepseek.*\.py
 | 
			
		||||
      - title~=(?i)DeepSeek
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - deepseek
 | 
			
		||||
 | 
			
		||||
- name: label-frontend
 | 
			
		||||
  description: Automatically apply frontend label
 | 
			
		||||
  conditions:
 | 
			
		||||
@ -52,21 +36,6 @@ pull_request_rules:
 | 
			
		||||
      add:
 | 
			
		||||
        - frontend
 | 
			
		||||
 | 
			
		||||
- name: label-llama
 | 
			
		||||
  description: Automatically apply llama label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^examples/.*llama.*\.py
 | 
			
		||||
      - files~=^tests/.*llama.*\.py
 | 
			
		||||
      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
 | 
			
		||||
      - files~=^vllm/model_executor/models/.*llama.*\.py
 | 
			
		||||
      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
 | 
			
		||||
      - title~=(?i)llama
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - llama
 | 
			
		||||
 | 
			
		||||
- name: label-multi-modality
 | 
			
		||||
  description: Automatically apply multi-modality label
 | 
			
		||||
  conditions:
 | 
			
		||||
@ -74,105 +43,23 @@ pull_request_rules:
 | 
			
		||||
      - files~=^vllm/multimodal/
 | 
			
		||||
      - files~=^tests/multimodal/
 | 
			
		||||
      - files~=^tests/models/multimodal/
 | 
			
		||||
      - files~=^tests/models/*/audio_language/
 | 
			
		||||
      - files~=^tests/models/*/vision_language/
 | 
			
		||||
      - files=tests/models/test_vision.py
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - multi-modality
 | 
			
		||||
 | 
			
		||||
- name: label-new-model
 | 
			
		||||
  description: Automatically apply new-model label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - and:
 | 
			
		||||
      - files~=^vllm/model_executor/models/
 | 
			
		||||
      - files=vllm/model_executor/models/registry.py
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - new-model
 | 
			
		||||
 | 
			
		||||
- name: label-performance
 | 
			
		||||
  description: Automatically apply performance label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^benchmarks/
 | 
			
		||||
      - files~=^vllm/benchmarks/
 | 
			
		||||
      - files~=^tests/benchmarks/
 | 
			
		||||
      - files~=^\.buildkite/nightly-benchmarks/
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - performance
 | 
			
		||||
 | 
			
		||||
- name: label-qwen
 | 
			
		||||
  description: Automatically apply qwen label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^examples/.*qwen.*\.py
 | 
			
		||||
      - files~=^tests/.*qwen.*\.py
 | 
			
		||||
      - files~=^vllm/model_executor/models/.*qwen.*\.py
 | 
			
		||||
      - files~=^vllm/reasoning/.*qwen.*\.py
 | 
			
		||||
      - title~=(?i)Qwen
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - qwen
 | 
			
		||||
 | 
			
		||||
- name: label-gpt-oss
 | 
			
		||||
  description: Automatically apply gpt-oss label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^examples/.*gpt[-_]?oss.*\.py
 | 
			
		||||
      - files~=^tests/.*gpt[-_]?oss.*\.py
 | 
			
		||||
      - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
 | 
			
		||||
      - files~=^tests/entrypoints/test_context.py
 | 
			
		||||
      - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
 | 
			
		||||
      - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
 | 
			
		||||
      - files~=^vllm/entrypoints/harmony_utils.py
 | 
			
		||||
      - files~=^vllm/entrypoints/tool_server.py
 | 
			
		||||
      - files~=^vllm/entrypoints/tool.py
 | 
			
		||||
      - files~=^vllm/entrypoints/context.py
 | 
			
		||||
      - title~=(?i)gpt[-_]?oss
 | 
			
		||||
      - title~=(?i)harmony
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - gpt-oss
 | 
			
		||||
 | 
			
		||||
- name: label-rocm
 | 
			
		||||
  description: Automatically apply rocm label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^csrc/rocm/
 | 
			
		||||
      - files~=^docker/Dockerfile.rocm
 | 
			
		||||
      - files~=^requirements/rocm.*\.txt
 | 
			
		||||
      - files~=^vllm/attention/backends/rocm.*\.py
 | 
			
		||||
      - files~=^vllm/attention/ops/rocm.*\.py
 | 
			
		||||
      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
 | 
			
		||||
      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
 | 
			
		||||
      - files~=^tests/kernels/.*_rocm.*\.py
 | 
			
		||||
      - files=vllm/platforms/rocm.py
 | 
			
		||||
      - title~=(?i)AMD
 | 
			
		||||
      - title~=(?i)ROCm
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - rocm
 | 
			
		||||
 | 
			
		||||
- name: label-structured-output
 | 
			
		||||
  description: Automatically apply structured-output label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^benchmarks/structured_schemas/
 | 
			
		||||
      - files=benchmarks/benchmark_serving_structured_output.py
 | 
			
		||||
      - files=benchmarks/run_structured_output_benchmark.sh
 | 
			
		||||
      - files=docs/features/structured_outputs.md
 | 
			
		||||
      - files=examples/offline_inference/structured_outputs.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
 | 
			
		||||
      - files~=^tests/v1/structured_output/
 | 
			
		||||
      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
 | 
			
		||||
      - files~=^vllm/v1/structured_output/
 | 
			
		||||
      - files~=^vllm/model_executor/guided_decoding/
 | 
			
		||||
      - files=tests/model_executor/test_guided_processors.py
 | 
			
		||||
      - files=tests/entrypoints/llm/test_guided_generate.py
 | 
			
		||||
      - files=benchmarks/benchmark_serving_guided.py
 | 
			
		||||
      - files=benchmarks/benchmark_guided.py
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
@ -182,12 +69,9 @@ pull_request_rules:
 | 
			
		||||
  description: Automatically apply speculative-decoding label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^vllm/v1/spec_decode/
 | 
			
		||||
      - files~=^tests/v1/spec_decode/
 | 
			
		||||
      - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
 | 
			
		||||
      - files~=^vllm/model_executor/models/.*eagle.*\.py
 | 
			
		||||
      - files=vllm/model_executor/models/mlp_speculator.py
 | 
			
		||||
      - files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py
 | 
			
		||||
      - files~=^vllm/spec_decode/
 | 
			
		||||
      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
 | 
			
		||||
      - files~=^tests/spec_decode/
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
@ -234,26 +118,6 @@ pull_request_rules:
 | 
			
		||||
      remove:
 | 
			
		||||
        - tpu
 | 
			
		||||
 | 
			
		||||
- name: label-tool-calling
 | 
			
		||||
  description: Automatically add tool-calling label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^tests/tool_use/
 | 
			
		||||
      - files~=^tests/mistral_tool_use/
 | 
			
		||||
      - files~=^tests/entrypoints/openai/tool_parsers/
 | 
			
		||||
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
 | 
			
		||||
      - files~=^vllm/entrypoints/openai/tool_parsers/
 | 
			
		||||
      - files=docs/features/tool_calling.md
 | 
			
		||||
      - files~=^examples/tool_chat_*
 | 
			
		||||
      - files=examples/offline_inference/chat_with_tools.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - tool-calling
 | 
			
		||||
 | 
			
		||||
- name: ping author on conflicts and add 'needs-rebase' label
 | 
			
		||||
  conditions:
 | 
			
		||||
      - conflict
 | 
			
		||||
@ -269,31 +133,6 @@ pull_request_rules:
 | 
			
		||||
 | 
			
		||||
       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
 | 
			
		||||
 | 
			
		||||
- name: assign reviewer for tensorizer changes
 | 
			
		||||
  conditions:
 | 
			
		||||
      - files~=^vllm/model_executor/model_loader/tensorizer.py
 | 
			
		||||
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
 | 
			
		||||
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
 | 
			
		||||
      - files~=^tests/tensorizer_loader/
 | 
			
		||||
  actions:
 | 
			
		||||
    assign:
 | 
			
		||||
      users:
 | 
			
		||||
        - "sangstar"
 | 
			
		||||
 | 
			
		||||
- name: assign reviewer for modelopt changes
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
 | 
			
		||||
        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
 | 
			
		||||
        - files~=^tests/models/quantization/test_modelopt\.py$
 | 
			
		||||
        - files~=^tests/quantization/test_modelopt\.py$
 | 
			
		||||
        - files~=^tests/models/quantization/test_nvfp4\.py$
 | 
			
		||||
        - files~=^docs/features/quantization/modelopt\.md$
 | 
			
		||||
  actions:
 | 
			
		||||
    assign:
 | 
			
		||||
      users:
 | 
			
		||||
        - "Edwardf0t1"
 | 
			
		||||
 | 
			
		||||
- name: remove 'needs-rebase' label when conflict is resolved
 | 
			
		||||
  conditions:
 | 
			
		||||
      - -conflict
 | 
			
		||||
@ -302,20 +141,3 @@ pull_request_rules:
 | 
			
		||||
    label:
 | 
			
		||||
      remove:
 | 
			
		||||
        - needs-rebase
 | 
			
		||||
 | 
			
		||||
- name: label-kv-connector
 | 
			
		||||
  description: Automatically apply kv-connector label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^examples/online_serving/disaggregated[^/]*/.*
 | 
			
		||||
      - files~=^examples/offline_inference/disaggregated[^/]*/.*
 | 
			
		||||
      - files~=^examples/others/lmcache/
 | 
			
		||||
      - files~=^tests/v1/kv_connector/
 | 
			
		||||
      - files~=^vllm/distributed/kv_transfer/
 | 
			
		||||
      - title~=(?i)\bP/?D\b
 | 
			
		||||
      - title~=(?i)NIXL
 | 
			
		||||
      - title~=(?i)LMCache
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - kv-connector
 | 
			
		||||
							
								
								
									
.github/scale-config.yml (21 lines changed)
							@ -1,21 +0,0 @@
 | 
			
		||||
# scale-config.yml:
 | 
			
		||||
#   Powers what instance types are available for GHA auto-scaled
 | 
			
		||||
#   runners. Runners listed here will be available as self hosted
 | 
			
		||||
#   runners, configuration is directly pulled from the main branch.
 | 
			
		||||
# runner_types:
 | 
			
		||||
#   runner_label:
 | 
			
		||||
#     instance_type: m4.large
 | 
			
		||||
#     os: linux
 | 
			
		||||
#     # min_available defaults to the global cfg in the ALI Terraform
 | 
			
		||||
#     min_available: undefined
 | 
			
		||||
#     # when max_available value is not defined, no max runners is enforced
 | 
			
		||||
#     max_available: undefined
 | 
			
		||||
#     disk_size: 50
 | 
			
		||||
#     is_ephemeral: true
 | 
			
		||||
 | 
			
		||||
runner_types:
 | 
			
		||||
  linux.2xlarge:
 | 
			
		||||
    disk_size: 150
 | 
			
		||||
    instance_type: c5.2xlarge
 | 
			
		||||
    is_ephemeral: true
 | 
			
		||||
    os: linux
 | 
			
		||||
							
								
								
									
.github/scripts/cleanup_pr_body.sh (10 lines changed)
							@ -15,18 +15,18 @@ NEW=/tmp/new_pr_body.txt
 | 
			
		||||
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
 | 
			
		||||
cp "${OLD}" "${NEW}"
 | 
			
		||||
 | 
			
		||||
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
 | 
			
		||||
sed -i '/<!--.*-->$/d' "${NEW}"
 | 
			
		||||
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
 | 
			
		||||
sed -i '/FIX #xxxx.*$/d' "${NEW}"
 | 
			
		||||
 | 
			
		||||
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
 | 
			
		||||
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
 | 
			
		||||
# Remove "FILL IN THE PR DESCRIPTION HERE"
 | 
			
		||||
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
 | 
			
		||||
 | 
			
		||||
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
 | 
			
		||||
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
 | 
			
		||||
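The address range `/PATTERN/,$d` in the sed command above deletes everything from the first matching line through the end of the file. A hypothetical illustration:

  printf 'keep this\n**BEFORE SUBMITTING, PLEASE READ the docs**\ndrop this\n' \
    | sed '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d'
  # prints only: keep this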
 | 
			
		||||
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
 | 
			
		||||
python3 - <<EOF
 | 
			
		||||
import regex as re
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
with open("${NEW}", "r") as file:
 | 
			
		||||
    content = file.read()
 | 
			
		||||
 | 
			
		||||
							
								
								
									
.github/workflows/add_label_automerge.yml (4 lines changed)
							@ -1,6 +1,4 @@
 | 
			
		||||
name: Add label on auto-merge enabled
 | 
			
		||||
permissions:
 | 
			
		||||
    pull-requests: write
 | 
			
		||||
on:
 | 
			
		||||
    pull_request_target:
 | 
			
		||||
        types:
 | 
			
		||||
@ -10,7 +8,7 @@ jobs:
 | 
			
		||||
        runs-on: ubuntu-latest
 | 
			
		||||
        steps:
 | 
			
		||||
            -   name: Add label
 | 
			
		||||
                uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
 | 
			
		||||
                uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
 | 
			
		||||
                with:
 | 
			
		||||
                    script: |
 | 
			
		||||
                        github.rest.issues.addLabels({
 | 
			
		||||
 | 
			
		||||
							
								
								
									
.github/workflows/bc-lint.yml (29 lines changed)
							@ -1,29 +0,0 @@
 | 
			
		||||
name: BC Lint
 | 
			
		||||
 | 
			
		||||
on:
 | 
			
		||||
  pull_request:
 | 
			
		||||
    types:
 | 
			
		||||
      - opened
 | 
			
		||||
      - synchronize
 | 
			
		||||
      - reopened
 | 
			
		||||
      - labeled
 | 
			
		||||
      - unlabeled
 | 
			
		||||
 | 
			
		||||
jobs:
 | 
			
		||||
  bc_lint:
 | 
			
		||||
    if: github.repository_owner == 'vllm-project'
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
      - name: Run BC Lint Action
 | 
			
		||||
        uses: pytorch/test-infra/.github/actions/bc-lint@main
 | 
			
		||||
        with:
 | 
			
		||||
          repo: ${{ github.event.pull_request.head.repo.full_name }}
 | 
			
		||||
          base_sha: ${{ github.event.pull_request.base.sha }}
 | 
			
		||||
          head_sha: ${{ github.event.pull_request.head.sha }}
 | 
			
		||||
          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
 | 
			
		||||
          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
 | 
			
		||||
          config_dir: .github
 | 
			
		||||
 | 
			
		||||
concurrency:
 | 
			
		||||
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
 | 
			
		||||
  cancel-in-progress: true
 | 
			
		||||
							
								
								
									
.github/workflows/cleanup_pr_body.yml (9 lines changed)
							@ -16,16 +16,11 @@ jobs:
 | 
			
		||||
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 | 
			
		||||
 | 
			
		||||
      - name: Set up Python
 | 
			
		||||
        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
 | 
			
		||||
        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
 | 
			
		||||
        with:
 | 
			
		||||
          python-version: '3.12'
 | 
			
		||||
 | 
			
		||||
      - name: Install Python dependencies
 | 
			
		||||
        run: |
 | 
			
		||||
          python3 -m pip install --upgrade pip
 | 
			
		||||
          python3 -m pip install regex
 | 
			
		||||
 | 
			
		||||
      - name: Update PR description
 | 
			
		||||
        env:
 | 
			
		||||
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 | 
			
		||||
        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
 | 
			
		||||
        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
 | 
			
		||||
 | 
			
		||||
							
								
								
									
.github/workflows/issue_autolabel.yml (309 lines changed)
							@ -1,309 +0,0 @@
 | 
			
		||||
name: Label issues based on keywords
 | 
			
		||||
on:
 | 
			
		||||
  issues:
 | 
			
		||||
    types: [opened, edited, reopened]
 | 
			
		||||
permissions:
 | 
			
		||||
  issues: write          # needed so the workflow can add labels
 | 
			
		||||
  contents: read
 | 
			
		||||
concurrency:
 | 
			
		||||
  group: issue-labeler-${{ github.event.issue.number }}
 | 
			
		||||
  cancel-in-progress: true
 | 
			
		||||
jobs:
 | 
			
		||||
  add-labels:
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
      - name: Label issues based on keywords
 | 
			
		||||
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
 | 
			
		||||
        with:
 | 
			
		||||
          script: |
 | 
			
		||||
            // Configuration: Add new labels and keywords here
 | 
			
		||||
            const labelConfig = {
 | 
			
		||||
              rocm: {
 | 
			
		||||
                // Keyword search - matches whole words only (with word boundaries)
 | 
			
		||||
                keywords: [
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "composable kernel",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "rccl",
 | 
			
		||||
                    searchIn: "body"  // only search in body
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "migraphx",
 | 
			
		||||
                    searchIn: "title"  // only search in title
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "hipgraph",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "ROCm System Management Interface",
 | 
			
		||||
                    searchIn: "body"
 | 
			
		||||
                  },
 | 
			
		||||
                ],
 | 
			
		||||
                
 | 
			
		||||
                // Substring search - matches anywhere in text (partial matches)
 | 
			
		||||
                substrings: [
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "VLLM_ROCM_",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "aiter",
 | 
			
		||||
                    searchIn: "title"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "rocm",
 | 
			
		||||
                    searchIn: "title"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "amd",
 | 
			
		||||
                    searchIn: "title"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "hip-",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "gfx",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "cdna",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "rdna",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "torch_hip",
 | 
			
		||||
                    searchIn: "body"  // only in body
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "_hip",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "hip_",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                  
 | 
			
		||||
                  // ROCm tools and libraries
 | 
			
		||||
                  {
 | 
			
		||||
                    term: "hipify",
 | 
			
		||||
                    searchIn: "both"
 | 
			
		||||
                  },
 | 
			
		||||
                ],
 | 
			
		||||
                
 | 
			
		||||
                // Regex patterns - for complex pattern matching
 | 
			
		||||
                regexPatterns: [
 | 
			
		||||
                  {
 | 
			
		||||
                    pattern: "\\bmi\\d{3}[a-z]*\\b",
 | 
			
		||||
                    description: "AMD GPU names (mi + 3 digits + optional letters)",
 | 
			
		||||
                    flags: "gi",
 | 
			
		||||
                    searchIn: "both"  // "title", "body", or "both"
 | 
			
		||||
                  }
 | 
			
		||||
                ],
 | 
			
		||||
              },
 | 
			
		||||
            };
 | 
			
		||||
            
 | 
			
		||||
            // Helper function to create regex based on search type
 | 
			
		||||
            function createSearchRegex(term, type) {
 | 
			
		||||
              // Escape special regex characters in the term
 | 
			
		||||
              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
 | 
			
		||||
              
 | 
			
		||||
              switch (type) {
 | 
			
		||||
                case 'keyword':
 | 
			
		||||
                  // Word boundary search - matches whole words only
 | 
			
		||||
                  return new RegExp(`\\b${escapedTerm}\\b`, "gi");
 | 
			
		||||
                case 'substring':
 | 
			
		||||
                  // Substring search - matches anywhere in the text
 | 
			
		||||
                  return new RegExp(escapedTerm, "gi");
 | 
			
		||||
                default:
 | 
			
		||||
                  throw new Error(`Unknown search type: ${type}`);
 | 
			
		||||
              }
 | 
			
		||||
            }
 | 
			
		||||
            
 | 
			
		||||
            // Helper function to find matching terms in text with line information
 | 
			
		||||
            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
 | 
			
		||||
              const matches = [];
 | 
			
		||||
              const lines = text.split('\n');
 | 
			
		||||
              
 | 
			
		||||
              for (const termConfig of searchTerms) {
 | 
			
		||||
                let regex;
 | 
			
		||||
                let term, searchIn, pattern, description, flags;
 | 
			
		||||
                
 | 
			
		||||
                // Handle different input formats (string or object)
 | 
			
		||||
                if (typeof termConfig === 'string') {
 | 
			
		||||
                  term = termConfig;
 | 
			
		||||
                  searchIn = 'both'; // default
 | 
			
		||||
                } else {
 | 
			
		||||
                  term = termConfig.term;
 | 
			
		||||
                  searchIn = termConfig.searchIn || 'both';
 | 
			
		||||
                  pattern = termConfig.pattern;
 | 
			
		||||
                  description = termConfig.description;
 | 
			
		||||
                  flags = termConfig.flags;
 | 
			
		||||
                }
 | 
			
		||||
                
 | 
			
		||||
                // Skip if this term shouldn't be searched in the current location
 | 
			
		||||
                if (searchIn !== 'both' && searchIn !== searchLocation) {
 | 
			
		||||
                  continue;
 | 
			
		||||
                }
 | 
			
		||||
                
 | 
			
		||||
                // Create appropriate regex
 | 
			
		||||
                if (searchType === 'regex') {
 | 
			
		||||
                  regex = new RegExp(pattern, flags || "gi");
 | 
			
		||||
                } else {
 | 
			
		||||
                  regex = createSearchRegex(term, searchType);
 | 
			
		||||
                }
 | 
			
		||||
                
 | 
			
		||||
                const termMatches = [];
 | 
			
		||||
                
 | 
			
		||||
                // Check each line for matches
 | 
			
		||||
                lines.forEach((line, lineIndex) => {
 | 
			
		||||
                  const lineMatches = line.match(regex);
 | 
			
		||||
                  if (lineMatches) {
 | 
			
		||||
                    lineMatches.forEach(match => {
 | 
			
		||||
                      termMatches.push({
 | 
			
		||||
                        match: match,
 | 
			
		||||
                        lineNumber: lineIndex + 1,
 | 
			
		||||
                        lineContent: line.trim(),
 | 
			
		||||
                        searchType: searchType,
 | 
			
		||||
                        searchLocation: searchLocation,
 | 
			
		||||
                        originalTerm: term || pattern,
 | 
			
		||||
                        description: description,
 | 
			
		||||
                        // Show context around the match in the line
 | 
			
		||||
                        context: line.length > 100 ? 
 | 
			
		||||
                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), 
 | 
			
		||||
                                       line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' 
 | 
			
		||||
                          : line.trim()
 | 
			
		||||
                      });
 | 
			
		||||
                    });
 | 
			
		||||
                  }
 | 
			
		||||
                });
 | 
			
		||||
                
 | 
			
		||||
                if (termMatches.length > 0) {
 | 
			
		||||
                  matches.push({
 | 
			
		||||
                    term: term || (description || pattern),
 | 
			
		||||
                    searchType: searchType,
 | 
			
		||||
                    searchLocation: searchLocation,
 | 
			
		||||
                    searchIn: searchIn,
 | 
			
		||||
                    pattern: pattern,
 | 
			
		||||
                    matches: termMatches,
 | 
			
		||||
                    count: termMatches.length
 | 
			
		||||
                  });
 | 
			
		||||
                }
 | 
			
		||||
              }
 | 
			
		||||
              
 | 
			
		||||
              return matches;
 | 
			
		||||
            }
 | 
			
		||||
            
 | 
			
		||||
            // Helper function to check if label should be added
 | 
			
		||||
            async function processLabel(labelName, config) {
 | 
			
		||||
              const body = context.payload.issue.body || "";
 | 
			
		||||
              const title = context.payload.issue.title || "";
 | 
			
		||||
              
 | 
			
		||||
              core.notice(`Processing label: ${labelName}`);
 | 
			
		||||
              core.notice(`Issue Title: "${title}"`);
 | 
			
		||||
              core.notice(`Issue Body length: ${body.length} characters`);
 | 
			
		||||
              
 | 
			
		||||
              let shouldAddLabel = false;
 | 
			
		||||
              let allMatches = [];
 | 
			
		||||
              let reason = '';
 | 
			
		||||
              
 | 
			
		||||
              const keywords = config.keywords || [];
 | 
			
		||||
              const substrings = config.substrings || [];
 | 
			
		||||
              const regexPatterns = config.regexPatterns || [];
 | 
			
		||||
              
 | 
			
		||||
              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
 | 
			
		||||
              
 | 
			
		||||
              // Search in title
 | 
			
		||||
              if (title.trim()) {
 | 
			
		||||
                core.notice(`Searching in title: "${title}"`);
 | 
			
		||||
                
 | 
			
		||||
                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
 | 
			
		||||
                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
 | 
			
		||||
                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
 | 
			
		||||
                
 | 
			
		||||
                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
 | 
			
		||||
              }
 | 
			
		||||
              
 | 
			
		||||
              // Search in body
 | 
			
		||||
              if (body.trim()) {
 | 
			
		||||
                core.notice(`Searching in body (${body.length} characters)`);
 | 
			
		||||
                
 | 
			
		||||
                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
 | 
			
		||||
                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
 | 
			
		||||
                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
 | 
			
		||||
                
 | 
			
		||||
                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
 | 
			
		||||
              }
 | 
			
		||||
              
 | 
			
		||||
              if (allMatches.length > 0) {
 | 
			
		||||
                core.notice(`Found ${allMatches.length} matching term(s):`);
 | 
			
		||||
                
 | 
			
		||||
                for (const termMatch of allMatches) {
 | 
			
		||||
                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
 | 
			
		||||
                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
 | 
			
		||||
                  
 | 
			
		||||
                  if (termMatch.searchType === 'regex') {
 | 
			
		||||
                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
 | 
			
		||||
                  } else {
 | 
			
		||||
                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
 | 
			
		||||
                  }
 | 
			
		||||
                  
 | 
			
		||||
                  // Show details for each match
 | 
			
		||||
                  termMatch.matches.forEach((match, index) => {
 | 
			
		||||
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
 | 
			
		||||
                    if (match.description) {
 | 
			
		||||
                      core.notice(`       Description: ${match.description}`);
 | 
			
		||||
                    }
 | 
			
		||||
                    core.notice(`       Context: ${match.context}`);
 | 
			
		||||
                    if (match.lineContent !== match.context) {
 | 
			
		||||
                      core.notice(`       Full line: ${match.lineContent}`);
 | 
			
		||||
                    }
 | 
			
		||||
                  });
 | 
			
		||||
                }
 | 
			
		||||
                
 | 
			
		||||
                shouldAddLabel = true;
 | 
			
		||||
                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
 | 
			
		||||
                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
 | 
			
		||||
                const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
 | 
			
		||||
                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
 | 
			
		||||
                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
 | 
			
		||||
                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
 | 
			
		||||
                
 | 
			
		||||
                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
 | 
			
		||||
              }
 | 
			
		||||
              
 | 
			
		||||
              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
 | 
			
		||||
              core.notice(`Reason: ${reason || 'No matching terms found'}`);
 | 
			
		||||
              
 | 
			
		||||
              if (shouldAddLabel) {
 | 
			
		||||
                const existingLabels = context.payload.issue.labels.map(l => l.name);
 | 
			
		||||
                if (!existingLabels.includes(labelName)) {
 | 
			
		||||
                  await github.rest.issues.addLabels({
 | 
			
		||||
                    owner: context.repo.owner,
 | 
			
		||||
                    repo: context.repo.repo,
 | 
			
		||||
                    issue_number: context.issue.number,
 | 
			
		||||
                    labels: [labelName],
 | 
			
		||||
                  });
 | 
			
		||||
                  core.notice(`Label "${labelName}" added. ${reason}`);
 | 
			
		||||
                  return true;
 | 
			
		||||
                }
 | 
			
		||||
                core.notice(`Label "${labelName}" already present.`);
 | 
			
		||||
                return false;
 | 
			
		||||
              }
 | 
			
		||||
              
 | 
			
		||||
              core.notice(`No matching terms found for label "${labelName}".`);
 | 
			
		||||
              return false;
 | 
			
		||||
            }
 | 
			
		||||
            
 | 
			
		||||
            // Process all configured labels
 | 
			
		||||
            const processLabels = Object.entries(labelConfig)
 | 
			
		||||
              .map(([labelName, config]) => processLabel(labelName, config));
 | 
			
		||||
            const labelsAdded = await Promise.all(processLabels);
 | 
			
		||||
            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
 | 
			
		||||
            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
 | 
			
		||||
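For reference, a minimal Node.js sketch (not part of the diff) of how the three search modes configured in the labeler above behave; the sample title is invented for illustration, and the escaping mirrors the workflow's createSearchRegex helper.

// Illustrative only: keyword vs. substring vs. regex matching as used by the labeler.
const title = "Installing vLLM on MI300X fails with hipify errors (rocm 6.2)";
const escape = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

const keyword   = new RegExp(`\\b${escape("rccl")}\\b`, "gi");   // whole words only
const substring = new RegExp(escape("rocm"), "gi");              // matches anywhere
const gpuRegex  = new RegExp("\\bmi\\d{3}[a-z]*\\b", "gi");      // AMD GPU name pattern from the config

console.log(title.match(keyword));   // null -> "rccl" never appears as a whole word
console.log(title.match(substring)); // [ 'rocm' ] -> substring hit inside "(rocm 6.2)"
console.log(title.match(gpuRegex));  // [ 'MI300X' ] -> case-insensitive GPU name match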
							
								
								
									
82  .github/workflows/lint-and-deploy.yaml  vendored  Normal file
@ -0,0 +1,82 @@
 | 
			
		||||
name: Lint and Deploy Charts
 | 
			
		||||
 | 
			
		||||
on: pull_request
 | 
			
		||||
 | 
			
		||||
jobs:
 | 
			
		||||
  lint-and-deploy:
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
      - name: Checkout
 | 
			
		||||
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 | 
			
		||||
        with:
 | 
			
		||||
          fetch-depth: 0
 | 
			
		||||
 | 
			
		||||
      - name: Set up Helm
 | 
			
		||||
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
 | 
			
		||||
        with:
 | 
			
		||||
          version: v3.14.4
 | 
			
		||||
 | 
			
		||||
       #Python is required because ct lint runs Yamale and yamllint which require Python.
 | 
			
		||||
      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
 | 
			
		||||
        with:
 | 
			
		||||
          python-version: '3.13'
 | 
			
		||||
 | 
			
		||||
      - name: Set up chart-testing
 | 
			
		||||
        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
 | 
			
		||||
        with:
 | 
			
		||||
          version: v3.10.1
 | 
			
		||||
 | 
			
		||||
      - name: Run chart-testing (lint)
 | 
			
		||||
        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
 | 
			
		||||
 | 
			
		||||
      - name: Setup minio
 | 
			
		||||
        run: |
 | 
			
		||||
          docker network create vllm-net
 | 
			
		||||
          docker run -d -p 9000:9000 --name minio --net vllm-net \
 | 
			
		||||
                     -e "MINIO_ACCESS_KEY=minioadmin" \
 | 
			
		||||
                     -e "MINIO_SECRET_KEY=minioadmin" \
 | 
			
		||||
                     -v /tmp/data:/data \
 | 
			
		||||
                     -v /tmp/config:/root/.minio \
 | 
			
		||||
                     minio/minio server /data
 | 
			
		||||
          export AWS_ACCESS_KEY_ID=minioadmin
 | 
			
		||||
          export AWS_SECRET_ACCESS_KEY=minioadmin
 | 
			
		||||
          export AWS_EC2_METADATA_DISABLED=true
 | 
			
		||||
          mkdir opt-125m
 | 
			
		||||
          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
 | 
			
		||||
          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
 | 
			
		||||
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
 | 
			
		||||
 | 
			
		||||
      - name: Create kind cluster
 | 
			
		||||
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
 | 
			
		||||
 | 
			
		||||
      - name: Build the Docker image vllm cpu
 | 
			
		||||
        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
 | 
			
		||||
 | 
			
		||||
      - name: Configuration of docker images, network and namespace for the kind cluster
 | 
			
		||||
        run: |
 | 
			
		||||
          docker pull amazon/aws-cli:2.6.4
 | 
			
		||||
          kind load docker-image  amazon/aws-cli:2.6.4 --name chart-testing
 | 
			
		||||
          kind load docker-image vllm-cpu-env:latest --name chart-testing
 | 
			
		||||
          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
 | 
			
		||||
          kubectl create ns ns-vllm
 | 
			
		||||
 | 
			
		||||
      - name: Run chart-testing (install)
 | 
			
		||||
        run: |
 | 
			
		||||
          export AWS_ACCESS_KEY_ID=minioadmin
 | 
			
		||||
          export AWS_SECRET_ACCESS_KEY=minioadmin
 | 
			
		||||
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
 | 
			
		||||
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
 | 
			
		||||
    
 | 
			
		||||
      - name: curl test
 | 
			
		||||
        run: |
 | 
			
		||||
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
 | 
			
		||||
          sleep 10
 | 
			
		||||
          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
 | 
			
		||||
                  --header "Content-Type: application/json" \
 | 
			
		||||
                  --data '{
 | 
			
		||||
                          "model": "opt-125m",
 | 
			
		||||
                          "prompt": "San Francisco is a",
 | 
			
		||||
                          "max_tokens": 7,
 | 
			
		||||
                          "temperature": 0
 | 
			
		||||
                  }'):$CODE"
 | 
			
		||||
          echo "$CODE"
 | 
			
		||||
							
								
								
									
17  .github/workflows/matchers/markdownlint.json  vendored
@ -1,17 +0,0 @@
 | 
			
		||||
{
 | 
			
		||||
  "problemMatcher": [
 | 
			
		||||
    {
 | 
			
		||||
      "owner": "markdownlint",
 | 
			
		||||
      "pattern": [
 | 
			
		||||
        {
 | 
			
		||||
          "regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
 | 
			
		||||
          "file": 1,
 | 
			
		||||
          "line": 2,
 | 
			
		||||
          "column": 3,
 | 
			
		||||
          "code": 4,
 | 
			
		||||
          "message": 5
 | 
			
		||||
        }
 | 
			
		||||
      ]
 | 
			
		||||
    }
 | 
			
		||||
  ]
 | 
			
		||||
}
 | 
			
		||||
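As a sanity check, a small sketch (not part of the diff) of what the problem matcher deleted above used to do: its regexp pulls file, line, column, rule code, and message out of a markdownlint-style output line. The sample line below is invented.

// Illustrative only: applying the removed matcher's regexp to a sample lint line.
const pattern = /^([^:]*):(\d+):?(\d+)?\s([\w-\/]*)\s(.*)$/;
const sample = "README.md:12:1 MD013/line-length Line length [Expected: 80]";
const [, file, line, column, code, message] = sample.match(pattern);
console.log({ file, line, column, code, message });
// -> { file: 'README.md', line: '12', column: '1',
//      code: 'MD013/line-length', message: 'Line length [Expected: 80]' }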
							
								
								
									
10  .github/workflows/pre-commit.yml  vendored
@ -5,23 +5,15 @@ on:
 | 
			
		||||
  push:
 | 
			
		||||
    branches: [main]
 | 
			
		||||
 | 
			
		||||
concurrency:
 | 
			
		||||
  group: ${{ github.workflow }}-${{ github.ref }}
 | 
			
		||||
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 | 
			
		||||
 | 
			
		||||
permissions:
 | 
			
		||||
  contents: read
 | 
			
		||||
 | 
			
		||||
jobs:
 | 
			
		||||
  pre-commit:
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 | 
			
		||||
    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
 | 
			
		||||
    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
 | 
			
		||||
      with:
 | 
			
		||||
        python-version: "3.12"
 | 
			
		||||
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
 | 
			
		||||
    - run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json"
 | 
			
		||||
    - run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
 | 
			
		||||
    - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
 | 
			
		||||
      with:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
111  .github/workflows/publish.yml  vendored  Normal file
@ -0,0 +1,111 @@
 | 
			
		||||
# This workflow will upload a Python Package to Release asset
 | 
			
		||||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
 | 
			
		||||
 | 
			
		||||
name: Create Release
 | 
			
		||||
 | 
			
		||||
on:
 | 
			
		||||
  push:
 | 
			
		||||
    tags:
 | 
			
		||||
      - v*
 | 
			
		||||
 | 
			
		||||
# Needed to create release and upload assets
 | 
			
		||||
permissions:
 | 
			
		||||
  contents: write
 | 
			
		||||
 | 
			
		||||
jobs:
 | 
			
		||||
  release:
 | 
			
		||||
    # Retrieve tag and create release
 | 
			
		||||
    name: Create Release
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    outputs:
 | 
			
		||||
      upload_url: ${{ steps.create_release.outputs.upload_url }}
 | 
			
		||||
    steps:
 | 
			
		||||
      - name: Checkout
 | 
			
		||||
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 | 
			
		||||
 | 
			
		||||
      - name: Extract branch info
 | 
			
		||||
        shell: bash
 | 
			
		||||
        run: |
 | 
			
		||||
          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
 | 
			
		||||
 | 
			
		||||
      - name: Create Release
 | 
			
		||||
        id: create_release
 | 
			
		||||
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
 | 
			
		||||
        env:
 | 
			
		||||
          RELEASE_TAG: ${{ env.release_tag }}
 | 
			
		||||
        with:
 | 
			
		||||
          github-token: "${{ secrets.GITHUB_TOKEN }}"
 | 
			
		||||
          script: |
 | 
			
		||||
            const script = require('.github/workflows/scripts/create_release.js')
 | 
			
		||||
            await script(github, context, core)
 | 
			
		||||
 | 
			
		||||
  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. 
 | 
			
		||||
  # wheel:
 | 
			
		||||
  #   name: Build Wheel
 | 
			
		||||
  #   runs-on: ${{ matrix.os }}
 | 
			
		||||
  #   needs: release
 | 
			
		||||
 | 
			
		||||
  #   strategy:
 | 
			
		||||
  #     fail-fast: false
 | 
			
		||||
  #     matrix:
 | 
			
		||||
  #         os: ['ubuntu-20.04']
 | 
			
		||||
  #         python-version: ['3.9', '3.10', '3.11', '3.12']
 | 
			
		||||
  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
 | 
			
		||||
  #         cuda-version: ['11.8', '12.1']
 | 
			
		||||
 | 
			
		||||
  #   steps:
 | 
			
		||||
  #     - name: Checkout
 | 
			
		||||
  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
 | 
			
		||||
 | 
			
		||||
  #     - name: Setup ccache
 | 
			
		||||
  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
 | 
			
		||||
  #       with:
 | 
			
		||||
  #         create-symlink: true
 | 
			
		||||
  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
 | 
			
		||||
 | 
			
		||||
  #     - name: Set up Linux Env
 | 
			
		||||
  #       if: ${{ runner.os == 'Linux' }}
 | 
			
		||||
  #       run: |
 | 
			
		||||
  #         bash -x .github/workflows/scripts/env.sh
 | 
			
		||||
 | 
			
		||||
  #     - name: Set up Python
 | 
			
		||||
  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
 | 
			
		||||
  #       with:
 | 
			
		||||
  #           python-version: ${{ matrix.python-version }}
 | 
			
		||||
 | 
			
		||||
  #     - name: Install CUDA ${{ matrix.cuda-version }}
 | 
			
		||||
  #       run: |
 | 
			
		||||
  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
 | 
			
		||||
 | 
			
		||||
  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
 | 
			
		||||
  #       run: |
 | 
			
		||||
  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
 | 
			
		||||
 | 
			
		||||
  #     - name: Build wheel
 | 
			
		||||
  #       shell: bash
 | 
			
		||||
  #       env:
 | 
			
		||||
  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
 | 
			
		||||
  #       run: |
 | 
			
		||||
  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
 | 
			
		||||
  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
 | 
			
		||||
  #         asset_name=${wheel_name//"linux"/"manylinux1"}
 | 
			
		||||
  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
 | 
			
		||||
  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
 | 
			
		||||
 | 
			
		||||
  #     - name: Upload Release Asset
 | 
			
		||||
  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
 | 
			
		||||
  #       env:
 | 
			
		||||
  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 | 
			
		||||
  #       with:
 | 
			
		||||
  #         upload_url: ${{ needs.release.outputs.upload_url }}
 | 
			
		||||
  #         asset_path: ./dist/${{ env.wheel_name }}
 | 
			
		||||
  #         asset_name: ${{ env.asset_name }}
 | 
			
		||||
  #         asset_content_type: application/*
 | 
			
		||||
 | 
			
		||||
      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
 | 
			
		||||
      # - name: Publish package
 | 
			
		||||
      #   uses: pypa/gh-action-pypi-publish@release/v1.8
 | 
			
		||||
      #   with:
 | 
			
		||||
      #     repository-url: https://test.pypi.org/legacy/
 | 
			
		||||
      #     password: ${{ secrets.PYPI_API_TOKEN }}
 | 
			
		||||
      #     skip-existing: true
 | 
			
		||||
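The Create Release step in publish.yml above pulls its logic from .github/workflows/scripts/create_release.js, which is not shown in this diff. Below is a hypothetical sketch of the shape that call implies (an async function taking github, context, and core); the body is an assumption for illustration, not the repository's actual script.

// Hypothetical sketch of .github/workflows/scripts/create_release.js (not in this diff).
// Only the (github, context, core) signature is implied by the workflow; the body is invented.
module.exports = async (github, context, core) => {
  const tag = process.env.RELEASE_TAG; // exported by the "Extract branch info" step
  const release = await github.rest.repos.createRelease({
    owner: context.repo.owner,
    repo: context.repo.repo,
    tag_name: tag,
    name: tag,
    draft: false,
    prerelease: tag.includes("rc"),
  });
  // Expose upload_url so a downstream job (e.g. the commented-out wheel job) could consume it.
  core.setOutput("upload_url", release.data.upload_url);
};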
							
								
								
									
53  .github/workflows/reminder_comment.yml  vendored
@ -1,6 +1,4 @@
 | 
			
		||||
name: PR Reminder Comment Bot
 | 
			
		||||
permissions:
 | 
			
		||||
  pull-requests: write
 | 
			
		||||
on:
 | 
			
		||||
  pull_request_target:
 | 
			
		||||
    types: [opened]
 | 
			
		||||
@ -9,46 +7,19 @@ jobs:
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
      - name: Remind to run full CI on PR
 | 
			
		||||
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
 | 
			
		||||
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
 | 
			
		||||
        with:
 | 
			
		||||
          script: |
 | 
			
		||||
            try {
 | 
			
		||||
              // Get the PR author
 | 
			
		||||
              const prAuthor = context.payload.pull_request.user.login;
 | 
			
		||||
              
 | 
			
		||||
              // Check if this is the author's first PR in this repository
 | 
			
		||||
              // Use GitHub's search API to find all PRs by this author
 | 
			
		||||
              const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
 | 
			
		||||
                q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
 | 
			
		||||
                per_page: 100  
 | 
			
		||||
              });
 | 
			
		||||
              
 | 
			
		||||
              const authorPRCount = searchResults.total_count;
 | 
			
		||||
              
 | 
			
		||||
              console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
 | 
			
		||||
              
 | 
			
		||||
              // Only post comment if this is the first PR (only one PR by this author)
 | 
			
		||||
              if (authorPRCount === 1) {
 | 
			
		||||
                console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
 | 
			
		||||
                await github.rest.issues.createComment({
 | 
			
		||||
                owner: context.repo.owner,
 | 
			
		||||
                repo: context.repo.repo,
 | 
			
		||||
                issue_number: context.issue.number,
 | 
			
		||||
                body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
 | 
			
		||||
                  '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
 | 
			
		||||
                  'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
 | 
			
		||||
                  'You can ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
 | 
			
		||||
                  'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
 | 
			
		||||
                  'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
 | 
			
		||||
                  'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
 | 
			
		||||
                  '🚀'
 | 
			
		||||
                });
 | 
			
		||||
              } else {
 | 
			
		||||
                console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
 | 
			
		||||
              }
 | 
			
		||||
            } catch (error) {
 | 
			
		||||
              console.error('Error checking PR history or posting comment:', error);
 | 
			
		||||
              // Don't fail the workflow, just log the error
 | 
			
		||||
            }
 | 
			
		||||
            github.rest.issues.createComment({
 | 
			
		||||
              owner: context.repo.owner,
 | 
			
		||||
              repo: context.repo.repo,
 | 
			
		||||
              issue_number: context.issue.number,
 | 
			
		||||
              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
 | 
			
		||||
                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
 | 
			
		||||
                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
 | 
			
		||||
                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
 | 
			
		||||
                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
 | 
			
		||||
                '🚀'
 | 
			
		||||
            })
 | 
			
		||||
        env:
 | 
			
		||||
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 | 
			
		||||
 | 
			
		||||
							
								
								
									
1  .github/workflows/scripts/build.sh  vendored
@ -15,6 +15,7 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda
 | 
			
		||||
export MAX_JOBS=1
 | 
			
		||||
# Make sure release wheels are built for the following architectures
 | 
			
		||||
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 | 
			
		||||
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
 | 
			
		||||
 | 
			
		||||
bash tools/check_repo.sh
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
							
								
								
									
2  .github/workflows/stale.yml  vendored
@ -13,7 +13,7 @@ jobs:
 | 
			
		||||
      actions: write
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    steps:
 | 
			
		||||
      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
 | 
			
		||||
      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
 | 
			
		||||
        with:
 | 
			
		||||
          # Increasing this value ensures that changes to this workflow
 | 
			
		||||
          # propagate to all issues and PRs in days rather than months
 | 
			
		||||
 | 
			
		||||
							
								
								
									
24  .gitignore  vendored
@ -3,9 +3,7 @@
 | 
			
		||||
 | 
			
		||||
# vllm-flash-attn built from source
 | 
			
		||||
vllm/vllm_flash_attn/*
 | 
			
		||||
 | 
			
		||||
# triton jit
 | 
			
		||||
.triton
 | 
			
		||||
!vllm/vllm_flash_attn/fa_utils.py
 | 
			
		||||
 | 
			
		||||
# Byte-compiled / optimized / DLL files
 | 
			
		||||
__pycache__/
 | 
			
		||||
@ -80,6 +78,10 @@ instance/
 | 
			
		||||
# Scrapy stuff:
 | 
			
		||||
.scrapy
 | 
			
		||||
 | 
			
		||||
# Sphinx documentation
 | 
			
		||||
docs/_build/
 | 
			
		||||
docs/source/getting_started/examples/
 | 
			
		||||
 | 
			
		||||
# PyBuilder
 | 
			
		||||
.pybuilder/
 | 
			
		||||
target/
 | 
			
		||||
@ -149,9 +151,6 @@ venv.bak/
 | 
			
		||||
 | 
			
		||||
# mkdocs documentation
 | 
			
		||||
/site
 | 
			
		||||
docs/argparse
 | 
			
		||||
docs/examples/*
 | 
			
		||||
!docs/examples/README.md
 | 
			
		||||
 | 
			
		||||
# mypy
 | 
			
		||||
.mypy_cache/
 | 
			
		||||
@ -177,14 +176,6 @@ cython_debug/
 | 
			
		||||
# VSCode
 | 
			
		||||
.vscode/
 | 
			
		||||
 | 
			
		||||
# Claude
 | 
			
		||||
CLAUDE.md
 | 
			
		||||
.claude/
 | 
			
		||||
 | 
			
		||||
# Codex
 | 
			
		||||
AGENTS.md
 | 
			
		||||
.codex/
 | 
			
		||||
 | 
			
		||||
# DS Store
 | 
			
		||||
.DS_Store
 | 
			
		||||
 | 
			
		||||
@ -213,8 +204,5 @@ benchmarks/**/*.json
 | 
			
		||||
actionlint
 | 
			
		||||
shellcheck*/
 | 
			
		||||
 | 
			
		||||
# Ignore moe/marlin_moe gen code
 | 
			
		||||
# Ingore moe/marlin_moe gen code
 | 
			
		||||
csrc/moe/marlin_moe_wna16/kernel_*
 | 
			
		||||
 | 
			
		||||
# Ignore ep_kernels_workspace folder
 | 
			
		||||
ep_kernels_workspace/
 | 
			
		||||
 | 
			
		||||
Some files were not shown because too many files have changed in this diff.