Mirror of https://github.com/vllm-project/vllm.git (synced 2025-11-04 09:24:33 +08:00)

Compare commits: benchmark...v1-block-t (4 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | 44d638a896 |  |
|  | caacd1ddfb |  |
|  | e68f63ef83 |  |
|  | 223e17424c |  |
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import sys
@@ -9,12 +8,12 @@ import zipfile
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


def print_top_10_largest_files(zip_file):
    """Print the top 10 largest files in the given zip file."""
    with zipfile.ZipFile(zip_file, "r") as z:
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
@@ -29,18 +28,14 @@ def check_wheel_size(directory):
                wheel_path = os.path.join(root, file_name)
                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                if wheel_size_mb > VLLM_MAX_SIZE_MB:
                    print(
                        f"Not allowed: Wheel {wheel_path} is larger "
                        f"({wheel_size_mb:.2f} MB) than the limit "
                        f"({VLLM_MAX_SIZE_MB} MB)."
                    )
                    print(f"Not allowed: Wheel {wheel_path} is larger "
                          f"({wheel_size_mb:.2f} MB) than the limit "
                          f"({VLLM_MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(
                        f"Wheel {wheel_path} is within the allowed size "
                        f"({wheel_size_mb:.2f} MB)."
                    )
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb:.2f} MB).")
    return 0


@@ -50,4 +45,4 @@ if __name__ == "__main__":
        sys.exit(1)

    directory = sys.argv[1]
    sys.exit(check_wheel_size(directory))
    sys.exit(check_wheel_size(directory))
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import os
@@ -23,5 +22,5 @@ with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
    )
        template.format(wheel=filename,
                        wheel_html_escaped=filename.replace("+", "%2B")))
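For context on the escaping done in the hunk above, here is a minimal sketch; only the `replace("+", "%2B")` call comes from the diff, and the wheel filename is hypothetical:

```python
# CloudFront links need the '+' in a wheel filename percent-encoded;
# this is the value passed as wheel_html_escaped above.
filename = "vllm-0.9.2+cu128-cp38-abi3-manylinux1_x86_64.whl"  # hypothetical name
wheel_html_escaped = filename.replace("+", "%2B")
print(wheel_html_escaped)  # vllm-0.9.2%2Bcu128-cp38-abi3-manylinux1_x86_64.whl
```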
@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:

@@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:

@@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.335
  - name: "exact_match,flexible-extract"
    value: 0.323
limit: 1319
num_fewshot: 5
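A sketch of how a config like the one above is consumed; the field names (`model_name`, `tasks`, `metrics`, `limit`, `num_fewshot`) match the test file shown later in this diff, while the file path here is assumed:

```python
import yaml

# Assumed path; any of the configs above has the same shape.
config_path = ".buildkite/lm-eval-harness/configs/Llama-3.2-1B-Instruct-FP8.yaml"
eval_config = yaml.safe_load(open(config_path, encoding="utf-8"))

print(eval_config["model_name"], eval_config["limit"], eval_config["num_fewshot"])
for task in eval_config["tasks"]:       # e.g. "gsm8k"
    for metric in task["metrics"]:      # e.g. "exact_match,strict-match"
        print(task["name"], metric["name"], "expected:", metric["value"])
```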
@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:

@@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.30
  - name: "exact_match,flexible-extract"
    value: 0.465
limit: 1319
num_fewshot: 5

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.54
  - name: "exact_match,flexible-extract"
    value: 0.59
limit: 1319
num_fewshot: 5

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.47
  - name: "exact_match,flexible-extract"
    value: 0.64
limit: 1319
num_fewshot: 5

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:

@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
Meta-Llama-3-8B-QQQ.yaml

@@ -1,6 +1,10 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
@@ -1,44 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )


@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    rel_path = pytestconfig.getoption("--config-list-file")
    return config_dir / rel_path


@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")


def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        rel_path = metafunc.config.getoption("--config-list-file")
        config_list_file = Path(rel_path).resolve()
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip()
                for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done

lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"

.buildkite/lm-eval-harness/run-tests.sh (new file, 59 lines)
@@ -0,0 +1,59 @@
#!/bin/bash

usage() {
    echo``
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
    echo "  -t    - tensor parallel size"
    echo
}

SUCCESS=0

while getopts "c:t:" OPT; do
  case ${OPT} in
    c )
        CONFIG="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
    LOCAL_SUCCESS=0

    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="

    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
    export LM_EVAL_TP_SIZE=$TP_SIZE
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi

    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
else
    exit 1
fi
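A rough Python equivalent of one iteration of the loop above, handy for running a single config by hand; the config name is only an example, while the environment variable names and pytest invocation are taken from the script:

```python
import os
import subprocess

model_config = "Meta-Llama-3-8B-Instruct.yaml"  # example entry from a config list
os.environ["LM_EVAL_TEST_DATA_FILE"] = os.path.join(os.getcwd(), "configs", model_config)
os.environ["LM_EVAL_TP_SIZE"] = "1"

# Same command the bash loop runs; a non-zero exit code marks the model as failed.
rc = subprocess.run(["pytest", "-s", "test_lm_eval_correctness.py"]).returncode
print("PASSED" if rc == 0 else "FAILED", model_config)
```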
@@ -1,57 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""

import os
from pathlib import Path

import lm_eval
import numpy as np
import numpy
import pytest
import yaml

RTOL = 0.08
RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len}"
    )
def launch_lm_eval(eval_config):
    trust_remote_code = eval_config.get('trust_remote_code', False)

    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}," \
                 f"add_bos_token=true," \
                 f"trust_remote_code={trust_remote_code}"

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
        batch_size="auto")

    return results


def test_lm_eval_correctness_param(config_filename, tp_size):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

    results = launch_lm_eval(eval_config, tp_size)
    if eval_config[
            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
        pytest.skip("FBGEMM is currently failing on main.")

    # Launch eval requests.
    results = launch_lm_eval(eval_config)

    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
                f"ground_truth={ground_truth} | measured={measured_value}"
            )
            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
            success = success and numpy.isclose(
                ground_truth, measured_value, rtol=RTOL)

    # Assert at the end, print all scores even on failure for debugging.
    assert success
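The pass/fail criterion above comes down to a relative-tolerance comparison. A self-contained sketch with made-up numbers (the real ground-truth values come from the YAML configs earlier in this diff):

```python
import numpy as np

RTOL = 0.05                            # tolerance used on one side of the hunk above
ground_truth, measured = 0.54, 0.52    # made-up scores for illustration

# np.isclose(a, b, rtol=r) checks |a - b| <= atol + r * |b| (atol defaults to 1e-8),
# so here: |0.54 - 0.52| = 0.02 <= 0.05 * 0.52 = 0.026  ->  True
print(np.isclose(ground_truth, measured, rtol=RTOL))
```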
@@ -11,7 +11,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models.

**Benchmarking Duration**: about 1hr.
@@ -31,27 +31,13 @@ Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.

Manually trigger the benchmark:

```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```

Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
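A hedged sketch of a manual CPU run using the variables listed above; whether these exact values make sense depends on the host, and empty strings simply fall back to the default test files:

```python
import os
import subprocess

env = dict(os.environ)
env["ON_CPU"] = "1"        # run on Intel Xeon per the list above
env["SERVING_JSON"] = ""   # empty string -> use the default serving test file
subprocess.run(
    ["bash", ".buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh"],
    env=env,
)
```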
Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.

### Latency test

Here is an example of one test inside `latency-tests.json`:
@@ -127,36 +113,12 @@ WARNING: The benchmarking script will save json results by itself, so please do

### Visualizing the results

The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.

The `compare-json-results.py` script helps compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, the benchmark script generates results under the `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and provides a performance ratio, e.g. for Output Tput, Median TTFT and Median TPOT.

Here is an example using the script to compare result_a and result_b without the detailed test name:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`

|    | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|----------------------------------|----------------------------------|------------|
| 0  | 142.633982                       | 156.526018                       | 1.097396   |
| 1  | 241.620334                       | 294.018783                       | 1.216863   |
| 2  | 218.298905                       | 262.664916                       | 1.203235   |
| 3  | 242.743860                       | 299.816190                       | 1.235113   |

Here is an example using the script to compare result_a and result_b with the detailed test name:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

|   | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|---|---------------------------------------|----------------------------------|---------------------------------------|----------------------------------|------------|
| 0 | serving_llama8B_tp1_sharegpt_qps_1    | 142.633982                       | serving_llama8B_tp1_sharegpt_qps_1    | 156.526018                       | 1.097396   |
| 1 | serving_llama8B_tp1_sharegpt_qps_16   | 241.620334                       | serving_llama8B_tp1_sharegpt_qps_16   | 294.018783                       | 1.216863   |
| 2 | serving_llama8B_tp1_sharegpt_qps_4    | 218.298905                       | serving_llama8B_tp1_sharegpt_qps_4    | 262.664916                       | 1.203235   |
| 3 | serving_llama8B_tp1_sharegpt_qps_inf  | 242.743860                       | serving_llama8B_tp1_sharegpt_qps_inf  | 299.816190                       | 1.235113   |
| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390                    | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853                   | 1.122048   |

## Nightly test details

See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
@@ -16,7 +16,7 @@ Please download the visualization scripts in the post
  - Download `nightly-benchmarks.zip`.
  - In the same folder, run the following code:

  ```bash
  ```console
  export HF_TOKEN=<your HF token>
  apt update
  apt install -y git
@@ -4,8 +4,7 @@
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).

{latency_tests_markdown_table}
@@ -15,8 +14,7 @@
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.

{throughput_tests_markdown_table}
@@ -27,18 +25,12 @@
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B, under QPS 2
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.

{serving_tests_markdown_table}

## Platform Information

{platform_markdown_table}

## json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
@@ -1,66 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse

import pandas as pd


def compare_data_columns(
    files, name_column, data_column, drop_column, ignore_test_name=False
):
    print("\ncompare_data_column: " + data_column)
    frames = []
    compare_frames = []
    for file in files:
        data_df = pd.read_json(file)
        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
        if ignore_test_name is False:
            serving_df = serving_df.rename(columns={name_column: file + "_name"})
            frames.append(serving_df[file + "_name"])
        serving_df = serving_df.rename(columns={data_column: file})
        frames.append(serving_df[file])
        compare_frames.append(serving_df[file])
        if len(compare_frames) >= 2:
            # Compare numbers among two files
            ratio_df = compare_frames[1] / compare_frames[0]
            frames.append(ratio_df)
            compare_frames.pop(1)

    concat_df = pd.concat(frames, axis=1)
    return concat_df


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", action="append", type=str, help="input file name"
    )
    parser.add_argument(
        "--ignore_test_name", action="store_true", help="ignore_test_name or not"
    )
    args = parser.parse_args()
    files = args.file
    print("comparing : " + ", ".join(files))

    drop_column = "P99"
    name_column = "Test name"
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]
    ignore_test_name = args.ignore_test_name
    with open("perf_comparison.html", "w") as text_file:
        for i in range(len(data_cols_to_compare)):
            output_df = compare_data_columns(
                files,
                name_column,
                data_cols_to_compare[i],
                drop_column,
                ignore_test_name=ignore_test_name,
            )
            print(output_df)
            html = output_df.to_html()
            text_file.write(html_msgs_for_data_cols[i])
            text_file.write(html)
@@ -1,13 +1,10 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json
import os
from importlib import util
from pathlib import Path

import pandas as pd
import psutil
from tabulate import tabulate

results_folder = Path("results/")
@@ -31,11 +28,11 @@ throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "num_requests": "# of req.",
    "total_num_tokens": "Total # of tokens",
    "elapsed_time": "Elapsed time (s)",
    # "num_requests": "# of req.",
    # "total_num_tokens": "Total # of tokens",
    # "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    "tokens_per_second": "Tput (tok/s)",
    # "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
@@ -43,18 +40,16 @@ serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "# of req.",
    # "completed": "# of req.",
    "request_throughput": "Tput (req/s)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    # "input_throughput": "Input Tput (tok/s)",
    # "output_throughput": "Output Tput (tok/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    # "mean_tpot_ms": "Mean TPOT (ms)",
    # "median_tpot_ms": "Median",
    # "p99_tpot_ms": "P99",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
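These mapping dicts are used later in the same file to select the raw benchmark JSON keys and rename them to table headers. A self-contained sketch with made-up numbers and a trimmed-down mapping:

```python
import pandas as pd
from tabulate import tabulate

serving_column_mapping = {
    "test_name": "Test name",
    "request_throughput": "Tput (req/s)",
    "p99_itl_ms": "P99 ITL (ms)",
}
raw = [{"test_name": "serving_llama8B_tp1_sharegpt_qps_1",
        "request_throughput": 3.1, "p99_itl_ms": 42.0, "ignored_key": 7}]
df = pd.DataFrame(raw)

# Keep only the mapped keys, rename them, and render a markdown (pipe) table,
# mirroring the rename + tabulate steps in this script.
df = df[list(serving_column_mapping.keys())].rename(columns=serving_column_mapping)
print(tabulate(df, headers="keys", tablefmt="pipe", showindex=False))
```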
@@ -70,32 +65,18 @@ def read_markdown(file):


def results_to_json(latency, throughput, serving):
    return json.dumps(
        {
            "latency": latency.to_dict(),
            "throughput": throughput.to_dict(),
            "serving": serving.to_dict(),
        }
    )


def get_size_with_unit(bytes, suffix="B"):
    """
    Scale bytes to its proper format
    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
    return json.dumps({
        'latency': latency.to_dict(),
        'throughput': throughput.to_dict(),
        'serving': serving.to_dict()
    })


if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file) as f:
            raw_result = json.loads(f.read())
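A usage sketch for the `get_size_with_unit` helper added in the hunk above, reproduced here so the snippet runs standalone; the expected outputs are the ones from its docstring:

```python
def get_size_with_unit(num_bytes, suffix="B"):
    # Same logic as the helper above (parameter renamed to avoid shadowing bytes()).
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if num_bytes < factor:
            return f"{num_bytes:.2f}{unit}{suffix}"
        num_bytes /= factor

print(get_size_with_unit(1253656))      # 1.20MB
print(get_size_with_unit(1253656678))   # 1.17GB
```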
@@ -139,8 +120,7 @@ if __name__ == "__main__":
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
                )
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to raw_result
@@ -173,48 +153,26 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    svmem = psutil.virtual_memory()
    platform_data = {
        "Physical cores": [psutil.cpu_count(logical=False)],
        "Total cores": [psutil.cpu_count(logical=True)],
        "Total Memory": [get_size_with_unit(svmem.total)],
    }

    if util.find_spec("numa") is not None:
        from numa import info

        platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]

    if util.find_spec("cpuinfo") is not None:
        from cpuinfo import get_cpu_info

        platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]

    platform_results = pd.DataFrame.from_dict(
        platform_data, orient="index", columns=["Platform Info"]
    )

    raw_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )
    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # remapping the key, for visualization purpose
    if not latency_results.empty:
        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
            columns=latency_column_mapping
        )
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[
            list(throughput_results_column_mapping.keys())
        ].rename(columns=throughput_results_column_mapping)
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )
    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
@@ -226,43 +184,38 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
        )
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")

    # get markdown tables
    latency_md_table = tabulate(
        latency_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    throughput_md_table = tabulate(
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
    )
    platform_md_table = tabulate(
        platform_results, headers="keys", tablefmt="pipe", showindex=True
    )
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:
        results = read_markdown(
            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )

        results = read_markdown("../.buildkite/nightly-benchmarks/" +
                                "performance-benchmarks-descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            platform_markdown_table=platform_md_table,
            benchmarking_results_in_json_string=processed_results_json,
        )
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / "benchmark_results.json", "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
            + serving_results.to_dict(orient="records")
        )

        results = latency_results.to_dict(
            orient='records') + throughput_results.to_dict(
                orient='records') + serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
@@ -15,12 +14,15 @@ def main(model, cachedir):

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )
        description="Download and save Hugging Face tokenizer")
    parser.add_argument("--model",
                        type=str,
                        required=True,
                        help="Name of the model")
    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")

    args = parser.parse_args()
    main(args.model, args.cachedir)
@ -1,5 +1,4 @@
 | 
			
		||||
# SPDX-License-Identifier: Apache-2.0
 | 
			
		||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 | 
			
		||||
 | 
			
		||||
import argparse
 | 
			
		||||
import json
 | 
			
		||||
@ -12,33 +11,33 @@ from tabulate import tabulate
 | 
			
		||||
 | 
			
		||||
def parse_arguments():
 | 
			
		||||
    parser = argparse.ArgumentParser(
 | 
			
		||||
        description="Parse command line arguments for summary-nightly-results script."
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--results-folder",
 | 
			
		||||
        type=str,
 | 
			
		||||
        required=True,
 | 
			
		||||
        help="The folder where the results are stored.",
 | 
			
		||||
    )
 | 
			
		||||
    parser.add_argument(
 | 
			
		||||
        "--description", type=str, required=True, help="Description of the results."
 | 
			
		||||
    )
 | 
			
		||||
        description=
 | 
			
		||||
        'Parse command line arguments for summary-nightly-results script.')
 | 
			
		||||
    parser.add_argument('--results-folder',
 | 
			
		||||
                        type=str,
 | 
			
		||||
                        required=True,
 | 
			
		||||
                        help='The folder where the results are stored.')
 | 
			
		||||
    parser.add_argument('--description',
 | 
			
		||||
                        type=str,
 | 
			
		||||
                        required=True,
 | 
			
		||||
                        help='Description of the results.')
 | 
			
		||||
 | 
			
		||||
    args = parser.parse_args()
 | 
			
		||||
    return args
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_perf(df, method, model, metric):
 | 
			
		||||
 | 
			
		||||
    means = []
 | 
			
		||||
 | 
			
		||||
    for qps in [2, 4, 8, 16, "inf"]:
 | 
			
		||||
        target = df["Test name"].str.contains(model)
 | 
			
		||||
        target = target & df["Engine"].str.contains(method)
 | 
			
		||||
        target = target & df["Test name"].str.contains("qps_" + str(qps))
 | 
			
		||||
        target = df['Test name'].str.contains(model)
 | 
			
		||||
        target = target & df['Engine'].str.contains(method)
 | 
			
		||||
        target = target & df['Test name'].str.contains("qps_" + str(qps))
 | 
			
		||||
        filtered_df = df[target]
 | 
			
		||||
 | 
			
		||||
        if filtered_df.empty:
 | 
			
		||||
            means.append(0.0)
 | 
			
		||||
            means.append(0.)
 | 
			
		||||
        else:
 | 
			
		||||
            means.append(filtered_df[metric].values[0])
 | 
			
		||||
 | 
			
		||||
@ -46,6 +45,7 @@ def get_perf(df, method, model, metric):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_perf_w_std(df, method, model, metric):
 | 
			
		||||
 | 
			
		||||
    if metric in ["TTFT", "ITL"]:
 | 
			
		||||
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
 | 
			
		||||
        mean = mean.tolist()
 | 
			
		||||
@ -60,8 +60,7 @@ def get_perf_w_std(df, method, model, metric):
 | 
			
		||||
    else:
 | 
			
		||||
        assert metric == "Tput"
 | 
			
		||||
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
 | 
			
		||||
            df, method, model, "Output Tput (tok/s)"
 | 
			
		||||
        )
 | 
			
		||||
            df, method, model, "Output Tput (tok/s)")
 | 
			
		||||
        mean = mean.tolist()
 | 
			
		||||
        std = None
 | 
			
		||||
 | 
			
		||||
@ -81,17 +80,18 @@ def main(args):
 | 
			
		||||
    # generate markdown table
 | 
			
		||||
    df = pd.DataFrame.from_dict(results)
 | 
			
		||||
 | 
			
		||||
    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
 | 
			
		||||
    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
 | 
			
		||||
 | 
			
		||||
    with open(args.description) as f:
 | 
			
		||||
        description = f.read()
 | 
			
		||||
 | 
			
		||||
    description = description.format(nightly_results_benchmarking_table=md_table)
 | 
			
		||||
    description = description.format(
 | 
			
		||||
        nightly_results_benchmarking_table=md_table)
 | 
			
		||||
 | 
			
		||||
    with open("nightly_results.md", "w") as f:
 | 
			
		||||
        f.write(description)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
if __name__ == '__main__':
    args = parse_arguments()
    main(args)

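The summary script above treats the file passed via `--description` as a plain `str.format` template, so the only placeholder it has to contain is `{nightly_results_benchmarking_table}`. A small sketch of that substitution (the template text here is made up for illustration; the real one ships with the benchmark suite):

```python
md_table = "| Test name | Tput (req/s) |\n|---|---|\n| serving_llama8B_tp1_qps_4 | 12.3 |"

# Hypothetical description template; in the pipeline it is read from args.description.
description = "## Nightly benchmark results\n\n{nightly_results_benchmarking_table}\n"

with open("nightly_results.md", "w") as f:
    f.write(description.format(nightly_results_benchmarking_table=md_table))
```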
@ -1,5 +1,4 @@
 | 
			
		||||
# SPDX-License-Identifier: Apache-2.0
 | 
			
		||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 | 
			
		||||
 | 
			
		||||
from lmdeploy.serve.openai.api_client import APIClient
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -10,38 +10,15 @@ set -x
set -o pipefail

check_gpus() {
  if command -v nvidia-smi; then
    # check the number of GPUs and GPU type.
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  fi

  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  if command -v nvidia-smi; then
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  fi
  echo "GPU type is $gpu_type"
}

check_cpus() {
  # check the number of CPUs and NUMA Node and GPU type.
  declare -g numa_count=$(python3 -c  "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
    echo $numa_count
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="cpu"
  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  echo "GPU type is $gpu_type"
}

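The NUMA probe in `check_cpus` shells out to Python; the one-liner it runs is equivalent to the snippet below (this assumes the `numa` module from the py-libnuma package is available on the host, which the CPU benchmark environment is presumably set up to provide):

```python
# Equivalent of the inline `python3 -c` call used to set $numa_count.
from numa import info

numa_size = info.get_num_configured_nodes()
print(numa_size)  # check_cpus captures this value into the numa_count variable
```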
@ -83,22 +60,6 @@ json2args() {
  echo "$args"
}

json2envs() {
  # transforms the JSON string to environment variables.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map((.key ) + "=" + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

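Both helpers are thin wrappers around `jq`. For anyone porting the pipeline away from bash, the same transformations expressed in Python look roughly like this (a sketch, not part of the scripts; the underscore-to-dash flag mapping for `json2args` is an assumption based on how parameter names are written in the test JSON files):

```python
import json


def json_to_args(json_string: str) -> str:
    # Assumed json2args behaviour: {"tensor_parallel_size": 1} -> "--tensor-parallel-size 1"
    params = json.loads(json_string)
    return " ".join(f"--{key.replace('_', '-')} {value}" for key, value in params.items())


def json_to_envs(json_string: str) -> str:
    # Mirrors json2envs: {"VLLM_CPU_KVCACHE_SPACE": 5} -> "VLLM_CPU_KVCACHE_SPACE=5"
    params = json.loads(json_string)
    return " ".join(f"{key}={value}" for key, value in params.items())


print(json_to_args('{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "tensor_parallel_size": 1}'))
print(json_to_envs('{"VLLM_CPU_KVCACHE_SPACE": 5}'))
```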
wait_for_server() {
 | 
			
		||||
  # wait for vllm server to start
 | 
			
		||||
  # return 1 if vllm server crashes
 | 
			
		||||
@ -129,15 +90,9 @@ kill_gpu_processes() {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  # wait until GPU memory usage smaller than 1GB
 | 
			
		||||
  if command -v nvidia-smi; then
 | 
			
		||||
    while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
 | 
			
		||||
      sleep 1
 | 
			
		||||
    done
 | 
			
		||||
  elif command -v amd-smi; then
 | 
			
		||||
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
 | 
			
		||||
      sleep 1
 | 
			
		||||
    done
 | 
			
		||||
  fi
 | 
			
		||||
  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
 | 
			
		||||
    sleep 1
 | 
			
		||||
  done
 | 
			
		||||
 | 
			
		||||
  # remove vllm config file
 | 
			
		||||
  rm -rf ~/.config/vllm
 | 
			
		||||
@ -188,24 +143,15 @@ run_latency_tests() {
 | 
			
		||||
    # get arguments
 | 
			
		||||
    latency_params=$(echo "$params" | jq -r '.parameters')
 | 
			
		||||
    latency_args=$(json2args "$latency_params")
 | 
			
		||||
    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
 | 
			
		||||
    latency_envs=$(json2envs "$latency_environment_variables")
 | 
			
		||||
 | 
			
		||||
    # check if there is enough GPU to run the test
 | 
			
		||||
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
 | 
			
		||||
    if [ "$ON_CPU" == "1" ];then
 | 
			
		||||
      if [[ $numa_count -lt $tp ]]; then
 | 
			
		||||
        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
 | 
			
		||||
        continue
 | 
			
		||||
      fi
 | 
			
		||||
    else
 | 
			
		||||
      if [[ $gpu_count -lt $tp ]]; then
 | 
			
		||||
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
 | 
			
		||||
        continue
 | 
			
		||||
      fi
 | 
			
		||||
    if [[ $gpu_count -lt $tp ]]; then
 | 
			
		||||
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
 | 
			
		||||
      continue
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    latency_command=" $latency_envs python3 benchmark_latency.py \
 | 
			
		||||
    latency_command="python3 benchmark_latency.py \
 | 
			
		||||
      --output-json $RESULTS_FOLDER/${test_name}.json \
 | 
			
		||||
      $latency_args"
 | 
			
		||||
 | 
			
		||||
@ -255,24 +201,15 @@ run_throughput_tests() {
 | 
			
		||||
    # get arguments
 | 
			
		||||
    throughput_params=$(echo "$params" | jq -r '.parameters')
 | 
			
		||||
    throughput_args=$(json2args "$throughput_params")
 | 
			
		||||
    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
 | 
			
		||||
    throughput_envs=$(json2envs "$throughput_environment_variables")
 | 
			
		||||
 | 
			
		||||
    # check if there is enough GPU to run the test
 | 
			
		||||
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
 | 
			
		||||
    if [ "$ON_CPU" == "1" ];then
 | 
			
		||||
      if [[ $numa_count -lt $tp ]]; then
 | 
			
		||||
        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
 | 
			
		||||
        continue
 | 
			
		||||
      fi
 | 
			
		||||
    else
 | 
			
		||||
      if [[ $gpu_count -lt $tp ]]; then
 | 
			
		||||
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
 | 
			
		||||
        continue
 | 
			
		||||
      fi
 | 
			
		||||
    if [[ $gpu_count -lt $tp ]]; then
 | 
			
		||||
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
 | 
			
		||||
      continue
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    throughput_command=" $throughput_envs python3 benchmark_throughput.py \
 | 
			
		||||
    throughput_command="python3 benchmark_throughput.py \
 | 
			
		||||
      --output-json $RESULTS_FOLDER/${test_name}.json \
 | 
			
		||||
      $throughput_args"
 | 
			
		||||
 | 
			
		||||
@ -320,27 +257,18 @@ run_serving_tests() {
 | 
			
		||||
 | 
			
		||||
    # get client and server arguments
 | 
			
		||||
    server_params=$(echo "$params" | jq -r '.server_parameters')
 | 
			
		||||
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
 | 
			
		||||
    client_params=$(echo "$params" | jq -r '.client_parameters')
 | 
			
		||||
    server_args=$(json2args "$server_params")
 | 
			
		||||
    server_envs=$(json2envs "$server_envs")
 | 
			
		||||
    client_args=$(json2args "$client_params")
 | 
			
		||||
    qps_list=$(echo "$params" | jq -r '.qps_list')
 | 
			
		||||
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
 | 
			
		||||
    echo "Running over qps list $qps_list"
 | 
			
		||||
 | 
			
		||||
    # check if there is enough resources to run the test
 | 
			
		||||
    # check if there is enough GPU to run the test
 | 
			
		||||
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
 | 
			
		||||
    if [ "$ON_CPU" == "1" ];then
 | 
			
		||||
      if [[ $numa_count -lt $tp ]]; then
 | 
			
		||||
        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
 | 
			
		||||
        continue
 | 
			
		||||
      fi
 | 
			
		||||
    else
 | 
			
		||||
      if [[ $gpu_count -lt $tp ]]; then
 | 
			
		||||
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
 | 
			
		||||
        continue
 | 
			
		||||
      fi
 | 
			
		||||
    if [[ $gpu_count -lt $tp ]]; then
 | 
			
		||||
      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
 | 
			
		||||
      continue
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    # check if server model and client model is aligned
 | 
			
		||||
@ -351,33 +279,23 @@ run_serving_tests() {
 | 
			
		||||
      continue
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    server_command="$server_envs python3 \
 | 
			
		||||
    server_command="python3 \
 | 
			
		||||
      -m vllm.entrypoints.openai.api_server \
 | 
			
		||||
      $server_args"
 | 
			
		||||
 | 
			
		||||
    # run the server
 | 
			
		||||
    echo "Running test case $test_name"
 | 
			
		||||
    echo "Server command: $server_command"
 | 
			
		||||
    # support remote vllm server
 | 
			
		||||
    client_remote_args=""
 | 
			
		||||
    if [[ -z "${REMOTE_HOST}" ]]; then
 | 
			
		||||
      bash -c "$server_command" &
 | 
			
		||||
      server_pid=$!
 | 
			
		||||
      # wait until the server is alive
 | 
			
		||||
      if wait_for_server; then
 | 
			
		||||
        echo ""
 | 
			
		||||
        echo "vLLM server is up and running."
 | 
			
		||||
      else
 | 
			
		||||
        echo ""
 | 
			
		||||
        echo "vLLM failed to start within the timeout period."
 | 
			
		||||
      fi
 | 
			
		||||
    bash -c "$server_command" &
 | 
			
		||||
    server_pid=$!
 | 
			
		||||
 | 
			
		||||
    # wait until the server is alive
 | 
			
		||||
    if wait_for_server; then
 | 
			
		||||
      echo ""
 | 
			
		||||
      echo "vllm server is up and running."
 | 
			
		||||
    else
 | 
			
		||||
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
 | 
			
		||||
      if [[ ${REMOTE_PORT} ]]; then
 | 
			
		||||
        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
 | 
			
		||||
      else
 | 
			
		||||
        client_remote_args=" --host=$REMOTE_HOST "
 | 
			
		||||
      fi
 | 
			
		||||
      echo ""
 | 
			
		||||
      echo "vllm failed to start within the timeout period."
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    # iterate over different QPS
 | 
			
		||||
@ -399,7 +317,7 @@ run_serving_tests() {
 | 
			
		||||
        --result-filename ${new_test_name}.json \
 | 
			
		||||
        --request-rate $qps \
 | 
			
		||||
        --metadata "tensor_parallel_size=$tp" \
 | 
			
		||||
        $client_args $client_remote_args "
 | 
			
		||||
        $client_args"
 | 
			
		||||
 | 
			
		||||
      echo "Running test case $test_name with qps $qps"
 | 
			
		||||
      echo "Client command: $client_command"
 | 
			
		||||
@ -427,14 +345,7 @@ run_serving_tests() {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
main() {
 | 
			
		||||
  local ARCH
 | 
			
		||||
  ARCH=''
 | 
			
		||||
  if [ "$ON_CPU" == "1" ];then
 | 
			
		||||
     check_cpus
 | 
			
		||||
     ARCH='-cpu'
 | 
			
		||||
  else
 | 
			
		||||
     check_gpus
 | 
			
		||||
  fi
 | 
			
		||||
  check_gpus
 | 
			
		||||
  check_hf_token
 | 
			
		||||
 | 
			
		||||
  # Set to v1 to run v1 benchmark
 | 
			
		||||
@ -460,9 +371,9 @@ main() {
 | 
			
		||||
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 | 
			
		||||
 | 
			
		||||
  # benchmarking
 | 
			
		||||
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
 | 
			
		||||
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
 | 
			
		||||
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"
 | 
			
		||||
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
 | 
			
		||||
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
 | 
			
		||||
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
 | 
			
		||||
 | 
			
		||||
  # postprocess benchmarking results
 | 
			
		||||
  pip install tabulate pandas
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,4 @@
 | 
			
		||||
# SPDX-License-Identifier: Apache-2.0
 | 
			
		||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 | 
			
		||||
 | 
			
		||||
import datetime
 | 
			
		||||
import json
 | 
			
		||||
@ -35,8 +34,10 @@ serving_column_mapping = {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
 | 
			
		||||
    # collect results
 | 
			
		||||
    for test_file in results_folder.glob("*.json"):
 | 
			
		||||
 | 
			
		||||
        with open(test_file) as f:
 | 
			
		||||
            raw_result = json.loads(f.read())
 | 
			
		||||
 | 
			
		||||
@ -55,16 +56,17 @@ if __name__ == "__main__":
 | 
			
		||||
    serving_results = pd.DataFrame.from_dict(serving_results)
 | 
			
		||||
 | 
			
		||||
    if not serving_results.empty:
 | 
			
		||||
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
 | 
			
		||||
            columns=serving_column_mapping
 | 
			
		||||
        )
 | 
			
		||||
        serving_results = serving_results[list(
 | 
			
		||||
            serving_column_mapping.keys())].rename(
 | 
			
		||||
                columns=serving_column_mapping)
 | 
			
		||||
 | 
			
		||||
    serving_md_table_with_headers = tabulate(
 | 
			
		||||
        serving_results, headers="keys", tablefmt="pipe", showindex=False
 | 
			
		||||
    )
 | 
			
		||||
    serving_md_table_with_headers = tabulate(serving_results,
 | 
			
		||||
                                             headers='keys',
 | 
			
		||||
                                             tablefmt='pipe',
 | 
			
		||||
                                             showindex=False)
 | 
			
		||||
    # remove the first line of header
 | 
			
		||||
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
 | 
			
		||||
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
 | 
			
		||||
    serving_md_table_lines = serving_md_table_with_headers.split('\n')
 | 
			
		||||
    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
 | 
			
		||||
 | 
			
		||||
    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
 | 
			
		||||
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
 | 
			
		||||
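Dropping the first two lines of the tabulate output above removes the header row and the `pipe`-format separator, leaving only data rows that can later be stitched together under one shared header. A quick sketch of the effect:

```python
from tabulate import tabulate

rows = [{"Test name": "serving_llama8B_tp1_qps_4", "Tput (req/s)": 12.3}]
table = tabulate(rows, headers="keys", tablefmt="pipe", showindex=False)
# table starts with a "| Test name ... |" header line and a "|:---|---:|" separator,
# followed by one row per result.
body_only = "\n".join(table.split("\n")[2:])
print(body_only)
```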
@ -74,9 +76,10 @@ if __name__ == "__main__":
 | 
			
		||||
        # document results with header.
 | 
			
		||||
        # for those who wants to reproduce our benchmark.
 | 
			
		||||
        f.write(serving_md_table_with_headers)
 | 
			
		||||
        f.write("\n")
 | 
			
		||||
        f.write('\n')
 | 
			
		||||
 | 
			
		||||
    # document benchmarking results in json
 | 
			
		||||
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
 | 
			
		||||
        results = serving_results.to_dict(orient="records")
 | 
			
		||||
 | 
			
		||||
        results = serving_results.to_dict(orient='records')
 | 
			
		||||
        f.write(json.dumps(results))
 | 
			
		||||
 | 
			
		||||
@ -1,30 +0,0 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "latency_llama8B_tp1",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "num_iters_warmup": 5,
 | 
			
		||||
            "num_iters": 15
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "latency_llama8B_tp4",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "num_iters_warmup": 5,
 | 
			
		||||
            "num_iters": 15
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
@ -1,158 +0,0 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp1_sharegpt",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "disable_log_requests": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
	    "max_concurrency": 60,
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp2_sharegpt",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "disable_log_requests": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
	    "max_concurrency": 60,
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp4_sharegpt",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "disable_log_requests": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
	    "max_concurrency": 60,
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp4_random_1024_128",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "disable_log_requests": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 1024,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
	    "max_concurrency": 100,
 | 
			
		||||
            "num_prompts": 100
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_pp6_random_1024_128",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "pipeline_parallel_size": 6,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "disable_log_requests": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 1024,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
	    "max_concurrency": 100,
 | 
			
		||||
            "num_prompts": 100
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
@ -63,12 +63,10 @@
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
 | 
			
		||||
            "disable_log_requests": "", 
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
            "swap_space": 16,
 | 
			
		||||
            "speculative_config": {
 | 
			
		||||
                "model": "turboderp/Qwama-0.5B-Instruct",
 | 
			
		||||
                "num_speculative_tokens": 4,
 | 
			
		||||
                "draft_tensor_parallel_size": 1
 | 
			
		||||
            }
 | 
			
		||||
            "swap_space": 16, 
 | 
			
		||||
            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
 | 
			
		||||
            "num_speculative_tokens": 4,
 | 
			
		||||
            "speculative_draft_tensor_parallel_size": 1
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
 | 
			
		||||
 | 
			
		||||
@ -1,32 +0,0 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "throughput_llama8B_tp1",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200,
 | 
			
		||||
            "backend": "vllm"
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "throughput_llama8B_tp4",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200,
 | 
			
		||||
            "backend": "vllm"
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
@ -1,46 +0,0 @@
 | 
			
		||||
# This local pyproject file is part of the migration from yapf to ruff format.
 | 
			
		||||
# It uses the same core rules as the main pyproject.toml file, but with the
 | 
			
		||||
# following differences:
 | 
			
		||||
# - ruff line length is overridden to 88
 | 
			
		||||
# - deprecated typing ignores (UP006, UP035) have been removed
 | 
			
		||||
 | 
			
		||||
[tool.ruff]
 | 
			
		||||
line-length = 88
 | 
			
		||||
 | 
			
		||||
[tool.ruff.lint.per-file-ignores]
 | 
			
		||||
"vllm/third_party/**" = ["ALL"]
 | 
			
		||||
"vllm/version.py" = ["F401"]
 | 
			
		||||
"vllm/_version.py" = ["ALL"]
 | 
			
		||||
 | 
			
		||||
[tool.ruff.lint]
 | 
			
		||||
select = [
 | 
			
		||||
    # pycodestyle
 | 
			
		||||
    "E",
 | 
			
		||||
    # Pyflakes
 | 
			
		||||
    "F",
 | 
			
		||||
    # pyupgrade
 | 
			
		||||
    "UP",
 | 
			
		||||
    # flake8-bugbear
 | 
			
		||||
    "B",
 | 
			
		||||
    # flake8-simplify
 | 
			
		||||
    "SIM",
 | 
			
		||||
    # isort
 | 
			
		||||
    "I",
 | 
			
		||||
    # flake8-logging-format
 | 
			
		||||
    "G",
 | 
			
		||||
]
 | 
			
		||||
ignore = [
 | 
			
		||||
    # star imports
 | 
			
		||||
    "F405", "F403",
 | 
			
		||||
    # lambda expression assignment
 | 
			
		||||
    "E731",
 | 
			
		||||
    # Loop control variable not used within loop body
 | 
			
		||||
    "B007",
 | 
			
		||||
    # f-string format
 | 
			
		||||
    "UP032",
 | 
			
		||||
    # Can remove once 3.10+ is the minimum Python version
 | 
			
		||||
    "UP007",
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[tool.ruff.format]
 | 
			
		||||
docstring-code-format = true
 | 
			
		||||
@ -1,25 +1,23 @@
 | 
			
		||||
steps:
 | 
			
		||||
  - label: "Build wheel - CUDA 12.8"
 | 
			
		||||
    id: build-wheel-cuda-12-8
 | 
			
		||||
  - label: "Build wheel - CUDA 12.4"
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
 | 
			
		||||
      - "mkdir artifacts"
 | 
			
		||||
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
 | 
			
		||||
      - "bash .buildkite/scripts/upload-wheels.sh"
 | 
			
		||||
      - "bash .buildkite/upload-wheels.sh"
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  - label: "Build wheel - CUDA 12.6"
 | 
			
		||||
    id: build-wheel-cuda-12-6
 | 
			
		||||
  - label: "Build wheel - CUDA 12.1"
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
 | 
			
		||||
      - "mkdir artifacts"
 | 
			
		||||
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
 | 
			
		||||
      - "bash .buildkite/scripts/upload-wheels.sh"
 | 
			
		||||
      - "bash .buildkite/upload-wheels.sh"
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
@ -30,14 +28,13 @@ steps:
 | 
			
		||||
 | 
			
		||||
  - label: "Build wheel - CUDA 11.8"
 | 
			
		||||
    # depends_on: block-build-cu118-wheel
 | 
			
		||||
    id: build-wheel-cuda-11-8
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
 | 
			
		||||
      - "mkdir artifacts"
 | 
			
		||||
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
 | 
			
		||||
      - "bash .buildkite/scripts/upload-wheels.sh"
 | 
			
		||||
      - "bash .buildkite/upload-wheels.sh"
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
@ -47,49 +44,33 @@ steps:
 | 
			
		||||
 | 
			
		||||
  - label: "Build release image"
 | 
			
		||||
    depends_on: block-release-image-build
 | 
			
		||||
    id: build-release-image
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
 | 
			
		||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 | 
			
		||||
 | 
			
		||||
  - label: "Annotate release workflow"
 | 
			
		||||
    depends_on:
 | 
			
		||||
      - build-release-image
 | 
			
		||||
      - build-wheel-cuda-12-8
 | 
			
		||||
      - build-wheel-cuda-12-6
 | 
			
		||||
      - build-wheel-cuda-11-8
 | 
			
		||||
    id: annotate-release-workflow
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "bash .buildkite/scripts/annotate-release.sh"
 | 
			
		||||
 | 
			
		||||
  - label: "Build and publish TPU release image"
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
    if: build.env("NIGHTLY") == "1"
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: tpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "yes | docker system prune -a"
 | 
			
		||||
      - "git fetch --all"
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
 | 
			
		||||
      - "docker push vllm/vllm-tpu:nightly"
 | 
			
		||||
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
 | 
			
		||||
    plugins:
 | 
			
		||||
      - docker-login#v3.0.0:
 | 
			
		||||
          username: vllmbot
 | 
			
		||||
          username: vllm
 | 
			
		||||
          password-env: DOCKERHUB_TOKEN
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  - input: "Provide Release version here"
 | 
			
		||||
    id: input-release-version
 | 
			
		||||
    fields:
 | 
			
		||||
      - text: "What is the release version?"
 | 
			
		||||
        key: release-version
 | 
			
		||||
        key: "release-version"
 | 
			
		||||
 | 
			
		||||
  - block: "Build CPU release image"
 | 
			
		||||
    key: block-cpu-release-image-build
 | 
			
		||||
@ -101,24 +82,7 @@ steps:
 | 
			
		||||
      queue: cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
 | 
			
		||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain -f Dockerfile.cpu ."
 | 
			
		||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  - block: "Build Neuron release image"
 | 
			
		||||
    key: block-neuron-release-image-build
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
 | 
			
		||||
  - label: "Build and publish Neuron release image"
 | 
			
		||||
    depends_on: block-neuron-release-image-build
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: neuron-postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
 | 
			
		||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
 | 
			
		||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
@ -3,9 +3,6 @@
 | 
			
		||||
# This script runs test inside the corresponding ROCm docker container.
 | 
			
		||||
set -o pipefail
 | 
			
		||||
 | 
			
		||||
# Export Python path
 | 
			
		||||
export PYTHONPATH=".."
 | 
			
		||||
 | 
			
		||||
# Print ROCm version
 | 
			
		||||
echo "--- Confirming Clean Initial State"
 | 
			
		||||
while true; do
 | 
			
		||||
@ -77,105 +74,50 @@ HF_MOUNT="/root/.cache/huggingface"
 | 
			
		||||
 | 
			
		||||
commands=$@
 | 
			
		||||
echo "Commands:$commands"
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
 | 
			
		||||
  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#ignore certain kernels tests
 | 
			
		||||
if [[ $commands == *" kernels/core"* ]]; then
 | 
			
		||||
if [[ $commands == *" kernels "* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/core/test_fused_quant_layernorm.py \
 | 
			
		||||
  --ignore=kernels/core/test_permute_cols.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/attention"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/attention/test_attention_selector.py \
 | 
			
		||||
  --ignore=kernels/attention/test_blocksparse_attention.py \
 | 
			
		||||
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_flashinfer.py \
 | 
			
		||||
  --ignore=kernels/attention/test_prefix_prefill.py \
 | 
			
		||||
  --ignore=kernels/attention/test_cascade_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_mha_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_lightning_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_attention.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/quantization"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/quantization/test_int8_quant.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_aqlm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_machete_mm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_block_fp8.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_block_int8.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_marlin_gemm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_int8_kernel.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/mamba"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/mamba/test_mamba_mixer2.py \
 | 
			
		||||
  --ignore=kernels/mamba/test_causal_conv1d.py \
 | 
			
		||||
  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/moe"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/moe/test_moe.py \
 | 
			
		||||
  --ignore=kernels/moe/test_cutlass_moe.py \
 | 
			
		||||
  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
 | 
			
		||||
  --ignore=kernels/test_attention_selector.py \
 | 
			
		||||
  --ignore=kernels/test_blocksparse_attention.py \
 | 
			
		||||
  --ignore=kernels/test_causal_conv1d.py \
 | 
			
		||||
  --ignore=kernels/test_cutlass.py \
 | 
			
		||||
  --ignore=kernels/test_encoder_decoder_attn.py \
 | 
			
		||||
  --ignore=kernels/test_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/test_flashinfer.py \
 | 
			
		||||
  --ignore=kernels/test_int8_quant.py \
 | 
			
		||||
  --ignore=kernels/test_machete_gemm.py \
 | 
			
		||||
  --ignore=kernels/test_mamba_ssm.py \
 | 
			
		||||
  --ignore=kernels/test_marlin_gemm.py \
 | 
			
		||||
  --ignore=kernels/test_moe.py \
 | 
			
		||||
  --ignore=kernels/test_prefix_prefill.py \
 | 
			
		||||
  --ignore=kernels/test_rand.py \
 | 
			
		||||
  --ignore=kernels/test_sampler.py \
 | 
			
		||||
  --ignore=kernels/test_cascade_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/test_mamba_mixer2.py \
 | 
			
		||||
  --ignore=kernels/test_aqlm.py \
 | 
			
		||||
  --ignore=kernels/test_machete_mm.py \
 | 
			
		||||
  --ignore=kernels/test_mha_attn.py \
 | 
			
		||||
  --ignore=kernels/test_block_fp8.py \
 | 
			
		||||
  --ignore=kernels/test_permute_cols.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#ignore certain Entrypoints/openai tests
 | 
			
		||||
if [[ $commands == *" entrypoints/openai "* ]]; then
 | 
			
		||||
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
 | 
			
		||||
  --ignore=entrypoints/openai/test_audio.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_chat.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_shutdown.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_completion.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_sleep.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_models.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_lora_adapters.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_root_path.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_tokenization.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_prompt_validation.py "}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#ignore certain Entrypoints/llm tests
 | 
			
		||||
if [[ $commands == *" entrypoints/llm "* ]]; then
 | 
			
		||||
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
 | 
			
		||||
  --ignore=entrypoints/llm/test_chat.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_accuracy.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_init.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 | 
			
		||||
if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
 | 
			
		||||
  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#Obsolete currently
 | 
			
		||||
##ignore certain Entrypoints/llm tests
 | 
			
		||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
 | 
			
		||||
#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 | 
			
		||||
#fi
 | 
			
		||||
 | 
			
		||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
 | 
			
		||||
# --ignore=entrypoints/openai/test_embedding.py \
 | 
			
		||||
# --ignore=entrypoints/openai/test_oot_registration.py
 | 
			
		||||
@ -184,8 +126,6 @@ fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
PARALLEL_JOB_COUNT=8
 | 
			
		||||
MYPYTHONPATH=".."
 | 
			
		||||
 | 
			
		||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. 
 | 
			
		||||
if [[ $commands == *"--shard-id="* ]]; then
 | 
			
		||||
  # assign job count as the number of shards used   
 | 
			
		||||
@ -194,10 +134,9 @@ if [[ $commands == *"--shard-id="* ]]; then
 | 
			
		||||
    # assign shard-id for each shard
 | 
			
		||||
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
 | 
			
		||||
    echo "Shard ${GPU} commands:$commands_gpu"
 | 
			
		||||
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
 | 
			
		||||
    docker run \
 | 
			
		||||
        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
 | 
			
		||||
        --network=host \
 | 
			
		||||
        --device /dev/kfd --device /dev/dri \
 | 
			
		||||
        --network host \
 | 
			
		||||
        --shm-size=16gb \
 | 
			
		||||
        --rm \
 | 
			
		||||
        -e HIP_VISIBLE_DEVICES="${GPU}" \
 | 
			
		||||
@ -206,7 +145,6 @@ if [[ $commands == *"--shard-id="* ]]; then
 | 
			
		||||
        -e AWS_SECRET_ACCESS_KEY \
 | 
			
		||||
        -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
        -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
        -e "PYTHONPATH=${MYPYTHONPATH}" \
 | 
			
		||||
        --name "${container_name}_${GPU}" \
 | 
			
		||||
        "${image_name}" \
 | 
			
		||||
        /bin/bash -c "${commands_gpu}" \
 | 
			
		||||
@ -225,10 +163,9 @@ if [[ $commands == *"--shard-id="* ]]; then
 | 
			
		||||
    fi
 | 
			
		||||
  done
 | 
			
		||||
else
 | 
			
		||||
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
 | 
			
		||||
  docker run \
 | 
			
		||||
          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
 | 
			
		||||
          --network=host \
 | 
			
		||||
          --device /dev/kfd --device /dev/dri \
 | 
			
		||||
          --network host \
 | 
			
		||||
          --shm-size=16gb \
 | 
			
		||||
          --rm \
 | 
			
		||||
          -e HIP_VISIBLE_DEVICES=0 \
 | 
			
		||||
@ -237,7 +174,6 @@ else
 | 
			
		||||
          -e AWS_SECRET_ACCESS_KEY \
 | 
			
		||||
          -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
          -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
          -e "PYTHONPATH=${MYPYTHONPATH}" \
 | 
			
		||||
          --name "${container_name}" \
 | 
			
		||||
          "${image_name}" \
 | 
			
		||||
          /bin/bash -c "${commands}"
 | 
			
		||||
@ -5,8 +5,8 @@
 | 
			
		||||
set -ex
 | 
			
		||||
set -o pipefail
 | 
			
		||||
 | 
			
		||||
# cd 2 levels into the working directory
 | 
			
		||||
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 | 
			
		||||
# cd into parent directory of this file
 | 
			
		||||
cd "$(dirname "${BASH_SOURCE[0]}")/.."
 | 
			
		||||
 | 
			
		||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 | 
			
		||||
 | 
			
		||||
@ -10,4 +10,5 @@ trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
docker build -t cpu-test -f docker/Dockerfile.s390x .
 | 
			
		||||
docker build -t cpu-test -f Dockerfile.ppc64le .
 | 
			
		||||
 | 
			
		||||
							
								
								
									
.buildkite/run-cpu-test.sh (new file, 92 lines)
@ -0,0 +1,92 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# allow to bind to different cores
 | 
			
		||||
CORE_RANGE=${CORE_RANGE:-48-95}
 | 
			
		||||
NUMA_NODE=${NUMA_NODE:-1}
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Run the image, setting --shm-size=4g for tensor parallel.
 | 
			
		||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
 | 
			
		||||
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 | 
			
		||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
 | 
			
		||||
 --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 | 
			
		||||
 | 
			
		||||
function cpu_tests() {
 | 
			
		||||
  set -e
 | 
			
		||||
  export NUMA_NODE=$2
 | 
			
		||||
  export BUILDKITE_BUILD_NUMBER=$3
 | 
			
		||||
 | 
			
		||||
  # offline inference
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run basic model test
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip install -r vllm/requirements/test.txt
 | 
			
		||||
    pip install -r vllm/requirements/cpu.txt
 | 
			
		||||
    pytest -v -s tests/kernels/test_cache.py -m cpu_model
 | 
			
		||||
    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/decoder_only/language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/embedding/language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 | 
			
		||||
 | 
			
		||||
  # Run compressed-tensor test
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 | 
			
		||||
 | 
			
		||||
  # Run AWQ test
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/quantization/test_ipex_quant.py"
 | 
			
		||||
 | 
			
		||||
  # Run chunked-prefill and prefix-cache test
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v -k cpu_model \
 | 
			
		||||
    tests/basic_correctness/test_chunked_prefill.py"  
 | 
			
		||||
 | 
			
		||||
  # online serving
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    export VLLM_CPU_KVCACHE_SPACE=10 
 | 
			
		||||
    export VLLM_CPU_OMP_THREADS_BIND=$1
 | 
			
		||||
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
 | 
			
		||||
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 | 
			
		||||
    python3 benchmarks/benchmark_serving.py \
 | 
			
		||||
      --backend vllm \
 | 
			
		||||
      --dataset-name random \
 | 
			
		||||
      --model facebook/opt-125m \
 | 
			
		||||
      --num-prompts 20 \
 | 
			
		||||
      --endpoint /v1/completions \
 | 
			
		||||
      --tokenizer facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run multi-lora tests
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/lora/test_qwen2vl.py"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# All of CPU tests are expected to be finished less than 40 mins.
 | 
			
		||||
export -f cpu_tests
 | 
			
		||||
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
 | 
			
		||||
@ -9,7 +9,6 @@ python3 use_existing_torch.py
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
DOCKER_BUILDKIT=1 docker build . \
 | 
			
		||||
  --file docker/Dockerfile \
 | 
			
		||||
  --target vllm-openai \
 | 
			
		||||
  --platform "linux/arm64" \
 | 
			
		||||
  -t gh200-test \
 | 
			
		||||
							
								
								
									
.buildkite/run-hpu-test.sh (new file, 24 lines)
@ -0,0 +1,24 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
docker build -t hpu-test-env -f Dockerfile.hpu .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
# certain versions of the HPU software stack have a bug that can
 | 
			
		||||
# override the exit code of the script, so we need to use
 | 
			
		||||
# separate remove_docker_container and remove_docker_container_and_exit
 | 
			
		||||
# functions, while other platforms only need one remove_docker_container
 | 
			
		||||
# function.
 | 
			
		||||
EXITCODE=1
 | 
			
		||||
remove_docker_container() { docker rm -f hpu-test || true; }
 | 
			
		||||
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
 | 
			
		||||
trap remove_docker_container_and_exit EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Run the image and launch offline inference
 | 
			
		||||
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 | 
			
		||||
EXITCODE=$?
 | 
			
		||||
@ -3,7 +3,7 @@
 | 
			
		||||
set -euox pipefail
 | 
			
		||||
 | 
			
		||||
if [[ $# -lt 4 ]]; then
 | 
			
		||||
    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
 | 
			
		||||
    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 | 
			
		||||
HF_CACHE="$(realpath ~)/huggingface"
 | 
			
		||||
mkdir -p "${HF_CACHE}"
 | 
			
		||||
HF_MOUNT="/root/.cache/huggingface"
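# Fetch the Hugging Face token used by CI from AWS Secrets Manager.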
 | 
			
		||||
HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 | 
			
		||||
 | 
			
		||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 | 
			
		||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 | 
			
		||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
 | 
			
		||||
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
 | 
			
		||||
 | 
			
		||||
# prune old image and containers to save disk space, and only once a day
 | 
			
		||||
# by using a timestamp file in tmp.
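# (Illustrative sketch only, not part of this diff: the once-a-day gating
#  described above is typically done by comparing the current time with the
#  timestamp file, roughly like the following.)
# last_build=$(cat /tmp/neuron-docker-build-timestamp 2>/dev/null || echo 0)
# if [ $(( $(date +%s) - last_build )) -gt 86400 ]; then
#     docker system prune -f
#     date "+%s" > /tmp/neuron-docker-build-timestamp
# fi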
 | 
			
		||||
@ -36,7 +35,7 @@ else
 | 
			
		||||
    date "+%s" > /tmp/neuron-docker-build-timestamp
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
 | 
			
		||||
docker build -t "${image_name}" -f Dockerfile.neuron .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() {
 | 
			
		||||
@ -48,17 +47,8 @@ trap remove_docker_container EXIT
 | 
			
		||||
docker run --rm -it --device=/dev/neuron0 --network bridge \
 | 
			
		||||
       -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
       -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
       -e "HF_TOKEN=${HF_TOKEN}" \
 | 
			
		||||
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
 | 
			
		||||
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
 | 
			
		||||
       --name "${container_name}" \
 | 
			
		||||
       ${image_name} \
 | 
			
		||||
       /bin/bash -c "
 | 
			
		||||
            set -e; # Exit on first error
 | 
			
		||||
            python3 /workspace/vllm/examples/offline_inference/neuron.py;
 | 
			
		||||
            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
 | 
			
		||||
            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
 | 
			
		||||
                echo \"Running test file: \$f\";
 | 
			
		||||
                python3 -m pytest \$f -v --capture=tee-sys;
 | 
			
		||||
            done
 | 
			
		||||
       "
 | 
			
		||||
       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
 | 
			
		||||
							
								
								
									
.buildkite/run-tpu-v1-test.sh (40 lines, Executable file)
@ -0,0 +1,40 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -e
 | 
			
		||||
 | 
			
		||||
# Build the docker image.
 | 
			
		||||
docker build -f Dockerfile.tpu -t vllm-tpu .
 | 
			
		||||
 | 
			
		||||
# Set up cleanup.
 | 
			
		||||
remove_docker_container() { docker rm -f tpu-test || true; }
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# For HF_TOKEN.
 | 
			
		||||
source /etc/environment
 | 
			
		||||
# Run a simple end-to-end example.
 | 
			
		||||
docker run --privileged --net host --shm-size=16G -it \
 | 
			
		||||
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
 | 
			
		||||
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
 | 
			
		||||
    && python3 -m pip install pytest \
 | 
			
		||||
    && python3 -m pip install lm_eval[api]==0.4.4 \
 | 
			
		||||
    && export VLLM_USE_V1=1 \
 | 
			
		||||
    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
 | 
			
		||||
    && echo TEST_1 \
 | 
			
		||||
    && pytest /workspace/vllm/tests/tpu/test_compilation.py \
 | 
			
		||||
    && echo TEST_2 \
 | 
			
		||||
    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
 | 
			
		||||
    && echo TEST_3 \
 | 
			
		||||
    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
 | 
			
		||||
    && echo TEST_4 \
 | 
			
		||||
    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
 | 
			
		||||
    && echo TEST_5 \
 | 
			
		||||
    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
 | 
			
		||||
    && echo TEST_6 \
 | 
			
		||||
    && pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py" \
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# TODO: This test fails because it uses RANDOM_SEED sampling
 | 
			
		||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 | 
			
		||||
 | 
			
		||||
@ -8,11 +8,11 @@ image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 | 
			
		||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
 | 
			
		||||
docker build -t ${image_name} -f Dockerfile.xpu .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() {
 | 
			
		||||
  docker rm -f "${container_name}" || true;
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
  docker rm -f "${container_name}" || true; 
 | 
			
		||||
  docker image rm -f "${image_name}" || true;
 | 
			
		||||
  docker system prune -f || true;
 | 
			
		||||
}
 | 
			
		||||
@ -26,9 +26,6 @@ docker run \
 | 
			
		||||
    --name "${container_name}" \
 | 
			
		||||
    "${image_name}" \
 | 
			
		||||
    sh -c '
 | 
			
		||||
    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
 | 
			
		||||
    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
 | 
			
		||||
    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
 | 
			
		||||
    cd tests
 | 
			
		||||
    pytest -v -s v1/core
 | 
			
		||||
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 | 
			
		||||
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 | 
			
		||||
'
 | 
			
		||||
@ -1,31 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Get release version and strip leading 'v' if present
 | 
			
		||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
 | 
			
		||||
 | 
			
		||||
if [ -z "$RELEASE_VERSION" ]; then
 | 
			
		||||
  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 | 
			
		||||
To download the wheel:
 | 
			
		||||
\`\`\`
 | 
			
		||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 | 
			
		||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 | 
			
		||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
 | 
			
		||||
\`\`\`
 | 
			
		||||
 | 
			
		||||
To download and upload the image:
 | 
			
		||||
 | 
			
		||||
\`\`\`
 | 
			
		||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
 | 
			
		||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
 | 
			
		||||
docker tag vllm/vllm-openai vllm/vllm-openai:latest
 | 
			
		||||
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
 | 
			
		||||
docker push vllm/vllm-openai:latest
 | 
			
		||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}
 | 
			
		||||
\`\`\`
 | 
			
		||||
EOF 
 | 
			
		||||
@ -1,17 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
# Usage: ./ci_clean_log.sh ci.log
 | 
			
		||||
# This script strips timestamps and color codes from CI log files.
 | 
			
		||||
 | 
			
		||||
# Check if argument is given
 | 
			
		||||
if [ $# -lt 1 ]; then
 | 
			
		||||
    echo "Usage: $0 ci.log"
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
INPUT_FILE="$1"
 | 
			
		||||
 | 
			
		||||
# Strip timestamps
 | 
			
		||||
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
 | 
			
		||||
 | 
			
		||||
# Strip colorization
 | 
			
		||||
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
 | 
			
		||||
@ -1,49 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() {
 | 
			
		||||
  if [[ -n "$container_id" ]]; then
 | 
			
		||||
      podman stop --all -t0
 | 
			
		||||
      podman rm -f "$container_id" || true
 | 
			
		||||
  fi
 | 
			
		||||
  podman system prune -f
 | 
			
		||||
}
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
 | 
			
		||||
 | 
			
		||||
# Run the image
 | 
			
		||||
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
 | 
			
		||||
 | 
			
		||||
function cpu_tests() {
 | 
			
		||||
 | 
			
		||||
  # offline inference
 | 
			
		||||
  podman exec -it "$container_id" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run basic model test
 | 
			
		||||
  podman exec -it "$container_id" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
 | 
			
		||||
    pip install sentence-transformers datamodel_code_generator
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
 | 
			
		||||
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
 | 
			
		||||
    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# All of the CPU tests are expected to finish in less than 40 mins.
 | 
			
		||||
 | 
			
		||||
export container_id
 | 
			
		||||
export -f cpu_tests
 | 
			
		||||
timeout 40m bash -c cpu_tests
 | 
			
		||||
 | 
			
		||||
@ -1,102 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the CPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# allow to bind to different cores
 | 
			
		||||
CORE_RANGE=${CORE_RANGE:-48-95}
 | 
			
		||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 | 
			
		||||
NUMA_NODE=${NUMA_NODE:-1}
 | 
			
		||||
 | 
			
		||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    set -e; 
 | 
			
		||||
    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
 | 
			
		||||
}
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 | 
			
		||||
 | 
			
		||||
# Run the image, setting --shm-size=4g for tensor parallel.
 | 
			
		||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
 | 
			
		||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 | 
			
		||||
 | 
			
		||||
function cpu_tests() {
 | 
			
		||||
  set -e
 | 
			
		||||
  export NUMA_NODE=$2
 | 
			
		||||
 | 
			
		||||
  # list packages
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip list"
 | 
			
		||||
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip list"
 | 
			
		||||
 | 
			
		||||
  # offline inference
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run basic model test
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    # Note: disabled until V1 is supported
 | 
			
		||||
    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
 | 
			
		||||
    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
 | 
			
		||||
 | 
			
		||||
    # Note: disable Bart until it supports V1
 | 
			
		||||
    pytest -v -s tests/models/language/generation -m cpu_model \
 | 
			
		||||
                --ignore=tests/models/language/generation/test_bart.py
 | 
			
		||||
    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
 | 
			
		||||
                --ignore=tests/models/language/generation/test_bart.py
 | 
			
		||||
 | 
			
		||||
    pytest -v -s tests/models/language/pooling -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/multimodal/generation \
 | 
			
		||||
                --ignore=tests/models/multimodal/generation/test_mllama.py \
 | 
			
		||||
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
 | 
			
		||||
                -m cpu_model"
 | 
			
		||||
 | 
			
		||||
  # Run compressed-tensor test
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" 
 | 
			
		||||
 | 
			
		||||
  # Note: disabled until V1 is supported
 | 
			
		||||
  # Run AWQ test
 | 
			
		||||
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
  #   set -e
 | 
			
		||||
  #   VLLM_USE_V1=0 pytest -s -v \
 | 
			
		||||
  #   tests/quantization/test_ipex_quant.py"
 | 
			
		||||
 | 
			
		||||
  # online serving
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
 | 
			
		||||
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 | 
			
		||||
    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
 | 
			
		||||
      --backend vllm \
 | 
			
		||||
      --dataset-name random \
 | 
			
		||||
      --model facebook/opt-125m \
 | 
			
		||||
      --num-prompts 20 \
 | 
			
		||||
      --endpoint /v1/completions \
 | 
			
		||||
      --tokenizer facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run multi-lora tests
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/lora/test_qwen2vl.py"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# All of the CPU tests are expected to finish in less than 40 mins.
 | 
			
		||||
export -f cpu_tests
 | 
			
		||||
timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
 | 
			
		||||
@ -1,58 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the HPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -exuo pipefail
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
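# The Dockerfile is passed inline on stdin via the heredoc below (hence "-f -").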
 | 
			
		||||
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
 | 
			
		||||
FROM 1.22-413-pt2.7.1:latest
 | 
			
		||||
 | 
			
		||||
COPY ./ /workspace/vllm
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace/vllm
 | 
			
		||||
 | 
			
		||||
RUN pip install -v -r requirements/hpu.txt
 | 
			
		||||
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
 | 
			
		||||
 | 
			
		||||
ENV no_proxy=localhost,127.0.0.1
 | 
			
		||||
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
 | 
			
		||||
 | 
			
		||||
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
 | 
			
		||||
 | 
			
		||||
# install development dependencies (for testing)
 | 
			
		||||
RUN python3 -m pip install -e tests/vllm_test_utils
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace/
 | 
			
		||||
 | 
			
		||||
RUN git clone https://github.com/vllm-project/vllm-gaudi.git
 | 
			
		||||
 | 
			
		||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 | 
			
		||||
 | 
			
		||||
EOF
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
# certain versions of the HPU software stack have a bug that can
 | 
			
		||||
# override the exit code of the script, so we need to use
 | 
			
		||||
# separate remove_docker_containers and remove_docker_containers_and_exit
 | 
			
		||||
# functions, while other platforms only need one remove_docker_container
 | 
			
		||||
# function.
 | 
			
		||||
EXITCODE=1
 | 
			
		||||
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
 | 
			
		||||
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 | 
			
		||||
remove_docker_containers
 | 
			
		||||
 | 
			
		||||
echo "Running HPU plugin v1 test"
 | 
			
		||||
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
 | 
			
		||||
  -e HABANA_VISIBLE_DEVICES=all \
 | 
			
		||||
  hpu-plugin-v1-test-env \
 | 
			
		||||
  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
 | 
			
		||||
 | 
			
		||||
EXITCODE=$?
 | 
			
		||||
if [ $EXITCODE -eq 0 ]; then
 | 
			
		||||
  echo "Test with basic model passed"
 | 
			
		||||
else
 | 
			
		||||
  echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# The trap will handle the container removal and final exit.
 | 
			
		||||
@ -1,187 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -xu
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    docker rm -f tpu-test || true; 
 | 
			
		||||
    docker rm -f vllm-tpu || true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Build the docker image.
 | 
			
		||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 | 
			
		||||
 | 
			
		||||
# Set up cleanup.
 | 
			
		||||
cleanup_docker() {
 | 
			
		||||
  # Get Docker's root directory
 | 
			
		||||
  docker_root=$(docker info -f '{{.DockerRootDir}}')
 | 
			
		||||
  if [ -z "$docker_root" ]; then
 | 
			
		||||
    echo "Failed to determine Docker root directory."
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
  echo "Docker root directory: $docker_root"
 | 
			
		||||
  # Check disk usage of the filesystem where Docker's root directory is located
 | 
			
		||||
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
 | 
			
		||||
  # Define the threshold
 | 
			
		||||
  threshold=70
 | 
			
		||||
  if [ "$disk_usage" -gt "$threshold" ]; then
 | 
			
		||||
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
 | 
			
		||||
    # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
    docker image prune -f
 | 
			
		||||
    # Remove unused volumes / force the system prune for old images as well.
 | 
			
		||||
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
 | 
			
		||||
    echo "Docker images and volumes cleanup completed."
 | 
			
		||||
  else
 | 
			
		||||
    echo "Disk usage is below $threshold%. No cleanup needed."
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
cleanup_docker
 | 
			
		||||
 | 
			
		||||
# For HF_TOKEN.
 | 
			
		||||
source /etc/environment
 | 
			
		||||
 | 
			
		||||
docker run --privileged --net host --shm-size=16G -it \
 | 
			
		||||
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
 | 
			
		||||
    vllm-tpu /bin/bash -c '
 | 
			
		||||
set -e # Exit immediately if a command exits with a non-zero status.
 | 
			
		||||
set -u # Treat unset variables as an error.
 | 
			
		||||
 | 
			
		||||
echo "--- Starting script inside Docker container ---"
 | 
			
		||||
 | 
			
		||||
# Create results directory
 | 
			
		||||
RESULTS_DIR=$(mktemp -d)
 | 
			
		||||
# If mktemp fails, set -e will cause the script to exit.
 | 
			
		||||
echo "Results will be stored in: $RESULTS_DIR"
 | 
			
		||||
 | 
			
		||||
# Install dependencies
 | 
			
		||||
echo "--- Installing Python dependencies ---"
 | 
			
		||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
 | 
			
		||||
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
 | 
			
		||||
    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
 | 
			
		||||
echo "--- Python dependencies installed ---"
 | 
			
		||||
export VLLM_USE_V1=1
 | 
			
		||||
export VLLM_XLA_CHECK_RECOMPILATION=1
 | 
			
		||||
export VLLM_XLA_CACHE_PATH=
 | 
			
		||||
echo "Using VLLM V1"
 | 
			
		||||
 | 
			
		||||
echo "--- Hardware Information ---"
 | 
			
		||||
tpu-info
 | 
			
		||||
echo "--- Starting Tests ---"
 | 
			
		||||
set +e
 | 
			
		||||
overall_script_exit_code=0
 | 
			
		||||
 | 
			
		||||
# --- Test Definitions ---
 | 
			
		||||
# If a test fails, this function will print logs and will not cause the main script to exit.
 | 
			
		||||
run_test() {
 | 
			
		||||
    local test_num=$1
 | 
			
		||||
    local test_name=$2
 | 
			
		||||
    local test_command=$3
 | 
			
		||||
    local log_file="$RESULTS_DIR/test_${test_num}.log"
 | 
			
		||||
    local actual_exit_code
 | 
			
		||||
 | 
			
		||||
    echo "--- TEST_$test_num: Running $test_name ---"
 | 
			
		||||
    
 | 
			
		||||
    # Execute the test command.
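    # Duplicate stdout and stderr to the console and append them to the per-test log via process substitution.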
 | 
			
		||||
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
 | 
			
		||||
    actual_exit_code=$?
 | 
			
		||||
 | 
			
		||||
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
 | 
			
		||||
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
 | 
			
		||||
 | 
			
		||||
    if [ "$actual_exit_code" -ne 0 ]; then
 | 
			
		||||
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
 | 
			
		||||
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
 | 
			
		||||
        if [ -f "$log_file" ]; then
 | 
			
		||||
            cat "$log_file" >&2
 | 
			
		||||
        else
 | 
			
		||||
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
 | 
			
		||||
        fi
 | 
			
		||||
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
 | 
			
		||||
        return "$actual_exit_code" # Return the failure code
 | 
			
		||||
    else
 | 
			
		||||
        echo "TEST_$test_num ($test_name) PASSED."
 | 
			
		||||
        return 0 # Return success
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Helper function to call run_test and update the overall script exit code
 | 
			
		||||
run_and_track_test() {
 | 
			
		||||
    local test_num_arg="$1"
 | 
			
		||||
    local test_name_arg="$2"
 | 
			
		||||
    local test_command_arg="$3"
 | 
			
		||||
 | 
			
		||||
    # Run the test
 | 
			
		||||
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
 | 
			
		||||
    local test_specific_exit_code=$?
 | 
			
		||||
 | 
			
		||||
    # If the test failed, set the overall script exit code to 1
 | 
			
		||||
    if [ "$test_specific_exit_code" -ne 0 ]; then
 | 
			
		||||
        # No need for extra echo here, run_test already logged the failure.
 | 
			
		||||
        overall_script_exit_code=1
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# --- Actual Test Execution ---
 | 
			
		||||
run_and_track_test 0 "test_perf.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
 | 
			
		||||
run_and_track_test 1 "test_compilation.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
 | 
			
		||||
run_and_track_test 2 "test_basic.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
 | 
			
		||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
 | 
			
		||||
run_and_track_test 4 "test_quantization_accuracy.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
 | 
			
		||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
 | 
			
		||||
    "python3 /workspace/vllm/examples/offline_inference/tpu.py"
 | 
			
		||||
run_and_track_test 6 "test_tpu_model_runner.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
 | 
			
		||||
run_and_track_test 7 "test_sampler.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
 | 
			
		||||
run_and_track_test 8 "test_topk_topp_sampler.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
 | 
			
		||||
run_and_track_test 9 "test_multimodal.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
 | 
			
		||||
run_and_track_test 10 "test_pallas.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 | 
			
		||||
run_and_track_test 11 "test_struct_output_generate.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 | 
			
		||||
run_and_track_test 12 "test_moe_pallas.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 | 
			
		||||
run_and_track_test 13 "test_lora.py" \
 | 
			
		||||
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
 | 
			
		||||
run_and_track_test 14 "test_tpu_qkv_linear.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 | 
			
		||||
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 | 
			
		||||
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
 | 
			
		||||
 | 
			
		||||
# After all tests have been attempted, exit with the overall status.
 | 
			
		||||
if [ "$overall_script_exit_code" -ne 0 ]; then
 | 
			
		||||
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
 | 
			
		||||
else
 | 
			
		||||
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
 | 
			
		||||
fi
 | 
			
		||||
exit "$overall_script_exit_code"
 | 
			
		||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
 | 
			
		||||
 | 
			
		||||
# Capture the exit code of the docker run command
 | 
			
		||||
DOCKER_RUN_EXIT_CODE=$?
 | 
			
		||||
 | 
			
		||||
# The trap will run for cleanup.
 | 
			
		||||
# Exit the main script with the Docker run command's exit code.
 | 
			
		||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
 | 
			
		||||
    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
 | 
			
		||||
    exit "$DOCKER_RUN_EXIT_CODE"
 | 
			
		||||
else
 | 
			
		||||
    echo "Docker run command completed successfully."
 | 
			
		||||
    exit 0
 | 
			
		||||
fi
 | 
			
		||||
# TODO: This test fails because it uses RANDOM_SEED sampling
 | 
			
		||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 | 
			
		||||
@ -1,18 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# Usage: ./rerun_test.sh path/to/test.py::test_name
 | 
			
		||||
 | 
			
		||||
# Check if argument is given
 | 
			
		||||
if [ $# -lt 1 ]; then
 | 
			
		||||
    echo "Usage: $0 path/to/test.py::test_name"
 | 
			
		||||
    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
TEST=$1
 | 
			
		||||
COUNT=1
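# Keep re-running the test until it fails; useful for reproducing flaky failures.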
 | 
			
		||||
 | 
			
		||||
while pytest -sv "$TEST"; do
 | 
			
		||||
    COUNT=$((COUNT + 1))
 | 
			
		||||
    echo "RUN NUMBER ${COUNT}"
 | 
			
		||||
done
 | 
			
		||||
@ -1,24 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -euo pipefail
 | 
			
		||||
 | 
			
		||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
 | 
			
		||||
if [ -z "$docker_root" ]; then
 | 
			
		||||
  echo "Failed to determine Docker root directory."
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
echo "Docker root directory: $docker_root"
 | 
			
		||||
# Check disk usage of the filesystem where Docker's root directory is located
 | 
			
		||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
 | 
			
		||||
# Define the threshold
 | 
			
		||||
threshold=70
 | 
			
		||||
if [ "$disk_usage" -gt "$threshold" ]; then
 | 
			
		||||
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
 | 
			
		||||
  # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
  docker image prune -f
 | 
			
		||||
  # Remove unused volumes / force the system prune for old images as well.
 | 
			
		||||
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
 | 
			
		||||
  echo "Docker images and volumes cleanup completed."
 | 
			
		||||
else
 | 
			
		||||
  echo "Disk usage is below $threshold%. No cleanup needed."
 | 
			
		||||
fi
 | 
			
		||||
@ -1,14 +0,0 @@
 | 
			
		||||
# Environment config
 | 
			
		||||
TEST_NAME=llama8b
 | 
			
		||||
CONTAINER_NAME=vllm-tpu
 | 
			
		||||
 | 
			
		||||
# vllm config
 | 
			
		||||
MODEL=meta-llama/Llama-3.1-8B-Instruct
 | 
			
		||||
MAX_NUM_SEQS=256
 | 
			
		||||
MAX_NUM_BATCHED_TOKENS=1024
 | 
			
		||||
TENSOR_PARALLEL_SIZE=1
 | 
			
		||||
MAX_MODEL_LEN=2048
 | 
			
		||||
DOWNLOAD_DIR=/mnt/disks/persist
 | 
			
		||||
EXPECTED_THROUGHPUT=8.0
 | 
			
		||||
INPUT_LEN=1800
 | 
			
		||||
OUTPUT_LEN=128
 | 
			
		||||
@ -1,92 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
if [ ! -f "$1" ]; then
 | 
			
		||||
  echo "Error: The env file '$1' does not exist."
 | 
			
		||||
  exit 1  # Exit the script with a non-zero status to indicate an error
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
ENV_FILE=$1
 | 
			
		||||
 | 
			
		||||
# For testing on local vm, use `set -a` to export all variables
 | 
			
		||||
source /etc/environment
 | 
			
		||||
source $ENV_FILE
 | 
			
		||||
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    docker rm -f tpu-test || true; 
 | 
			
		||||
    docker rm -f vllm-tpu || true;
 | 
			
		||||
    docker rm -f $CONTAINER_NAME || true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
LOG_ROOT=$(mktemp -d)
 | 
			
		||||
# If mktemp fails, set -e will cause the script to exit.
 | 
			
		||||
echo "Results will be stored in: $LOG_ROOT"
 | 
			
		||||
 | 
			
		||||
if [ -z "$HF_TOKEN" ]; then
 | 
			
		||||
  echo "Error: HF_TOKEN is not set or is empty."  
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# Make sure mounted disk or dir exists
 | 
			
		||||
if [ ! -d "$DOWNLOAD_DIR" ]; then
 | 
			
		||||
    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
echo "Run model $MODEL"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
echo "starting docker...$CONTAINER_NAME"
 | 
			
		||||
echo    
 | 
			
		||||
docker run \
 | 
			
		||||
 -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
 | 
			
		||||
 --env-file $ENV_FILE \
 | 
			
		||||
 -e HF_TOKEN="$HF_TOKEN" \
 | 
			
		||||
 -e TARGET_COMMIT=$BUILDKITE_COMMIT \
 | 
			
		||||
 -e MODEL=$MODEL \
 | 
			
		||||
 -e WORKSPACE=/workspace \
 | 
			
		||||
 --name $CONTAINER_NAME \
 | 
			
		||||
 -d \
 | 
			
		||||
 --privileged \
 | 
			
		||||
 --network host \
 | 
			
		||||
 -v /dev/shm:/dev/shm \
 | 
			
		||||
 vllm/vllm-tpu-bm tail -f /dev/null
 | 
			
		||||
 | 
			
		||||
echo "run script..."
 | 
			
		||||
echo
 | 
			
		||||
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
 | 
			
		||||
 | 
			
		||||
echo "copy result back..."
 | 
			
		||||
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
 | 
			
		||||
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
 | 
			
		||||
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" 
 | 
			
		||||
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
 | 
			
		||||
 | 
			
		||||
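# Extract the numeric requests-per-second value from the benchmark log.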
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
 | 
			
		||||
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
 | 
			
		||||
 | 
			
		||||
if [ "$BUILDKITE" = "true" ]; then
 | 
			
		||||
  echo "Running inside Buildkite"
 | 
			
		||||
  buildkite-agent artifact upload "$VLLM_LOG" 
 | 
			
		||||
  buildkite-agent artifact upload "$BM_LOG"
 | 
			
		||||
else
 | 
			
		||||
  echo "Not running inside Buildkite"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# compare the throughput with EXPECTED_THROUGHPUT 
 | 
			
		||||
# and assert meeting the expectation
 | 
			
		||||
# 
 | 
			
		||||
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
 | 
			
		||||
  echo "Failed to get the throughput"
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
 | 
			
		||||
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
@ -1,14 +0,0 @@
 | 
			
		||||
# Environment config
 | 
			
		||||
TEST_NAME=llama8bw8a8
 | 
			
		||||
CONTAINER_NAME=vllm-tpu
 | 
			
		||||
 | 
			
		||||
# vllm config
 | 
			
		||||
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
 | 
			
		||||
MAX_NUM_SEQS=128
 | 
			
		||||
MAX_NUM_BATCHED_TOKENS=1024
 | 
			
		||||
TENSOR_PARALLEL_SIZE=1
 | 
			
		||||
MAX_MODEL_LEN=2048
 | 
			
		||||
DOWNLOAD_DIR=/mnt/disks/persist
 | 
			
		||||
EXPECTED_THROUGHPUT=10.0
 | 
			
		||||
INPUT_LEN=1800
 | 
			
		||||
OUTPUT_LEN=128
 | 
			
		||||
@ -1,94 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -euo pipefail
 | 
			
		||||
 | 
			
		||||
VLLM_LOG="$WORKSPACE/vllm_log.txt"
 | 
			
		||||
BM_LOG="$WORKSPACE/bm_log.txt"
 | 
			
		||||
 | 
			
		||||
if [ -n "$TARGET_COMMIT" ]; then
 | 
			
		||||
  head_hash=$(git rev-parse HEAD)
 | 
			
		||||
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
 | 
			
		||||
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
echo "model: $MODEL"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# create a log folder
 | 
			
		||||
#
 | 
			
		||||
mkdir "$WORKSPACE/log"
 | 
			
		||||
 | 
			
		||||
# TODO: Move to image building.
 | 
			
		||||
pip install pandas
 | 
			
		||||
pip install datasets
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# create sonnet_4x
 | 
			
		||||
#
 | 
			
		||||
echo "Create sonnet_4x.txt"
 | 
			
		||||
echo "" > benchmarks/sonnet_4x.txt
 | 
			
		||||
for _ in {1..4}
 | 
			
		||||
 do
 | 
			
		||||
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# start vllm service in backend
 | 
			
		||||
#
 | 
			
		||||
echo "lanching vllm..."
 | 
			
		||||
echo "logging to $VLLM_LOG"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
VLLM_USE_V1=1 vllm serve $MODEL \
 | 
			
		||||
 --seed 42 \
 | 
			
		||||
 --disable-log-requests \
 | 
			
		||||
 --max-num-seqs $MAX_NUM_SEQS \
 | 
			
		||||
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
 | 
			
		||||
 --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
 | 
			
		||||
 --no-enable-prefix-caching \
 | 
			
		||||
 --download_dir $DOWNLOAD_DIR \
 | 
			
		||||
 --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
echo "wait for 20 minutes.."
 | 
			
		||||
echo
 | 
			
		||||
# sleep 1200
 | 
			
		||||
# wait for up to 20 minutes (120 x 10s)...
 | 
			
		||||
for i in {1..120}; do
 | 
			
		||||
    # TODO: detect other types of errors.
 | 
			
		||||
    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
 | 
			
		||||
        echo "Detected RuntimeError, exiting."
 | 
			
		||||
        exit 1
 | 
			
		||||
    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
 | 
			
		||||
        echo "Application started"
 | 
			
		||||
        break
 | 
			
		||||
    else
 | 
			
		||||
        echo "wait for 10 seconds..."
 | 
			
		||||
        sleep 10
 | 
			
		||||
    fi
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# run test
 | 
			
		||||
#
 | 
			
		||||
echo "run benchmark test..."
 | 
			
		||||
echo "logging to $BM_LOG"
 | 
			
		||||
echo
 | 
			
		||||
python benchmarks/benchmark_serving.py \
 | 
			
		||||
    --backend vllm \
 | 
			
		||||
    --model $MODEL  \
 | 
			
		||||
    --dataset-name sonnet \
 | 
			
		||||
    --dataset-path benchmarks/sonnet_4x.txt \
 | 
			
		||||
    --sonnet-input-len $INPUT_LEN \
 | 
			
		||||
    --sonnet-output-len $OUTPUT_LEN \
 | 
			
		||||
    --ignore-eos > "$BM_LOG"
 | 
			
		||||
 | 
			
		||||
echo "completed..."
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
 | 
			
		||||
echo "throughput: $throughput"
 | 
			
		||||
echo
 | 
			
		||||
@ -8,7 +8,6 @@
 | 
			
		||||
# Documentation
 | 
			
		||||
# label(str): the name of the test. emoji allowed.
 | 
			
		||||
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 | 
			
		||||
# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
 | 
			
		||||
# fast_check_only(bool): run this test on fastcheck pipeline only
 | 
			
		||||
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 | 
			
		||||
# command(str): the single command to run for tests. incompatible with commands.
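# For illustration only (not part of this diff), a minimal step using these
# fields might look like:
#   - label: Example Test
#     source_file_dependencies:
#     - vllm/
#     commands:
#     - pytest -v -s example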
 | 
			
		||||
@ -32,27 +31,16 @@ steps:
 | 
			
		||||
##### fast check tests  #####
 | 
			
		||||
 | 
			
		||||
- label: Documentation Build # 2min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/test_docs"
 | 
			
		||||
  working_dir: "/vllm-workspace/test_docs/docs"
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  no_gpu: True
 | 
			
		||||
  commands:
 | 
			
		||||
  - pip install -r ../requirements/docs.txt
 | 
			
		||||
  # TODO: add `--strict` once warnings in docstrings are fixed
 | 
			
		||||
  - mkdocs build
 | 
			
		||||
 | 
			
		||||
- label: Pytorch Nightly Dependency Override Check # 2min
 | 
			
		||||
  # if this test fails, it means the nightly torch version is not compatible with some
 | 
			
		||||
  # of the dependencies. Please check the error message and add the package to the whitelist
 | 
			
		||||
  # in /vllm/tools/generate_nightly_torch_test.py
 | 
			
		||||
  soft_fail: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - requirements/nightly_torch_test.txt
 | 
			
		||||
  commands:
 | 
			
		||||
  - bash standalone_tests/pytorch_nightly_dependency.sh
 | 
			
		||||
  - pip install -r ../../requirements/docs.txt
 | 
			
		||||
  - SPHINXOPTS=\"-W\" make html
 | 
			
		||||
  # Check API reference (if it fails, you may have missing mock imports)
 | 
			
		||||
  - grep \"sig sig-object py\" build/html/api/inference_params.html
 | 
			
		||||
 | 
			
		||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/mq_llm_engine
 | 
			
		||||
@ -68,13 +56,11 @@ steps:
 | 
			
		||||
  - pytest -v -s async_engine # AsyncLLMEngine
 | 
			
		||||
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
 | 
			
		||||
  - pytest -v -s test_inputs.py
 | 
			
		||||
  - pytest -v -s test_outputs.py
 | 
			
		||||
  - pytest -v -s multimodal
 | 
			
		||||
  - pytest -v -s test_utils.py # Utils
 | 
			
		||||
  - pytest -v -s worker # Worker
 | 
			
		||||
 | 
			
		||||
- label: Python-only Installation Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - tests/standalone_tests/python_only_compile.sh
 | 
			
		||||
  - setup.py
 | 
			
		||||
@ -82,9 +68,8 @@ steps:
 | 
			
		||||
  - bash standalone_tests/python_only_compile.sh
 | 
			
		||||
 | 
			
		||||
- label: Basic Correctness Test # 30min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  #mirror_hardwares: [amd]
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/basic_correctness/test_basic_correctness
 | 
			
		||||
@ -99,7 +84,6 @@ steps:
 | 
			
		||||
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 | 
			
		||||
 | 
			
		||||
- label: Chunked Prefill Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/basic_correctness/test_chunked_prefill
 | 
			
		||||
@ -108,7 +92,7 @@ steps:
 | 
			
		||||
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 | 
			
		||||
 | 
			
		||||
- label: Core Test # 10min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/core
 | 
			
		||||
@ -117,14 +101,15 @@ steps:
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s core
 | 
			
		||||
 | 
			
		||||
- label: Entrypoints Test (LLM) # 40min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
- label: Entrypoints Test # 40min
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/entrypoints/llm
 | 
			
		||||
  - tests/entrypoints/openai
 | 
			
		||||
  - tests/entrypoints/test_chat_utils
 | 
			
		||||
  - tests/entrypoints/offline_mode
 | 
			
		||||
  commands:
 | 
			
		||||
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
			
		||||
@ -133,24 +118,11 @@ steps:
 | 
			
		||||
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
 | 
			
		||||
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
 | 
			
		||||
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
 | 
			
		||||
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py  --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
 | 
			
		||||
  - pytest -v -s entrypoints/test_chat_utils.py
 | 
			
		||||
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 | 
			
		||||
 | 
			
		||||
- label: Entrypoints Test (API Server) # 40min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/entrypoints/openai
 | 
			
		||||
  - tests/entrypoints/test_chat_utils
 | 
			
		||||
  commands:
 | 
			
		||||
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
			
		||||
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
 | 
			
		||||
  - pytest -v -s entrypoints/test_chat_utils.py
 | 
			
		||||
 | 
			
		||||
- label: Distributed Tests (4 GPUs) # 10min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 4
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -158,57 +130,29 @@ steps:
 | 
			
		||||
  - vllm/core/
 | 
			
		||||
  - tests/distributed/test_utils
 | 
			
		||||
  - tests/distributed/test_pynccl
 | 
			
		||||
  - tests/distributed/test_events
 | 
			
		||||
  - tests/spec_decode/e2e/test_integration_dist_tp4
 | 
			
		||||
  - tests/compile/test_basic_correctness
 | 
			
		||||
  - examples/offline_inference/rlhf.py
 | 
			
		||||
  - examples/offline_inference/rlhf_colocate.py
 | 
			
		||||
  - tests/examples/offline_inference/data_parallel.py
 | 
			
		||||
  - tests/v1/test_async_llm_dp.py
 | 
			
		||||
  - tests/v1/test_external_lb_dp.py
 | 
			
		||||
  - tests/v1/engine/test_engine_core_client.py
 | 
			
		||||
  commands:
 | 
			
		||||
  # test with tp=2 and external_dp=2
 | 
			
		||||
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
 | 
			
		||||
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
 | 
			
		||||
  # test with tp=2 and pp=2
 | 
			
		||||
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
 | 
			
		||||
  # test with internal dp
 | 
			
		||||
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
 | 
			
		||||
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
 | 
			
		||||
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
 | 
			
		||||
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
 | 
			
		||||
  - python3 ../examples/offline_inference/data_parallel.py
 | 
			
		||||
  - pytest -v -s distributed/test_utils.py
 | 
			
		||||
  - pytest -v -s compile/test_basic_correctness.py
 | 
			
		||||
  - pytest -v -s distributed/test_pynccl.py
 | 
			
		||||
  - pytest -v -s distributed/test_events.py
 | 
			
		||||
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
 | 
			
		||||
  # TODO: create a dedicated test section for multi-GPU example tests
 | 
			
		||||
  # when we have multiple distributed example tests
 | 
			
		||||
  - pushd ../examples/offline_inference
 | 
			
		||||
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
 | 
			
		||||
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
 | 
			
		||||
  - VLLM_ENABLE_V1_MULTIPROCESSING=0 python3 rlhf.py
 | 
			
		||||
  - VLLM_ENABLE_V1_MULTIPROCESSING=0 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
 | 
			
		||||
  - popd
 | 
			
		||||
 | 
			
		||||
- label: EPLB Algorithm Test
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/distributed/eplb
 | 
			
		||||
  - tests/distributed/test_eplb_algo.py
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s distributed/test_eplb_algo.py
 | 
			
		||||
 | 
			
		||||
- label: EPLB Execution Test # 5min
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 4
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/distributed/eplb
 | 
			
		||||
  - tests/distributed/test_eplb_execute.py
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s distributed/test_eplb_execute.py
 | 
			
		||||
 | 
			
		||||
- label: Metrics, Tracing Test # 10min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  num_gpus: 2
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
@ -217,17 +161,17 @@ steps:
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s metrics
 | 
			
		||||
  - "pip install \
 | 
			
		||||
      'opentelemetry-sdk>=1.26.0' \
 | 
			
		||||
      'opentelemetry-api>=1.26.0' \
 | 
			
		||||
      'opentelemetry-exporter-otlp>=1.26.0' \
 | 
			
		||||
      'opentelemetry-semantic-conventions-ai>=0.4.1'"
 | 
			
		||||
      'opentelemetry-sdk>=1.26.0,<1.27.0' \
 | 
			
		||||
      'opentelemetry-api>=1.26.0,<1.27.0' \
 | 
			
		||||
      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
 | 
			
		||||
      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
 | 
			
		||||
  - pytest -v -s tracing
 | 
			
		||||
 | 
			
		||||
##### fast check tests  #####
 | 
			
		||||
#####  1 GPU test  #####
 | 
			
		||||
 | 
			
		||||
- label: Regression Test # 5min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/test_regression
 | 
			
		||||
@ -237,7 +181,7 @@ steps:
 | 
			
		||||
  working_dir: "/vllm-workspace/tests" # optional
 | 
			
		||||
 | 
			
		||||
- label: Engine Test # 10min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/engine
 | 
			
		||||
@ -245,31 +189,28 @@ steps:
 | 
			
		||||
  - tests/test_sequence
 | 
			
		||||
  - tests/test_config
 | 
			
		||||
  - tests/test_logger
 | 
			
		||||
  - tests/test_vllm_port
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
 | 
			
		||||
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
 | 
			
		||||
  # OOM in the CI unless we run this separately
 | 
			
		||||
  - pytest -v -s tokenization
 | 
			
		||||
 | 
			
		||||
- label: V1 Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  #mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
    - vllm/
 | 
			
		||||
    - tests/v1
 | 
			
		||||
  commands:
 | 
			
		||||
    # split the test to avoid interference
 | 
			
		||||
    - pytest -v -s v1/core
 | 
			
		||||
    - pytest -v -s v1/entrypoints
 | 
			
		||||
    - pytest -v -s v1/engine
 | 
			
		||||
    - pytest -v -s v1/entrypoints
 | 
			
		||||
    - pytest -v -s v1/sample
 | 
			
		||||
    - pytest -v -s v1/worker
 | 
			
		||||
    - pytest -v -s v1/structured_output
 | 
			
		||||
    - pytest -v -s v1/spec_decode
 | 
			
		||||
    - pytest -v -s v1/kv_connector/unit
 | 
			
		||||
    - pytest -v -s v1/test_serial_utils.py
 | 
			
		||||
    - pytest -v -s v1/test_stats.py
 | 
			
		||||
    - pytest -v -s v1/test_utils.py
 | 
			
		||||
    - pytest -v -s v1/test_oracle.py
 | 
			
		||||
    - pytest -v -s v1/test_metrics_reader.py
 | 
			
		||||
    # TODO: accuracy does not match on H100, whether or not
    # VLLM_USE_FLASHINFER_SAMPLER is set.
 | 
			
		||||
    - pytest -v -s v1/e2e
 | 
			
		||||
@ -278,8 +219,8 @@ steps:
 | 
			
		||||
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 | 
			
		||||
 | 
			
		||||
- label: Examples Test # 25min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/examples"
 | 
			
		||||
  #mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/entrypoints
 | 
			
		||||
  - examples/
 | 
			
		||||
@ -292,9 +233,9 @@ steps:
 | 
			
		||||
    - python3 offline_inference/llm_engine_example.py
 | 
			
		||||
    - python3 offline_inference/audio_language.py --seed 0
 | 
			
		||||
    - python3 offline_inference/vision_language.py --seed 0
 | 
			
		||||
    - python3 offline_inference/vision_language_pooling.py --seed 0
 | 
			
		||||
    - python3 offline_inference/vision_language_embedding.py --seed 0
 | 
			
		||||
    - python3 offline_inference/vision_language_multi_image.py --seed 0
 | 
			
		||||
    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 | 
			
		||||
    - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 | 
			
		||||
    - python3 offline_inference/encoder_decoder.py
 | 
			
		||||
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
 | 
			
		||||
    - python3 offline_inference/basic/classify.py
 | 
			
		||||
@ -303,24 +244,14 @@ steps:
 | 
			
		||||
    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 | 
			
		||||
 | 
			
		||||
- label: Prefix Caching Test # 9min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/prefix_caching
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s prefix_caching
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
- label: Platform Tests (CUDA)
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/cuda
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s cuda/test_cuda_context.py
 | 
			
		||||
 | 
			
		||||
- label: Samplers Test # 36min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/model_executor/layers
 | 
			
		||||
  - vllm/sampling_metadata.py
 | 
			
		||||
@ -330,8 +261,18 @@ steps:
 | 
			
		||||
    - pytest -v -s samplers
 | 
			
		||||
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 | 
			
		||||
 | 
			
		||||
- label: LogitsProcessor Test # 5min
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/model_executor/layers
 | 
			
		||||
  - vllm/model_executor/guided_decoding
 | 
			
		||||
  - tests/test_logits_processor
 | 
			
		||||
  - tests/model_executor/test_guided_processors
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s test_logits_processor.py
 | 
			
		||||
    - pytest -v -s model_executor/test_guided_processors.py
 | 
			
		||||
 | 
			
		||||
- label: Speculative decoding tests # 40min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/spec_decode
 | 
			
		||||
  - tests/spec_decode
 | 
			
		||||
@ -342,30 +283,14 @@ steps:
 | 
			
		||||
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 | 
			
		||||
 | 
			
		||||
- label: LoRA Test %N # 15min each
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/lora
 | 
			
		||||
  - tests/lora
 | 
			
		||||
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
 | 
			
		||||
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py  --ignore=lora/test_transfomers_model.py
 | 
			
		||||
  parallelism: 4
 | 
			
		||||
 | 
			
		||||
- label: PyTorch Compilation Unit Tests
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
    - vllm/
 | 
			
		||||
    - tests/compile
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s compile/test_pass_manager.py
 | 
			
		||||
    - pytest -v -s compile/test_fusion.py
 | 
			
		||||
    - pytest -v -s compile/test_fusion_attn.py
 | 
			
		||||
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
 | 
			
		||||
    - pytest -v -s compile/test_sequence_parallelism.py
 | 
			
		||||
    - pytest -v -s compile/test_async_tp.py
 | 
			
		||||
 | 
			
		||||
- label: PyTorch Fullgraph Smoke Test # 9min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
@ -374,127 +299,61 @@ steps:
  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
  - pytest -v -s compile/piecewise/test_full_cudagraph.py
  - pytest -v -s compile/test_pass_manager.py

- label: PyTorch Fullgraph Test # 18min
  mirror_hardwares: [amdexperimental, amdproduction]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Core Operation Test
  mirror_hardwares: [amdexperimental, amdproduction]
- label: Kernels Test %N # 1h each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  commands:
    - pytest -v -s kernels/core

- label: Kernels Attention Test %N
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  - tests/kernels
  commands:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Quantization Test %N
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
    - pytest -v -s kernels/quantization  --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels MoE Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  commands:
    - pytest -v -s kernels/moe

- label: Kernels Mamba Test
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  commands:
    - pytest -v -s kernels/mamba
    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Tensorizer Test # 11min
  mirror_hardwares: [amdexperimental]
  mirror_hardwares: [amd]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor/model_loader
  - tests/tensorizer_loader
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s tensorizer_loader
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

- label: Model Executor Test
  mirror_hardwares: [amdexperimental, amdproduction]
  soft_fail: true
  source_file_dependencies:
  - vllm/model_executor
  - tests/model_executor
  commands:
    - apt-get update && apt-get install -y curl libsodium23
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor

- label: Benchmarks # 9min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash scripts/run-benchmarks.sh
  - bash run-benchmarks.sh

- label: Benchmarks CLI Test # 10min
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/

- label: Quantization Test
  mirror_hardwares: [amdexperimental]
- label: Quantization Test # 33min
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release
  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization

- label: LM Eval Small Models # 53min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

- label: OpenAI API correctness
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
@ -503,7 +362,6 @@ steps:
  - pytest -s entrypoints/openai/correctness/

- label: Encoder Decoder tests # 5min
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/encoder_decoder
@ -511,128 +369,98 @@ steps:
    - pytest -v -s encoder_decoder

- label: OpenAI-Compatible Tool Use # 20 min
  mirror_hardwares: [amdexperimental]
  fast_check: false
  mirror_hardwares: [ amd ]
  source_file_dependencies:
    - vllm/
    - tests/tool_use
    - tests/mistral_tool_use
  commands:
    - pytest -v -s tool_use
    - pytest -v -s mistral_tool_use

#####  models test  #####

- label: Basic Models Test # 24min
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models
  commands:
    - pytest -v -s models/test_transformers.py
    - pytest -v -s models/test_registry.py
    - pytest -v -s models/test_utils.py
    - pytest -v -s models/test_vision.py
    - pytest -v -s models/test_initialization.py
    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py

- label: Language Models Test (Standard)
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
- label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/language
  - tests/models/decoder_only/language
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  commands:
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/language -m core_model
    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/language -m core_model

- label: Language Models Test (Hybrid) # 35 min
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pytest -v -s models/language/generation -m hybrid_model

- label: Language Models Test (Extended Generation) # 1hr20min
  mirror_hardwares: [amdexperimental]
- label: Language Models Test (Extended) # 1h10min
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  - tests/models/decoder_only/language
  - tests/models/embedding/language
  - tests/models/encoder_decoder/language
  commands:
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/language -m 'not core_model'

- label: Language Models Test (Extended Pooling)  # 36min
  mirror_hardwares: [amdexperimental]
  optional: true
- label: Multi-Modal Models Test (Standard) # 40min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'

- label: Multi-Modal Models Test (Standard)
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/audio_language
  - tests/models/encoder_decoder/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
    - pytest -v -s models/multimodal/processing
    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
    - pytest -v -s models/multimodal
    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
    - pytest -v -s models/embedding/vision_language -m core_model
    - pytest -v -s models/encoder_decoder/audio_language -m core_model
    - pytest -v -s models/encoder_decoder/language -m core_model
    - pytest -v -s models/encoder_decoder/vision_language -m core_model

- label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
- label: Multi-Modal Models Test (Extended) 1 # 48m
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
    # HACK - run phi3v tests separately to sidestep this transformers bug
    # https://github.com/huggingface/transformers/issues/34307
    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
    - pytest -v -s models/embedding/vision_language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'

- label: Multi-Modal Models Test (Extended) 2
  mirror_hardwares: [amdexperimental]
- label: Multi-Modal Models Test (Extended) 2 # 38m
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  - tests/models/decoder_only/vision_language
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

- label: Multi-Modal Models Test (Extended) 3
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

- label: Quantized Models Test
  mirror_hardwares: [amdexperimental, amdproduction]
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
    - pytest -v -s models/quantization
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
  mirror_hardwares: [amdexperimental, amdproduction]
  optional: true
  commands:
    - echo 'Testing custom models...'
@ -640,23 +468,10 @@ steps:
    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

- label: Transformers Nightly Models Test
  working_dir: "/vllm-workspace/"
  optional: true
  commands:
    - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s models/test_initialization.py
    - pytest -v -s tests/models/multimodal/processing/
    - pytest -v -s tests/models/multimodal/test_mapping.py
    - python3 examples/offline_inference/basic/chat.py
    - python3 examples/offline_inference/audio_language.py --model-type whisper
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl

#####  1 GPU test  #####
#####  multi gpus test  #####

- label: Distributed Comm Ops Test # 7min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -667,7 +482,6 @@ steps:
  - pytest -v -s distributed/test_shm_broadcast.py

- label: 2 Node Tests (4 GPUs in total) # 16min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
@ -677,21 +491,16 @@ steps:
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

- label: Distributed Tests (2 GPUs) # 40min
  mirror_hardwares: [amdexperimental]
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -705,41 +514,31 @@ steps:
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
  - entrypoints/llm/test_collective_rpc.py
  - tests/v1/test_async_llm_dp.py
  - tests/v1/test_external_lb_dp.py
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - vllm/v1/engine/
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - VLLM_ENABLE_V1_MULTIPROCESSING=0 pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
  # test sequence parallel
  - pytest -v -s distributed/test_sequence_parallel.py
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
  # this test fails consistently.
  # TODO: investigate and fix
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown

- label: Plugin Tests (2 GPUs) # 40min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  # begin platform plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
@ -750,10 +549,8 @@ steps:
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Multi-step Tests (4 GPUs) # 36min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@ -774,7 +571,6 @@ steps:
  - pytest -v -s multi_step/test_correctness_llm.py

- label: Pipeline Parallelism Test # 45min
  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@ -788,7 +584,6 @@ steps:
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: LoRA TP Test (Distributed)
  mirror_hardwares: [amdexperimental, amdproduction]
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
@ -797,14 +592,17 @@ steps:
    # FIXIT: find out which code initialize cuda before running the test
    # before the fix, we need to use spawn to test it
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    # This test runs llama 13B, so it is required to run on 4 GPUs.
    - pytest -v -s -x lora/test_long_context.py
    # There is some Tensor Parallelism related processing logic in LoRA that
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_minicpmv_tp.py
    - pytest -v -s -x lora/test_transfomers_model.py


- label: Weight Loading Multiple GPU Test  # 33min
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
@ -814,7 +612,6 @@ steps:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
@ -853,4 +650,4 @@ steps:
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
  - bash ./run-tests.sh -c configs/models-large.txt -t 4

@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
    # if $normal_wheel matches cu121, do not upload the index.html
    echo "Skipping index files for cu121 wheels"
else
    # only upload index.html for cu128 wheels (default wheels)
    # only upload index.html for cu124 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@ -66,13 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
    # if $normal_wheel matches cu121, do not upload the index.html
    echo "Skipping index files for cu121 wheels"
else
    # only upload index.html for cu128 wheels (default wheels)
    # only upload index.html for cu124 wheels (default wheels)
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi

aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
.github/CODEOWNERS (28 lines changed)
@ -10,22 +10,13 @@
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/entrypoints @aarnphm
/vllm/compilation @zou3519 @youkaichao
CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
CMakeLists.txt @tlrmchlsmth

# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm
/vllm/v1/structured_output @mgoin @russellb

# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
@ -34,8 +25,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/models @DarkLight1337 @ywang96
@ -45,11 +36,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb
/tests/weight_loading @mgoin @youkaichao
/tests/lora @jeejeelee

# Docs
/docs @hmellor
mkdocs.yaml @hmellor

.github/ISSUE_TEMPLATE/200-installation.yml (2 lines changed)
@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```

.github/ISSUE_TEMPLATE/300-usage.yml (2 lines changed)
@ -14,7 +14,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```

.github/ISSUE_TEMPLATE/400-bug-report.yml (24 lines changed)
@ -8,35 +8,25 @@ body:
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: markdown
  attributes:
    value: |
      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
      - Passwords or authentication credentials
      - Private URLs or endpoints
      - Personal or confidential data

      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
- type: textarea
  attributes:
    label: Your current environment
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      <details>
      <summary>The output of <code>python collect_env.py</code></summary>
      <summary>The output of `python collect_env.py`</summary>

      ```text
      Your output of `python collect_env.py` here
      ```

      </details>
  validations:
    required: true
@ -85,20 +75,20 @@ body:
      ```

      ```
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      The error message you got, with the full traceback.
      ```
  validations:
    required: true
- type: markdown
  attributes:
    value: |
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
    value: >
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:

      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).

      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.

      Thanks for reporting 🙏!
      Thanks for contributing 🎉!
- type: checkboxes
  id: askllm
  attributes:

.github/ISSUE_TEMPLATE/450-ci-failure.yml (69 lines changed)
@ -1,69 +0,0 @@
name: 🧪 CI failure report
description: Report a failing test.
title: "[CI Failure]: "
labels: ["ci-failure"]

body:
- type: markdown
  attributes:
    value: >
      #### Include the name of the failing Buildkite step and test file in the title.
- type: input
  attributes:
    label: Name of failing test
    description: |
      Paste in the fully-qualified name of the failing test from the logs.
    placeholder: |
      `path/to/test_file.py::test_name[params]`
  validations:
    required: true
- type: checkboxes
  attributes:
    label: Basic information
    description: Select all items that apply to the failing test.
    options:
      - label: Flaky test
      - label: Can reproduce locally
      - label: Caused by external libraries (e.g. bug in `transformers`)
- type: textarea
  attributes:
    label: 🧪 Describe the failing test
    description: |
      Please provide a clear and concise description of the failing test.
    placeholder: |
      A clear and concise description of the failing test.

      ```
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      ```
  validations:
    required: true
- type: textarea
  attributes:
    label: 📝 History of failing test
    description: |
      Since when did the test start to fail?
      You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).

      If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:

      - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.

      - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.

      - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
    placeholder: |
      Approximate timeline and/or problematic PRs

      A link to the Buildkite analytics of the failing test (if available)
  validations:
    required: true
- type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
- type: markdown
  attributes:
    value: >
      Thanks for reporting 🙏!
.github/ISSUE_TEMPLATE/600-new-model.yml (2 lines changed)
@ -9,7 +9,7 @@ body:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
- type: textarea
  attributes:
    label: The model to consider.

@ -35,7 +35,7 @@ body:
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```

.github/PULL_REQUEST_TEMPLATE.md (18 lines changed)
@ -1,18 +1,6 @@
## Essential Elements of an Effective PR Description Checklist
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
FILL IN THE PR DESCRIPTION HERE

PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.

## Purpose

## Test Plan

## Test Result

## (Optional) Documentation Update
FIX #xxxx (*link existing issues this PR will resolve*)

<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**

.github/mergify.yml (172 lines changed)
@ -19,7 +19,7 @@ pull_request_rules:
      - files~=\.buildkite/
      - files~=^cmake/
      - files=CMakeLists.txt
      - files~=^docker/Dockerfile
      - files~=^Dockerfile
      - files~=^requirements.*\.txt
      - files=setup.py
  actions:
@ -27,22 +27,6 @@ pull_request_rules:
      add:
        - ci/build

- name: label-deepseek
  description: Automatically apply deepseek label
  conditions:
    - or:
      - files~=^examples/.*deepseek.*\.py
      - files~=^tests/.*deepseek.*\.py
      - files~=^vllm/entrypoints/openai/tool_parsers/.*deepseek.*\.py
      - files~=^vllm/model_executor/models/.*deepseek.*\.py
      - files~=^vllm/reasoning/.*deepseek.*\.py
      - files~=^vllm/transformers_utils/.*deepseek.*\.py
      - title~=(?i)DeepSeek
  actions:
    label:
      add:
        - deepseek

- name: label-frontend
  description: Automatically apply frontend label
  conditions:
@ -52,21 +36,6 @@ pull_request_rules:
      add:
        - frontend

- name: label-llama
  description: Automatically apply llama label
  conditions:
    - or:
      - files~=^examples/.*llama.*\.py
      - files~=^tests/.*llama.*\.py
      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
      - files~=^vllm/model_executor/models/.*llama.*\.py
      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
      - title~=(?i)llama
  actions:
    label:
      add:
        - llama

- name: label-multi-modality
  description: Automatically apply multi-modality label
  conditions:
@ -74,87 +43,23 @@ pull_request_rules:
      - files~=^vllm/multimodal/
      - files~=^tests/multimodal/
      - files~=^tests/models/multimodal/
      - files~=^tests/models/*/audio_language/
      - files~=^tests/models/*/vision_language/
      - files=tests/models/test_vision.py
  actions:
    label:
      add:
        - multi-modality

- name: label-new-model
  description: Automatically apply new-model label
  conditions:
    - and:
      - files~=^vllm/model_executor/models/
      - files=vllm/model_executor/models/registry.py
  actions:
    label:
      add:
        - new-model

- name: label-performance
  description: Automatically apply performance label
  conditions:
    - or:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
      - files~=^tests/benchmarks/
      - files~=^\.buildkite/nightly-benchmarks/
  actions:
    label:
      add:
        - performance

- name: label-qwen
  description: Automatically apply qwen label
  conditions:
    - or:
      - files~=^examples/.*qwen.*\.py
      - files~=^tests/.*qwen.*\.py
      - files~=^vllm/model_executor/models/.*qwen.*\.py
      - files~=^vllm/reasoning/.*qwen.*\.py
      - title~=(?i)Qwen
  actions:
    label:
      add:
        - qwen

- name: label-rocm
  description: Automatically apply rocm label
  conditions:
    - or:
      - files~=^csrc/rocm/
      - files~=^docker/Dockerfile.rocm
      - files~=^requirements/rocm.*\.txt
      - files~=^vllm/attention/backends/rocm.*\.py
      - files~=^vllm/attention/ops/rocm.*\.py
      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
      - files~=^tests/kernels/.*_rocm.*\.py
      - files=vllm/platforms/rocm.py
      - title~=(?i)AMD
      - title~=(?i)ROCm
  actions:
    label:
      add:
        - rocm

- name: label-structured-output
  description: Automatically apply structured-output label
  conditions:
    - or:
      - files~=^benchmarks/structured_schemas/
      - files=benchmarks/benchmark_serving_structured_output.py
      - files=benchmarks/run_structured_output_benchmark.sh
      - files=docs/features/structured_outputs.md
      - files=examples/offline_inference/structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
      - files~=^vllm/model_executor/guided_decoding/
      - files=tests/model_executor/test_guided_processors.py
      - files=tests/entrypoints/llm/test_guided_generate.py
      - files~=^tests/v1/structured_output/
      - files=tests/v1/entrypoints/llm/test_guided_generate.py
      - files~=^vllm/v1/structured_output/
      - files=benchmarks/benchmark_serving_guided.py
      - files=benchmarks/benchmark_guided.py
  actions:
    label:
      add:
@ -165,14 +70,8 @@ pull_request_rules:
  conditions:
    - or:
      - files~=^vllm/spec_decode/
      - files~=^vllm/v1/spec_decode/
      - files=vllm/model_executor/layers/spec_decode_base_sampler.py
      - files~=^tests/spec_decode/
      - files~=^tests/v1/spec_decode/
      - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
      - files~=^vllm/model_executor/models/.*eagle.*\.py
      - files=vllm/model_executor/models/mlp_speculator.py
      - files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py
  actions:
    label:
      add:
@ -189,56 +88,6 @@ pull_request_rules:
      add:
        - v1

- name: label-tpu
  description: Automatically apply tpu label
  # Keep this list in sync with `label-tpu-remove` conditions
  conditions:
    - or:
      - files~=tpu.py
      - files~=_tpu
      - files~=tpu_
      - files~=/tpu/
      - files~=pallas
  actions:
    label:
      add:
        - tpu

- name: label-tpu-remove
  description: Automatically remove tpu label
  # Keep this list in sync with `label-tpu` conditions
 | 
			
		||||
  conditions:
 | 
			
		||||
    - and:
 | 
			
		||||
      - -files~=tpu.py
 | 
			
		||||
      - -files~=_tpu
 | 
			
		||||
      - -files~=tpu_
 | 
			
		||||
      - -files~=/tpu/
 | 
			
		||||
      - -files~=pallas
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      remove:
 | 
			
		||||
        - tpu
 | 
			
		||||
 | 
			
		||||
- name: label-tool-calling
 | 
			
		||||
  description: Automatically add tool-calling label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^tests/tool_use/
 | 
			
		||||
      - files~=^tests/mistral_tool_use/
 | 
			
		||||
      - files~=^tests/entrypoints/openai/tool_parsers/
 | 
			
		||||
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
 | 
			
		||||
      - files~=^vllm/entrypoints/openai/tool_parsers/
 | 
			
		||||
      - files=docs/features/tool_calling.md
 | 
			
		||||
      - files~=^examples/tool_chat_*
 | 
			
		||||
      - files=examples/offline_inference/chat_with_tools.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - tool-calling
 | 
			
		||||
 | 
			
		||||
- name: ping author on conflicts and add 'needs-rebase' label
 | 
			
		||||
  conditions:
 | 
			
		||||
      - conflict
 | 
			
		||||
@ -254,17 +103,6 @@ pull_request_rules:
 | 
			
		||||
 | 
			
		||||
       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
 | 
			
		||||
 | 
			
		||||
- name: assign reviewer for tensorizer changes
 | 
			
		||||
  conditions:
 | 
			
		||||
      - files~=^vllm/model_executor/model_loader/tensorizer.py
 | 
			
		||||
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
 | 
			
		||||
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
 | 
			
		||||
      - files~=^tests/tensorizer_loader/
 | 
			
		||||
  actions:
 | 
			
		||||
    assign:
 | 
			
		||||
      users:
 | 
			
		||||
        - "sangstar"
 | 
			
		||||
 | 
			
		||||
- name: remove 'needs-rebase' label when conflict is resolved
 | 
			
		||||
  conditions:
 | 
			
		||||
      - -conflict
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										2
									
								
								.github/scripts/cleanup_pr_body.sh
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/scripts/cleanup_pr_body.sh
									
									
									
									
										vendored
									
									
								
							@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
 | 
			
		||||
 | 
			
		||||
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
 | 
			
		||||
python3 - <<EOF
 | 
			
		||||
import regex as re
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
with open("${NEW}", "r") as file:
 | 
			
		||||
    content = file.read()
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										2
									
								
								.github/workflows/add_label_automerge.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/add_label_automerge.yml
									
									
									
									
										vendored
									
									
								
							@ -1,6 +1,4 @@
 | 
			
		||||
name: Add label on auto-merge enabled
 | 
			
		||||
permissions:
 | 
			
		||||
    pull-requests: write
 | 
			
		||||
on:
 | 
			
		||||
    pull_request_target:
 | 
			
		||||
        types:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										7
									
								
								.github/workflows/cleanup_pr_body.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										7
									
								
								.github/workflows/cleanup_pr_body.yml
									
									
									
									
										vendored
									
									
								
							@ -20,12 +20,7 @@ jobs:
 | 
			
		||||
        with:
 | 
			
		||||
          python-version: '3.12'
 | 
			
		||||
 | 
			
		||||
      - name: Install Python dependencies
 | 
			
		||||
        run: |
 | 
			
		||||
          python3 -m pip install --upgrade pip
 | 
			
		||||
          python3 -m pip install regex
 | 
			
		||||
 | 
			
		||||
      - name: Update PR description
 | 
			
		||||
        env:
 | 
			
		||||
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 | 
			
		||||
        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
 | 
			
		||||
        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										11
									
								
								.github/workflows/lint-and-deploy.yaml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										11
									
								
								.github/workflows/lint-and-deploy.yaml
									
									
									
									
										vendored
									
									
								
							@ -2,9 +2,6 @@ name: Lint and Deploy Charts
 | 
			
		||||
 | 
			
		||||
on: pull_request
 | 
			
		||||
 | 
			
		||||
permissions:
 | 
			
		||||
  contents: read
 | 
			
		||||
 | 
			
		||||
jobs:
 | 
			
		||||
  lint-and-deploy:
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
@ -53,7 +50,7 @@ jobs:
 | 
			
		||||
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
 | 
			
		||||
 | 
			
		||||
      - name: Build the Docker image vllm cpu
 | 
			
		||||
        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
 | 
			
		||||
        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
 | 
			
		||||
 | 
			
		||||
      - name: Configuration of docker images, network and namespace for the kind cluster
 | 
			
		||||
        run: |
 | 
			
		||||
@ -68,8 +65,8 @@ jobs:
 | 
			
		||||
          export AWS_ACCESS_KEY_ID=minioadmin
 | 
			
		||||
          export AWS_SECRET_ACCESS_KEY=minioadmin
 | 
			
		||||
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
 | 
			
		||||
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
 | 
			
		||||
 | 
			
		||||
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
 | 
			
		||||
    
 | 
			
		||||
      - name: curl test
 | 
			
		||||
        run: |
 | 
			
		||||
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
 | 
			
		||||
@ -82,4 +79,4 @@ jobs:
 | 
			
		||||
                          "max_tokens": 7,
 | 
			
		||||
                          "temperature": 0
 | 
			
		||||
                  }'):$CODE"
 | 
			
		||||
          echo "$CODE"
 | 
			
		||||
          echo "$CODE"
 | 
			
		||||
							
								
								
									
										3
									
								
								.github/workflows/pre-commit.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.github/workflows/pre-commit.yml
									
									
									
									
										vendored
									
									
								
							@ -5,9 +5,6 @@ on:
 | 
			
		||||
  push:
 | 
			
		||||
    branches: [main]
 | 
			
		||||
 | 
			
		||||
permissions:
 | 
			
		||||
  contents: read
 | 
			
		||||
 | 
			
		||||
jobs:
 | 
			
		||||
  pre-commit:
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										2
									
								
								.github/workflows/reminder_comment.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/reminder_comment.yml
									
									
									
									
										vendored
									
									
								
							@ -1,6 +1,4 @@
 | 
			
		||||
name: PR Reminder Comment Bot
 | 
			
		||||
permissions:
 | 
			
		||||
  pull-requests: write
 | 
			
		||||
on:
 | 
			
		||||
  pull_request_target:
 | 
			
		||||
    types: [opened]
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										10
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										10
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@ -3,6 +3,7 @@
 | 
			
		||||
 | 
			
		||||
# vllm-flash-attn built from source
 | 
			
		||||
vllm/vllm_flash_attn/*
 | 
			
		||||
!vllm/vllm_flash_attn/fa_utils.py
 | 
			
		||||
 | 
			
		||||
# Byte-compiled / optimized / DLL files
 | 
			
		||||
__pycache__/
 | 
			
		||||
@ -77,6 +78,10 @@ instance/
 | 
			
		||||
# Scrapy stuff:
 | 
			
		||||
.scrapy
 | 
			
		||||
 | 
			
		||||
# Sphinx documentation
 | 
			
		||||
docs/_build/
 | 
			
		||||
docs/source/getting_started/examples/
 | 
			
		||||
 | 
			
		||||
# PyBuilder
 | 
			
		||||
.pybuilder/
 | 
			
		||||
target/
 | 
			
		||||
@ -146,8 +151,6 @@ venv.bak/
 | 
			
		||||
 | 
			
		||||
# mkdocs documentation
 | 
			
		||||
/site
 | 
			
		||||
docs/argparse
 | 
			
		||||
docs/examples
 | 
			
		||||
 | 
			
		||||
# mypy
 | 
			
		||||
.mypy_cache/
 | 
			
		||||
@ -200,6 +203,3 @@ benchmarks/**/*.json
 | 
			
		||||
# Linting
 | 
			
		||||
actionlint
 | 
			
		||||
shellcheck*/
 | 
			
		||||
 | 
			
		||||
# Ignore moe/marlin_moe gen code
 | 
			
		||||
csrc/moe/marlin_moe_wna16/kernel_*
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,3 @@
 | 
			
		||||
default_install_hook_types:
 | 
			
		||||
  - pre-commit
 | 
			
		||||
  - commit-msg
 | 
			
		||||
default_stages:
 | 
			
		||||
  - pre-commit # Run locally
 | 
			
		||||
  - manual # Run in CI
 | 
			
		||||
@ -11,59 +8,52 @@ repos:
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: yapf
 | 
			
		||||
    args: [--in-place, --verbose]
 | 
			
		||||
    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
 | 
			
		||||
    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 | 
			
		||||
    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 | 
			
		||||
- repo: https://github.com/astral-sh/ruff-pre-commit
 | 
			
		||||
  rev: v0.11.7
 | 
			
		||||
  rev: v0.9.3
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: ruff
 | 
			
		||||
    args: [--output-format, github, --fix]
 | 
			
		||||
  - id: ruff-format
 | 
			
		||||
    files: ^(.buildkite|benchmarks|examples)/.*
 | 
			
		||||
- repo: https://github.com/crate-ci/typos
 | 
			
		||||
  rev: v1.32.0
 | 
			
		||||
- repo: https://github.com/codespell-project/codespell
 | 
			
		||||
  rev: v2.4.0
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: typos
 | 
			
		||||
  - id: codespell
 | 
			
		||||
    additional_dependencies: ['tomli']
 | 
			
		||||
    args: ['--toml', 'pyproject.toml']
 | 
			
		||||
- repo: https://github.com/PyCQA/isort
 | 
			
		||||
  rev: 6.0.1
 | 
			
		||||
  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: isort
 | 
			
		||||
- repo: https://github.com/pre-commit/mirrors-clang-format
 | 
			
		||||
  rev: v20.1.3
 | 
			
		||||
  rev: v19.1.7
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: clang-format
 | 
			
		||||
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
 | 
			
		||||
    types_or: [c++, cuda]
 | 
			
		||||
    args: [--style=file, --verbose]
 | 
			
		||||
- repo: https://github.com/jackdewinter/pymarkdown
 | 
			
		||||
  rev: v0.9.29
 | 
			
		||||
  rev: v0.9.27
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: pymarkdown
 | 
			
		||||
    exclude: '.*\.inc\.md'
 | 
			
		||||
    args: [fix]
 | 
			
		||||
- repo: https://github.com/rhysd/actionlint
 | 
			
		||||
  rev: v1.7.7
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: actionlint
 | 
			
		||||
- repo: https://github.com/astral-sh/uv-pre-commit
 | 
			
		||||
  rev: 0.6.17
 | 
			
		||||
  rev: 0.6.2
 | 
			
		||||
  hooks:
 | 
			
		||||
    - id: pip-compile
 | 
			
		||||
      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
 | 
			
		||||
      args: [requirements/test.in, -o, requirements/test.txt]
 | 
			
		||||
      files: ^requirements/test\.(in|txt)$
 | 
			
		||||
- repo: local
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: format-torch-nightly-test
 | 
			
		||||
    name: reformat nightly_torch_test.txt to be in sync with test.in
 | 
			
		||||
    language: python
 | 
			
		||||
    entry: python tools/generate_nightly_torch_test.py
 | 
			
		||||
    files: ^requirements/test\.(in|txt)$
 | 
			
		||||
  - id: mypy-local
 | 
			
		||||
    name: Run mypy for local Python installation
 | 
			
		||||
    entry: tools/mypy.sh 0 "local"
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
 | 
			
		||||
    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
 | 
			
		||||
    stages: [pre-commit] # Don't run in CI
 | 
			
		||||
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
 | 
			
		||||
    name: Run mypy for Python 3.9
 | 
			
		||||
@ -109,8 +99,8 @@ repos:
 | 
			
		||||
    args:
 | 
			
		||||
      - -c
 | 
			
		||||
      - |
 | 
			
		||||
        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
 | 
			
		||||
          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
 | 
			
		||||
        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
 | 
			
		||||
          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
 | 
			
		||||
        fi
 | 
			
		||||
    language: system
 | 
			
		||||
    verbose: true
 | 
			
		||||
@ -120,11 +110,6 @@ repos:
 | 
			
		||||
    entry: python tools/check_spdx_header.py
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
  - id: check-root-lazy-imports
 | 
			
		||||
    name: Check root lazy imports
 | 
			
		||||
    entry: python tools/check_init_lazy_imports.py
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
  - id: check-filenames
 | 
			
		||||
    name: Check for spaces in all filenames
 | 
			
		||||
    entry: bash
 | 
			
		||||
@ -134,43 +119,10 @@ repos:
 | 
			
		||||
    language: system
 | 
			
		||||
    always_run: true
 | 
			
		||||
    pass_filenames: false
 | 
			
		||||
  - id: update-dockerfile-graph
 | 
			
		||||
    name: Update Dockerfile dependency graph
 | 
			
		||||
    entry: tools/update-dockerfile-graph.sh
 | 
			
		||||
    language: script
 | 
			
		||||
  - id: enforce-import-regex-instead-of-re
 | 
			
		||||
    name: Enforce import regex as re
 | 
			
		||||
    entry: python tools/enforce_regex_import.py
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
    pass_filenames: false
 | 
			
		||||
    additional_dependencies: [regex]
 | 
			
		||||
  # forbid directly import triton
 | 
			
		||||
  - id: forbid-direct-triton-import
 | 
			
		||||
    name: "Forbid direct 'import triton'"
 | 
			
		||||
    entry: python tools/check_triton_import.py
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
    pass_filenames: false
 | 
			
		||||
    additional_dependencies: [regex]
 | 
			
		||||
  - id: check-pickle-imports
 | 
			
		||||
    name: Prevent new pickle/cloudpickle imports
 | 
			
		||||
    entry: python tools/check_pickle_imports.py
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
    pass_filenames: false
 | 
			
		||||
    additional_dependencies: [pathspec, regex]
 | 
			
		||||
  - id: validate-config
 | 
			
		||||
    name: Validate configuration has default values and that each field has a docstring
 | 
			
		||||
    entry: python tools/validate_config.py
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
    pass_filenames: true
 | 
			
		||||
    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
 | 
			
		||||
  # Keep `suggestion` last
 | 
			
		||||
  - id: suggestion
 | 
			
		||||
    name: Suggestion
 | 
			
		||||
    entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."'
 | 
			
		||||
    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
 | 
			
		||||
    language: system
 | 
			
		||||
    verbose: true
 | 
			
		||||
    pass_filenames: false
 | 
			
		||||
 | 
			
		||||
@ -8,8 +8,12 @@ build:
 | 
			
		||||
  tools:
 | 
			
		||||
    python: "3.12"
 | 
			
		||||
 | 
			
		||||
mkdocs:
 | 
			
		||||
  configuration: mkdocs.yaml
 | 
			
		||||
sphinx:
 | 
			
		||||
  configuration: docs/source/conf.py
 | 
			
		||||
  fail_on_warning: true
 | 
			
		||||
 | 
			
		||||
# If using Sphinx, optionally build your docs in additional formats such as PDF
 | 
			
		||||
formats: []
 | 
			
		||||
 | 
			
		||||
# Optionally declare the Python requirements required to build your docs
 | 
			
		||||
python:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										326
									
								
								CMakeLists.txt
									
									
									
									
									
								
							
							
						
						
									
										326
									
								
								CMakeLists.txt
									
									
									
									
									
								
							@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)
 | 
			
		||||
 | 
			
		||||
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 | 
			
		||||
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 | 
			
		||||
 | 
			
		||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 | 
			
		||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
 | 
			
		||||
 | 
			
		||||
@ -23,17 +24,17 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 | 
			
		||||
# Suppress potential warnings about unused manually-specified variables
 | 
			
		||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
 | 
			
		||||
 | 
			
		||||
# Prevent installation of dependencies (cutlass) by default.
 | 
			
		||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Supported python versions.  These versions will be searched in order, the
 | 
			
		||||
# first match will be selected.  These should be kept in sync with setup.py.
 | 
			
		||||
#
 | 
			
		||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 | 
			
		||||
 | 
			
		||||
# Supported NVIDIA architectures.
 | 
			
		||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 | 
			
		||||
 | 
			
		||||
# Supported AMD GPU architectures.
 | 
			
		||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 | 
			
		||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Supported/expected torch versions for CUDA/ROCm.
 | 
			
		||||
@ -43,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 | 
			
		||||
#
 | 
			
		||||
# Note: the CUDA torch version is derived from pyproject.toml and various
 | 
			
		||||
# requirements.txt files and should be kept consistent.  The ROCm torch
 | 
			
		||||
# versions are derived from docker/Dockerfile.rocm
 | 
			
		||||
# versions are derived from Dockerfile.rocm
 | 
			
		||||
#
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Try to find python package with an executable that exactly matches
 | 
			
		||||
@ -79,15 +80,6 @@ endif()
 | 
			
		||||
#
 | 
			
		||||
find_package(Torch REQUIRED)
 | 
			
		||||
 | 
			
		||||
# Supported NVIDIA architectures.
 | 
			
		||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 | 
			
		||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
 | 
			
		||||
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
 | 
			
		||||
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 | 
			
		||||
else()
 | 
			
		||||
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Forward the non-CUDA device extensions to external CMake scripts.
 | 
			
		||||
#
 | 
			
		||||
@ -171,6 +163,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 | 
			
		||||
# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
 | 
			
		||||
@ -181,6 +174,9 @@ include(FetchContent)
 | 
			
		||||
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 | 
			
		||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Set rocm version dev int.
 | 
			
		||||
#
 | 
			
		||||
if(VLLM_GPU_LANG STREQUAL "HIP")
 | 
			
		||||
  #
 | 
			
		||||
  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
 | 
			
		||||
@ -188,6 +184,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 | 
			
		||||
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
 | 
			
		||||
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  #
 | 
			
		||||
  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
 | 
			
		||||
  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
 | 
			
		||||
@ -230,34 +227,29 @@ endif()
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
set(VLLM_EXT_SRC
 | 
			
		||||
  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
 | 
			
		||||
  "csrc/cache_kernels.cu"
 | 
			
		||||
  "csrc/block_table.cu"
 | 
			
		||||
  "csrc/attention/paged_attention_v1.cu"
 | 
			
		||||
  "csrc/attention/paged_attention_v2.cu"
 | 
			
		||||
  "csrc/attention/merge_attn_states.cu"
 | 
			
		||||
  "csrc/attention/vertical_slash_index.cu"
 | 
			
		||||
  "csrc/pos_encoding_kernels.cu"
 | 
			
		||||
  "csrc/activation_kernels.cu"
 | 
			
		||||
  "csrc/layernorm_kernels.cu"
 | 
			
		||||
  "csrc/layernorm_quant_kernels.cu"
 | 
			
		||||
  "csrc/sampler.cu"
 | 
			
		||||
  "csrc/cuda_view.cu"
 | 
			
		||||
  "csrc/quantization/gptq/q_gemm.cu"
 | 
			
		||||
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
 | 
			
		||||
  "csrc/quantization/fp8/common.cu"
 | 
			
		||||
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
 | 
			
		||||
  "csrc/quantization/gguf/gguf_kernel.cu"
 | 
			
		||||
  "csrc/quantization/activation_kernels.cu"
 | 
			
		||||
  "csrc/cuda_utils_kernels.cu"
 | 
			
		||||
  "csrc/prepare_inputs/advance_step.cu"
 | 
			
		||||
  "csrc/custom_all_reduce.cu"
 | 
			
		||||
  "csrc/torch_bindings.cpp")
 | 
			
		||||
 | 
			
		||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 | 
			
		||||
 | 
			
		||||
  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
 | 
			
		||||
  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
 | 
			
		||||
  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
 | 
			
		||||
  # Please keep this in sync with FetchContent_Declare line below.
 | 
			
		||||
  set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
 | 
			
		||||
 | 
			
		||||
  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
 | 
			
		||||
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
 | 
			
		||||
@ -275,7 +267,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
        cutlass
 | 
			
		||||
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
 | 
			
		||||
        # Please keep this in sync with CUTLASS_REVISION line above.
 | 
			
		||||
        GIT_TAG ${CUTLASS_REVISION}
 | 
			
		||||
        GIT_TAG v3.8.0
 | 
			
		||||
        GIT_PROGRESS TRUE
 | 
			
		||||
 | 
			
		||||
        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
 | 
			
		||||
@ -287,16 +279,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  FetchContent_MakeAvailable(cutlass)
 | 
			
		||||
 | 
			
		||||
  list(APPEND VLLM_EXT_SRC
 | 
			
		||||
    "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
 | 
			
		||||
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
 | 
			
		||||
    "csrc/quantization/aqlm/gemm_kernels.cu"
 | 
			
		||||
    "csrc/quantization/awq/gemm_kernels.cu"
 | 
			
		||||
    "csrc/custom_all_reduce.cu"
 | 
			
		||||
    "csrc/permute_cols.cu"
 | 
			
		||||
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
 | 
			
		||||
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
 | 
			
		||||
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
 | 
			
		||||
    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
 | 
			
		||||
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
 | 
			
		||||
    "csrc/cutlass_extensions/common.cpp"
 | 
			
		||||
    "csrc/attention/mla/cutlass_mla_entry.cu")
 | 
			
		||||
    "csrc/cutlass_extensions/common.cpp")
 | 
			
		||||
 | 
			
		||||
  set_gencode_flags_for_srcs(
 | 
			
		||||
    SRCS "${VLLM_EXT_SRC}"
 | 
			
		||||
@ -305,55 +298,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  # Only build Marlin kernels if we are building for at least some compatible archs.
 | 
			
		||||
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
 | 
			
		||||
  # are not supported by Machete yet.
 | 
			
		||||
  # 9.0 for latest bf16 atomicAdd PTX
 | 
			
		||||
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
 | 
			
		||||
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 | 
			
		||||
  if (MARLIN_ARCHS)
 | 
			
		||||
 | 
			
		||||
    #
 | 
			
		||||
    # For the Marlin kernels we automatically generate sources for various
 | 
			
		||||
    # preselected input type pairs and schedules.
 | 
			
		||||
    # Generate sources:
 | 
			
		||||
    set(MARLIN_GEN_SCRIPT
 | 
			
		||||
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
 | 
			
		||||
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
 | 
			
		||||
 | 
			
		||||
    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
 | 
			
		||||
    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
 | 
			
		||||
 | 
			
		||||
    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
 | 
			
		||||
        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
 | 
			
		||||
      execute_process(
 | 
			
		||||
        COMMAND ${CMAKE_COMMAND} -E env
 | 
			
		||||
        PYTHONPATH=$PYTHONPATH
 | 
			
		||||
          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
 | 
			
		||||
        RESULT_VARIABLE marlin_generation_result
 | 
			
		||||
        OUTPUT_VARIABLE marlin_generation_result
 | 
			
		||||
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
 | 
			
		||||
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
 | 
			
		||||
      )
 | 
			
		||||
 | 
			
		||||
      if (NOT marlin_generation_result EQUAL 0)
 | 
			
		||||
        message(FATAL_ERROR "Marlin generation failed."
 | 
			
		||||
                            " Result: \"${marlin_generation_result}\""
 | 
			
		||||
                            "\nCheck the log for details: "
 | 
			
		||||
                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
 | 
			
		||||
      else()
 | 
			
		||||
        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
 | 
			
		||||
            CACHE STRING "Last run Marlin generate script hash" FORCE)
 | 
			
		||||
        message(STATUS "Marlin generation completed successfully.")
 | 
			
		||||
      endif()
 | 
			
		||||
    else()
 | 
			
		||||
      message(STATUS "Marlin generation script has not changed, skipping generation.")
 | 
			
		||||
    endif()
 | 
			
		||||
 | 
			
		||||
    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
 | 
			
		||||
      CUDA_ARCHS "${MARLIN_ARCHS}")
 | 
			
		||||
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
 | 
			
		||||
 | 
			
		||||
    set(MARLIN_SRCS
 | 
			
		||||
       "csrc/quantization/fp8/fp8_marlin.cu"
 | 
			
		||||
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
 | 
			
		||||
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
 | 
			
		||||
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
 | 
			
		||||
@ -391,7 +339,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
 | 
			
		||||
  # CUDA 12.0 or later
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
 | 
			
		||||
    set(SRCS
 | 
			
		||||
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
 | 
			
		||||
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
 | 
			
		||||
@ -407,7 +355,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
 | 
			
		||||
    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
 | 
			
		||||
      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
 | 
			
		||||
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
 | 
			
		||||
                     "later if you intend on running FP8 quantized models on "
 | 
			
		||||
@ -418,44 +366,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    endif()
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
 | 
			
		||||
  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
 | 
			
		||||
  # CUDA 12.8 or later
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
 | 
			
		||||
    set(SRCS
 | 
			
		||||
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
 | 
			
		||||
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
 | 
			
		||||
    )
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC "${SRCS}")
 | 
			
		||||
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM120=1")
 | 
			
		||||
    # Let scaled_mm_c2x know it doesn't need to build these arches
 | 
			
		||||
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
 | 
			
		||||
    message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
 | 
			
		||||
      message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
 | 
			
		||||
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
 | 
			
		||||
                     "later if you intend on running FP8 quantized models on "
 | 
			
		||||
                     "Blackwell.")
 | 
			
		||||
    else()
 | 
			
		||||
      message(STATUS "Not building scaled_mm_c3x_120 as no compatible archs found "
 | 
			
		||||
                     "in CUDA target architectures")
 | 
			
		||||
    endif()
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
 | 
			
		||||
  # require CUDA 12.8 or later
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
 | 
			
		||||
    set(SRCS
 | 
			
		||||
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
 | 
			
		||||
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
 | 
			
		||||
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
 | 
			
		||||
    )
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
@ -466,7 +383,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
 | 
			
		||||
    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
 | 
			
		||||
      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
 | 
			
		||||
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
 | 
			
		||||
                     "later if you intend on running FP8 quantized models on "
 | 
			
		||||
@ -480,9 +397,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  #
 | 
			
		||||
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
 | 
			
		||||
  # kernels for the remaining archs that are not already built for 3x.
 | 
			
		||||
  # (Build 8.9 for FP8)
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
 | 
			
		||||
    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
 | 
			
		||||
    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 | 
			
		||||
  # subtract out the archs that are already built for 3x
 | 
			
		||||
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
 | 
			
		||||
  if (SCALED_MM_2X_ARCHS)
 | 
			
		||||
@ -509,7 +425,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
 | 
			
		||||
  # require CUDA 12.2 or later (and only work on Hopper).
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
 | 
			
		||||
    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
@ -518,7 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
 | 
			
		||||
    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
 | 
			
		||||
      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
 | 
			
		||||
                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
 | 
			
		||||
                     "if you intend on running FP8 sparse quantized models on Hopper.")
 | 
			
		||||
@ -530,18 +446,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
 | 
			
		||||
  # FP4 Archs and flags
 | 
			
		||||
  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
 | 
			
		||||
    set(SRCS
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
 | 
			
		||||
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
      CUDA_ARCHS "${FP4_ARCHS}")
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC "${SRCS}")
 | 
			
		||||
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
 | 
			
		||||
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
 | 
			
		||||
    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    message(STATUS "Not building NVFP4 as no compatible archs were found.")
 | 
			
		||||
@ -549,99 +462,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    set(FP4_ARCHS)
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
  # CUTLASS MLA Archs and flags
 | 
			
		||||
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
 | 
			
		||||
    set(SRCS
 | 
			
		||||
      "csrc/attention/mla/cutlass_mla_kernels.cu"
 | 
			
		||||
      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
      CUDA_ARCHS "${MLA_ARCHS}")
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC "${SRCS}")
 | 
			
		||||
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
 | 
			
		||||
    # Add MLA-specific include directories only to MLA source files
 | 
			
		||||
    set_source_files_properties(${SRCS}
 | 
			
		||||
      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
 | 
			
		||||
    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
 | 
			
		||||
    # clear MLA_ARCHS
 | 
			
		||||
    set(MLA_ARCHS)
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
  # CUTLASS MoE kernels
 | 
			
		||||
 | 
			
		||||
  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
 | 
			
		||||
  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
 | 
			
		||||
  # if it's possible to compile MoE kernels that use its output.
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
 | 
			
		||||
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC "${SRCS}")
 | 
			
		||||
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
 | 
			
		||||
    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
 | 
			
		||||
      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
 | 
			
		||||
                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
 | 
			
		||||
                     "if you intend on running FP8 quantized MoE models on Hopper.")
 | 
			
		||||
    else()
 | 
			
		||||
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
 | 
			
		||||
                     "in CUDA target architectures.")
 | 
			
		||||
    endif()
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
  # moe_data.cu is used by all CUTLASS MoE kernels.
 | 
			
		||||
  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
 | 
			
		||||
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
      CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC "${SRCS}")
 | 
			
		||||
    message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
 | 
			
		||||
      message(STATUS "Not building moe_data as CUDA Compiler version is "
 | 
			
		||||
                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
 | 
			
		||||
                     "if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
 | 
			
		||||
    else()
 | 
			
		||||
      message(STATUS "Not building moe_data as no compatible archs found "
 | 
			
		||||
                     "in CUDA target architectures.")
 | 
			
		||||
    endif()
 | 
			
		||||
  endif()
 | 
			
		||||
  
 | 
			
		||||
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
 | 
			
		||||
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${SRCS}"
 | 
			
		||||
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
 | 
			
		||||
    list(APPEND VLLM_EXT_SRC "${SRCS}")
 | 
			
		||||
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
 | 
			
		||||
    message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
 | 
			
		||||
      message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
 | 
			
		||||
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
 | 
			
		||||
                     "if you intend on running FP8 quantized MoE models on Blackwell.")
 | 
			
		||||
    else()
 | 
			
		||||
      message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
 | 
			
		||||
                     "in CUDA target architectures")
 | 
			
		||||
    endif()
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
  #
 | 
			
		||||
  # Machete kernels
 | 
			
		||||
 | 
			
		||||
  # The machete kernels only work on hopper and require CUDA 12.0 or later.
 | 
			
		||||
  # Only build Machete kernels if we are building for something compatible with sm90a
 | 
			
		||||
  cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS)
 | 
			
		||||
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
 | 
			
		||||
    #
 | 
			
		||||
    # For the Machete kernels we automatically generate sources for various
 | 
			
		||||
    # preselected input type pairs and schedules.
 | 
			
		||||
@ -693,7 +520,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
 | 
			
		||||
    message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
 | 
			
		||||
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
 | 
			
		||||
        AND MACHETE_ARCHS)
 | 
			
		||||
      message(STATUS "Not building Machete kernels as CUDA Compiler version is "
 | 
			
		||||
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
 | 
			
		||||
@ -707,14 +534,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
# if CUDA endif
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
if (VLLM_GPU_LANG STREQUAL "HIP")
 | 
			
		||||
  # Add QuickReduce kernels
 | 
			
		||||
  list(APPEND VLLM_EXT_SRC
 | 
			
		||||
    "csrc/custom_quickreduce.cu"
 | 
			
		||||
  )
 | 
			
		||||
# if ROCM endif
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
message(STATUS "Enabling C extension.")
 | 
			
		||||
define_gpu_extension_target(
 | 
			
		||||
  _C
 | 
			
		||||
@ -760,54 +579,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    CUDA_ARCHS "${CUDA_ARCHS}")
 | 
			
		||||
 | 
			
		||||
  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
 | 
			
		||||
  # 9.0 for latest bf16 atomicAdd PTX
 | 
			
		||||
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
 | 
			
		||||
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
 | 
			
		||||
  if (MARLIN_MOE_ARCHS)
 | 
			
		||||
    set(MARLIN_MOE_SRC
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
 | 
			
		||||
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
 | 
			
		||||
        "csrc/moe/marlin_moe_ops.cu")
 | 
			
		||||
 | 
			
		||||
    #
 | 
			
		||||
    # For the Marlin MOE kernels we automatically generate sources for various
 | 
			
		||||
    # preselected input type pairs and schedules.
 | 
			
		||||
    # Generate sources:
 | 
			
		||||
    set(MOE_MARLIN_GEN_SCRIPT
 | 
			
		||||
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
 | 
			
		||||
    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
 | 
			
		||||
 | 
			
		||||
    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
 | 
			
		||||
    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
 | 
			
		||||
 | 
			
		||||
    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
 | 
			
		||||
        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
 | 
			
		||||
      execute_process(
 | 
			
		||||
        COMMAND ${CMAKE_COMMAND} -E env
 | 
			
		||||
        PYTHONPATH=$PYTHONPATH
 | 
			
		||||
          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
 | 
			
		||||
        RESULT_VARIABLE moe_marlin_generation_result
 | 
			
		||||
        OUTPUT_VARIABLE moe_marlin_generation_output
 | 
			
		||||
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
 | 
			
		||||
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
 | 
			
		||||
      )
 | 
			
		||||
 | 
			
		||||
      if (NOT moe_marlin_generation_result EQUAL 0)
 | 
			
		||||
        message(FATAL_ERROR "Marlin MOE generation failed."
 | 
			
		||||
                            " Result: \"${moe_marlin_generation_result}\""
 | 
			
		||||
                            "\nCheck the log for details: "
 | 
			
		||||
                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
 | 
			
		||||
      else()
 | 
			
		||||
        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
 | 
			
		||||
            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
 | 
			
		||||
        message(STATUS "Marlin MOE generation completed successfully.")
 | 
			
		||||
      endif()
 | 
			
		||||
    else()
 | 
			
		||||
      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
 | 
			
		||||
    endif()
 | 
			
		||||
 | 
			
		||||
    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
 | 
			
		||||
    set_gencode_flags_for_srcs(
 | 
			
		||||
      SRCS "${MOE_WNAA16_MARLIN_SRC}"
 | 
			
		||||
      SRCS "${MARLIN_MOE_SRC}"
 | 
			
		||||
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
 | 
			
		||||
 | 
			
		||||
    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
 | 
			
		||||
 | 
			
		||||
    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
 | 
			
		||||
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
 | 
			
		||||
  else()
 | 
			
		||||
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
 | 
			
		||||
@ -815,17 +603,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  endif()
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  set(MOE_PERMUTE_SRC
 | 
			
		||||
      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
 | 
			
		||||
      "csrc/moe/moe_permute_unpermute_op.cu")
 | 
			
		||||
 | 
			
		||||
  set_gencode_flags_for_srcs(
 | 
			
		||||
    SRCS "${MARLIN_PERMUTE_SRC}"
 | 
			
		||||
    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
 | 
			
		||||
 | 
			
		||||
  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
 | 
			
		||||
endif()
 | 
			
		||||
message(STATUS "Enabling moe extension.")
 | 
			
		||||
define_gpu_extension_target(
 | 
			
		||||
  _moe_C
 | 
			
		||||
@ -834,8 +611,6 @@ define_gpu_extension_target(
 | 
			
		||||
  SOURCES ${VLLM_MOE_EXT_SRC}
 | 
			
		||||
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
 | 
			
		||||
  ARCHITECTURES ${VLLM_GPU_ARCHES}
 | 
			
		||||
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
 | 
			
		||||
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
 | 
			
		||||
  USE_SABI 3
 | 
			
		||||
  WITH_SOABI)
 | 
			
		||||
 | 
			
		||||
@ -845,7 +620,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 | 
			
		||||
  #
 | 
			
		||||
  set(VLLM_ROCM_EXT_SRC
 | 
			
		||||
    "csrc/rocm/torch_bindings.cpp"
 | 
			
		||||
    "csrc/rocm/skinny_gemms.cu"
 | 
			
		||||
    "csrc/rocm/attention.cu")
 | 
			
		||||
 | 
			
		||||
  define_gpu_extension_target(
 | 
			
		||||
@ -862,7 +636,5 @@ endif()
 | 
			
		||||
# For CUDA we also build and ship some external projects.
 | 
			
		||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
    include(cmake/external_projects/flashmla.cmake)
 | 
			
		||||
 | 
			
		||||
    # vllm-flash-attn should be last as it overwrites some CMake functions
 | 
			
		||||
    include(cmake/external_projects/vllm_flash_attn.cmake)
 | 
			
		||||
endif ()
 | 
			
		||||
 | 
			
		||||
@ -1,3 +1,3 @@
 | 
			
		||||
# Contributing to vLLM
 | 
			
		||||
 | 
			
		||||
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
 | 
			
		||||
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
 | 
			
		||||
 | 
			
		||||
@ -1,33 +1,32 @@
 | 
			
		||||
# The vLLM Dockerfile is used to construct a vLLM image against torch nightly that can be directly used for testing
# The vLLM Dockerfile is used to construct a vLLM image that can be directly used
 | 
			
		||||
# to run the OpenAI compatible server.
 | 
			
		||||
 | 
			
		||||
# for torch nightly, cuda >=12.6 is required,
 | 
			
		||||
# use 12.8 due to FlashAttention issue with cuda 12.6 (https://github.com/vllm-project/vllm/issues/15435#issuecomment-2775924628)
 | 
			
		||||
ARG CUDA_VERSION=12.8.0
 | 
			
		||||
#
 | 
			
		||||
# Please update any changes made here to
 | 
			
		||||
# docs/source/contributing/dockerfile/dockerfile.md and
 | 
			
		||||
# docs/source/assets/contributing/dockerfile-stages-dependency.png
 | 
			
		||||
 | 
			
		||||
ARG CUDA_VERSION=12.4.1
 | 
			
		||||
#################### BASE BUILD IMAGE ####################
 | 
			
		||||
# prepare basic build environment
 | 
			
		||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 | 
			
		||||
ARG CUDA_VERSION=12.8.0
 | 
			
		||||
ARG CUDA_VERSION=12.4.1
 | 
			
		||||
ARG PYTHON_VERSION=3.12
 | 
			
		||||
ARG TARGETPLATFORM
 | 
			
		||||
ENV DEBIAN_FRONTEND=noninteractive
 | 
			
		||||
 | 
			
		||||
# Install Python and other dependencies
 | 
			
		||||
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 | 
			
		||||
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
 | 
			
		||||
    && apt-get update -y \
 | 
			
		||||
    && apt-get install -y ccache software-properties-common git curl sudo \
 | 
			
		||||
    && for i in 1 2 3; do \
 | 
			
		||||
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
 | 
			
		||||
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
 | 
			
		||||
    done \
 | 
			
		||||
    && add-apt-repository ppa:deadsnakes/ppa \
 | 
			
		||||
    && apt-get update -y \
 | 
			
		||||
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
 | 
			
		||||
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
 | 
			
		||||
    && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
 | 
			
		||||
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
 | 
			
		||||
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
 | 
			
		||||
    && python3 --version \
 | 
			
		||||
    && python3 -m pip --version
 | 
			
		||||
    && python3 --version && python3 -m pip --version
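The for-loop in the RUN step above retries the flaky add-apt-repository call up to three times with a 5-second pause between attempts. The same retry idea, as a small standalone Python helper (the function name and usage are illustrative, not part of the image build):

# Same retry-with-delay idea as the shell loop above (illustrative helper only).
import subprocess
import time


def run_with_retries(cmd: list[str], attempts: int = 3, delay_s: float = 5.0) -> None:
    for attempt in range(1, attempts + 1):
        try:
            subprocess.run(cmd, check=True)
            return
        except subprocess.CalledProcessError:
            if attempt == attempts:
                raise
            print(f"Attempt {attempt} failed, retrying in {delay_s:.0f}s...")
            time.sleep(delay_s)


# Example (hypothetical): run_with_retries(["add-apt-repository", "-y", "ppa:deadsnakes/ppa"])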
 | 
			
		||||
# Install uv for faster pip installs
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    python3 -m pip install uv
 | 
			
		||||
@ -53,67 +52,28 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 | 
			
		||||
WORKDIR /workspace
 | 
			
		||||
 | 
			
		||||
# install build and runtime dependencies
 | 
			
		||||
 | 
			
		||||
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
 | 
			
		||||
# we need to install torch and torchvision from the nightly builds first,
 | 
			
		||||
# pytorch will not appear as a vLLM dependency in all of the following steps
 | 
			
		||||
# after this step
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
 | 
			
		||||
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
 | 
			
		||||
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
COPY requirements/common.txt requirements/common.txt
 | 
			
		||||
COPY use_existing_torch.py use_existing_torch.py
 | 
			
		||||
COPY pyproject.toml pyproject.toml
 | 
			
		||||
 | 
			
		||||
# install build and runtime dependencies without stable torch version
 | 
			
		||||
RUN python3 use_existing_torch.py
 | 
			
		||||
 | 
			
		||||
# install torch nightly
 | 
			
		||||
ARG PINNED_TORCH_VERSION
 | 
			
		||||
COPY requirements/cuda.txt requirements/cuda.txt
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    if [ -n "$PINNED_TORCH_VERSION" ]; then \
 | 
			
		||||
      pkgs="$PINNED_TORCH_VERSION"; \
 | 
			
		||||
    else \
 | 
			
		||||
      pkgs="torch torchaudio torchvision"; \
 | 
			
		||||
    fi && \
 | 
			
		||||
    uv pip install --system $pkgs --index-url https://download.pytorch.org/whl/nightly/cu128
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system numba==0.61.2
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
uv pip install --system -r requirements/common.txt
 | 
			
		||||
 | 
			
		||||
# must be set before installing xformers, so that the correct version of xformers is installed.
 | 
			
		||||
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
 | 
			
		||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 | 
			
		||||
 | 
			
		||||
# Build xformers with cuda and torch nightly
 | 
			
		||||
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
 | 
			
		||||
# todo(elainewy): cache xformers build result for faster build
 | 
			
		||||
ARG max_jobs=16
 | 
			
		||||
ENV MAX_JOBS=${max_jobs}
 | 
			
		||||
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
 | 
			
		||||
 | 
			
		||||
ENV CCACHE_DIR=/root/.cache/ccache
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/ccache \
 | 
			
		||||
     --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    echo 'git clone xformers...' \
 | 
			
		||||
    && git clone https://github.com/facebookresearch/xformers.git --recursive \
 | 
			
		||||
    && cd xformers \
 | 
			
		||||
    && git checkout ${XFORMERS_COMMIT} \
 | 
			
		||||
    && git submodule update --init --recursive \
 | 
			
		||||
    && echo 'finish git clone xformers...' \
 | 
			
		||||
    && rm -rf build \
 | 
			
		||||
    && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
 | 
			
		||||
    && cd .. \
 | 
			
		||||
    && rm -rf xformers
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system xformers-dist/*.whl --verbose
 | 
			
		||||
 | 
			
		||||
# the build can take a long time, and the torch nightly version fetched from the URL can differ in the next docker stage.
# track the nightly torch version used in the build so that, when we set up the runtime environment, we can make sure the version is the same
 | 
			
		||||
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
 | 
			
		||||
RUN cat torch_build_versions.txt
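torch_build_versions.txt records the exact nightly torch/torchvision/torchaudio versions captured at build time, so the runtime stage can install the same ones. A small illustrative checker for that file could look like the sketch below (only the file name comes from the Dockerfile; the verification script itself is an assumption):

# Illustrative check (not part of the image build): compare the torch-family
# versions recorded at build time against what is installed in the current env.
from importlib.metadata import PackageNotFoundError, version


def read_pinned(path: str = "torch_build_versions.txt") -> dict[str, str]:
    # Parse "name==version" lines as produced by `pip freeze`.
    pinned: dict[str, str] = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or "==" not in line:
                continue
            name, ver = line.split("==", 1)
            pinned[name] = ver
    return pinned


def check(pinned: dict[str, str]) -> None:
    for name, expected in pinned.items():
        try:
            installed = version(name)
        except PackageNotFoundError:
            raise SystemExit(f"{name} is pinned to {expected} but is not installed")
        if installed != expected:
            raise SystemExit(f"{name}: expected {expected}, found {installed}")
    print("torch family versions match the build")


if __name__ == "__main__":
    check(read_pinned())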
 | 
			
		||||
    uv pip install --system -r requirements/cuda.txt
 | 
			
		||||
 | 
			
		||||
# cuda arch list used by torch
 | 
			
		||||
# can be useful for `test`
 | 
			
		||||
# can be useful for both `dev` and `test`
 | 
			
		||||
# explicitly set the list to avoid issues with torch 2.2
 | 
			
		||||
# see https://github.com/pytorch/pytorch/pull/123243
 | 
			
		||||
 | 
			
		||||
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 | 
			
		||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 | 
			
		||||
# Override the arch list for flash-attn to reduce the binary size
 | 
			
		||||
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
 | 
			
		||||
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 | 
			
		||||
@ -123,32 +83,32 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 | 
			
		||||
FROM base AS build
 | 
			
		||||
ARG TARGETPLATFORM
 | 
			
		||||
 | 
			
		||||
# install build dependencies
 | 
			
		||||
COPY requirements/build.txt requirements/build.txt
 | 
			
		||||
 | 
			
		||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 | 
			
		||||
# Reference: https://github.com/astral-sh/uv/pull/1694
 | 
			
		||||
ENV UV_HTTP_TIMEOUT=500
 | 
			
		||||
 | 
			
		||||
COPY . .
 | 
			
		||||
 | 
			
		||||
RUN python3 use_existing_torch.py
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system -r requirements/build.txt
 | 
			
		||||
 | 
			
		||||
COPY . .
 | 
			
		||||
ARG GIT_REPO_CHECK=0
 | 
			
		||||
RUN --mount=type=bind,source=.git,target=.git \
 | 
			
		||||
    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
 | 
			
		||||
 | 
			
		||||
# Max jobs used by Ninja to build extensions
 | 
			
		||||
ARG max_jobs=16
 | 
			
		||||
# max jobs used by Ninja to build extensions
 | 
			
		||||
ARG max_jobs=2
 | 
			
		||||
ENV MAX_JOBS=${max_jobs}
 | 
			
		||||
ARG nvcc_threads=2
 | 
			
		||||
# number of threads used by nvcc
 | 
			
		||||
ARG nvcc_threads=8
 | 
			
		||||
ENV NVCC_THREADS=$nvcc_threads
 | 
			
		||||
 | 
			
		||||
ARG USE_SCCACHE
 | 
			
		||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
 | 
			
		||||
ARG SCCACHE_REGION_NAME=us-west-2
 | 
			
		||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
 | 
			
		||||
 | 
			
		||||
# if USE_SCCACHE is set, use sccache to speed up compilation
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    --mount=type=bind,source=.git,target=.git \
 | 
			
		||||
@ -179,13 +139,38 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
 | 
			
		||||
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
#################### WHEEL BUILD IMAGE ####################
 | 
			
		||||
# Check the size of the wheel if RUN_WHEEL_CHECK is true
 | 
			
		||||
COPY .buildkite/check-wheel-size.py check-wheel-size.py
 | 
			
		||||
# sync the default value with .buildkite/check-wheel-size.py
 | 
			
		||||
ARG VLLM_MAX_SIZE_MB=400
 | 
			
		||||
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
 | 
			
		||||
ARG RUN_WHEEL_CHECK=true
 | 
			
		||||
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
 | 
			
		||||
        python3 check-wheel-size.py dist; \
 | 
			
		||||
    else \
 | 
			
		||||
        echo "Skipping wheel size check."; \
 | 
			
		||||
    fi
 | 
			
		||||
#################### EXTENSION Build IMAGE ####################
 | 
			
		||||
 | 
			
		||||
################### VLLM INSTALLED IMAGE ####################
 | 
			
		||||
# Setup clean environment for vLLM and its dependencies for test and api server using ubuntu22.04 with AOT flashinfer
 | 
			
		||||
#################### DEV IMAGE ####################
 | 
			
		||||
FROM base as dev
 | 
			
		||||
 | 
			
		||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 | 
			
		||||
# Reference: https://github.com/astral-sh/uv/pull/1694
 | 
			
		||||
ENV UV_HTTP_TIMEOUT=500
 | 
			
		||||
 | 
			
		||||
COPY requirements/lint.txt requirements/lint.txt
 | 
			
		||||
COPY requirements/test.txt requirements/test.txt
 | 
			
		||||
COPY requirements/dev.txt requirements/dev.txt
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system -r requirements/dev.txt
 | 
			
		||||
#################### DEV IMAGE ####################
 | 
			
		||||
 | 
			
		||||
#################### vLLM installation IMAGE ####################
 | 
			
		||||
# image with vLLM installed
 | 
			
		||||
# TODO: Restore to base image after FlashInfer AOT wheel fixed
 | 
			
		||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
 | 
			
		||||
# prepare the runtime environment
 | 
			
		||||
ARG CUDA_VERSION=12.8.0
 | 
			
		||||
ARG CUDA_VERSION=12.4.1
 | 
			
		||||
ARG PYTHON_VERSION=3.12
 | 
			
		||||
WORKDIR /vllm-workspace
 | 
			
		||||
ENV DEBIAN_FRONTEND=noninteractive
 | 
			
		||||
@ -200,10 +185,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 | 
			
		||||
    && apt-get update -y \
 | 
			
		||||
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
 | 
			
		||||
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
 | 
			
		||||
    && for i in 1 2 3; do \
 | 
			
		||||
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
 | 
			
		||||
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
 | 
			
		||||
    done \
 | 
			
		||||
    && add-apt-repository ppa:deadsnakes/ppa \
 | 
			
		||||
    && apt-get update -y \
 | 
			
		||||
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
 | 
			
		||||
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
 | 
			
		||||
@ -211,7 +193,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 | 
			
		||||
    && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
 | 
			
		||||
    && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
 | 
			
		||||
    && python3 --version && python3 -m pip --version
 | 
			
		||||
 | 
			
		||||
# Install uv for faster pip installs
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    python3 -m pip install uv
 | 
			
		||||
 | 
			
		||||
@ -225,78 +207,65 @@ ENV UV_HTTP_TIMEOUT=500
 | 
			
		||||
# or future versions of triton.
 | 
			
		||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 | 
			
		||||
 | 
			
		||||
# get the nightly torch version used in the build to make sure the version is the same
 | 
			
		||||
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
 | 
			
		||||
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
 | 
			
		||||
# we need to install torch and torchvision from the nightly builds first,
 | 
			
		||||
# pytorch will not appear as a vLLM dependency in all of the following steps
 | 
			
		||||
# after this step
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
 | 
			
		||||
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
 | 
			
		||||
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
# Install vllm wheel first, so that torch etc will be installed.
 | 
			
		||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
 | 
			
		||||
    --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system dist/*.whl --verbose
 | 
			
		||||
 | 
			
		||||
# If we need to build FlashInfer wheel before its release:
 | 
			
		||||
# $ export FLASHINFER_ENABLE_AOT=1
 | 
			
		||||
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
 | 
			
		||||
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
 | 
			
		||||
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
 | 
			
		||||
# $ cd flashinfer
 | 
			
		||||
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
 | 
			
		||||
# $ rm -rf build
 | 
			
		||||
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
 | 
			
		||||
# $ ls dist
 | 
			
		||||
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128
 | 
			
		||||
 | 
			
		||||
# install the vllm wheel
 | 
			
		||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
 | 
			
		||||
    --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system vllm-dist/*.whl --verbose
 | 
			
		||||
 | 
			
		||||
# install xformers again for the new environment
 | 
			
		||||
RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
 | 
			
		||||
    --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose
 | 
			
		||||
 | 
			
		||||
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
 | 
			
		||||
 | 
			
		||||
# install packages required to build flashinfer
 | 
			
		||||
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
 | 
			
		||||
RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# build flashinfer for torch nightly from source (takes around 10 minutes)
 | 
			
		||||
# release version: v0.2.2.post1
 | 
			
		||||
# todo(elainewy): cache flashinfer build result for faster build
 | 
			
		||||
ENV CCACHE_DIR=/root/.cache/ccache
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/ccache \
 | 
			
		||||
    --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    echo "git clone flashinfer..." \
 | 
			
		||||
    && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
 | 
			
		||||
    && cd flashinfer \
 | 
			
		||||
    && git checkout v0.2.2.post1 \
 | 
			
		||||
    && git submodule update --init --recursive \
 | 
			
		||||
    && echo "finish git clone flashinfer..." \
 | 
			
		||||
    && rm -rf build \
 | 
			
		||||
    && export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \
 | 
			
		||||
    && FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \
 | 
			
		||||
    && cd .. \
 | 
			
		||||
    && rm -rf flashinfer
 | 
			
		||||
 | 
			
		||||
# install flashinfer
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system flashinfer-dist/*.whl --verbose
 | 
			
		||||
 | 
			
		||||
# install common packages
 | 
			
		||||
COPY requirements/common.txt requirements/common.txt
 | 
			
		||||
COPY use_existing_torch.py use_existing_torch.py
 | 
			
		||||
COPY pyproject.toml pyproject.toml
 | 
			
		||||
 | 
			
		||||
. /etc/environment && \
 | 
			
		||||
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
 | 
			
		||||
    uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
 | 
			
		||||
fi
 | 
			
		||||
COPY examples examples
 | 
			
		||||
COPY benchmarks benchmarks
 | 
			
		||||
COPY ./vllm/collect_env.py .
 | 
			
		||||
 | 
			
		||||
RUN python3 use_existing_torch.py
 | 
			
		||||
# Although we build FlashInfer with AOT mode, there are still
# some issues w.r.t. JIT compilation. Therefore we need to
 | 
			
		||||
# install build dependencies for JIT compilation.
 | 
			
		||||
# TODO: Remove this once FlashInfer AOT wheel is fixed
 | 
			
		||||
COPY requirements/build.txt requirements/build.txt
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system -r requirements/common.txt
 | 
			
		||||
    uv pip install --system -r requirements/build.txt
 | 
			
		||||
 | 
			
		||||
################### VLLM INSTALLED IMAGE ####################
 | 
			
		||||
#################### vLLM installation IMAGE ####################
 | 
			
		||||
 | 
			
		||||
#################### TEST IMAGE ####################
 | 
			
		||||
# image to run unit testing suite
 | 
			
		||||
# note that this uses vllm installed by `pip`
 | 
			
		||||
FROM vllm-base AS test
 | 
			
		||||
 | 
			
		||||
#################### UNITTEST IMAGE #############################
 | 
			
		||||
FROM vllm-base as test
 | 
			
		||||
COPY tests/ tests/
 | 
			
		||||
 | 
			
		||||
# install build and runtime dependencies without stable torch version
 | 
			
		||||
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
 | 
			
		||||
ADD . /vllm-workspace/
 | 
			
		||||
 | 
			
		||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 | 
			
		||||
# Reference: https://github.com/astral-sh/uv/pull/1694
 | 
			
		||||
ENV UV_HTTP_TIMEOUT=500
 | 
			
		||||
 | 
			
		||||
# install development dependencies (for testing)
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system -r requirements/dev.txt
 | 
			
		||||
 | 
			
		||||
# install development dependencies (for testing)
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system -e tests/vllm_test_utils
 | 
			
		||||
@ -306,13 +275,43 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system hf_transfer
 | 
			
		||||
ENV HF_HUB_ENABLE_HF_TRANSFER 1
 | 
			
		||||
 | 
			
		||||
# Copy in the v1 package for testing (it isn't distributed yet)
 | 
			
		||||
COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
 | 
			
		||||
 | 
			
		||||
# docs require the source code
# we hide them inside `test_docs/`, so that this source code
# will not be imported by other tests
 | 
			
		||||
RUN mkdir test_docs
 | 
			
		||||
RUN mv docs test_docs/
 | 
			
		||||
RUN mv vllm test_docs/
 | 
			
		||||
#################### TEST IMAGE ####################
 | 
			
		||||
 | 
			
		||||
#################### OPENAI API SERVER ####################
 | 
			
		||||
# base openai image with additional requirements, for any subsequent openai-style images
 | 
			
		||||
FROM vllm-base AS vllm-openai-base
 | 
			
		||||
 | 
			
		||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 | 
			
		||||
# Reference: https://github.com/astral-sh/uv/pull/1694
 | 
			
		||||
ENV UV_HTTP_TIMEOUT=500
 | 
			
		||||
 | 
			
		||||
# install additional dependencies for openai api server
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    uv pip install --system -r requirements/nightly_torch_test.txt
 | 
			
		||||
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
 | 
			
		||||
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
 | 
			
		||||
    else \
 | 
			
		||||
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
# Logging to confirm the torch versions
 | 
			
		||||
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
 | 
			
		||||
ENV VLLM_USAGE_SOURCE production-docker-image
 | 
			
		||||
 | 
			
		||||
# Logging to confirm all the packages are installed
 | 
			
		||||
RUN pip freeze
 | 
			
		||||
# define sagemaker first, so it is not the default target of `docker build`
 | 
			
		||||
FROM vllm-openai-base AS vllm-sagemaker
 | 
			
		||||
 | 
			
		||||
#################### UNITTEST IMAGE #############################
 | 
			
		||||
COPY examples/online_serving/sagemaker-entrypoint.sh .
 | 
			
		||||
RUN chmod +x sagemaker-entrypoint.sh
 | 
			
		||||
ENTRYPOINT ["./sagemaker-entrypoint.sh"]
 | 
			
		||||
 | 
			
		||||
FROM vllm-openai-base AS vllm-openai
 | 
			
		||||
 | 
			
		||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 | 
			
		||||
#################### OPENAI API SERVER ####################
 | 
			
		||||
							
								
								
									
Dockerfile.cpu (new file, 69 lines)
@ -0,0 +1,69 @@
 | 
			
		||||
# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
 | 
			
		||||
 | 
			
		||||
FROM ubuntu:22.04 AS cpu-test-1
 | 
			
		||||
 | 
			
		||||
ENV CCACHE_DIR=/root/.cache/ccache
 | 
			
		||||
 | 
			
		||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/var/cache/apt \
 | 
			
		||||
    apt-get update -y \
 | 
			
		||||
    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
 | 
			
		||||
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
 | 
			
		||||
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 | 
			
		||||
 | 
			
		||||
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
 | 
			
		||||
# intel-openmp provides additional performance improvement vs. openmp
 | 
			
		||||
# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects.
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    pip install intel-openmp==2025.0.1
 | 
			
		||||
 | 
			
		||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
 | 
			
		||||
 | 
			
		||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
 | 
			
		||||
 | 
			
		||||
RUN pip install intel_extension_for_pytorch==2.6.0
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace
 | 
			
		||||
 | 
			
		||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 | 
			
		||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
 | 
			
		||||
    pip install --upgrade pip && \
 | 
			
		||||
    pip install -r requirements/build.txt
 | 
			
		||||
 | 
			
		||||
FROM cpu-test-1 AS build
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace/vllm
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
 | 
			
		||||
    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
 | 
			
		||||
    pip install -v -r requirements/cpu.txt
 | 
			
		||||
 | 
			
		||||
COPY . .
 | 
			
		||||
ARG GIT_REPO_CHECK=0
 | 
			
		||||
RUN --mount=type=bind,source=.git,target=.git \
 | 
			
		||||
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 | 
			
		||||
 | 
			
		||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 | 
			
		||||
ARG VLLM_CPU_DISABLE_AVX512
 | 
			
		||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    --mount=type=cache,target=/root/.cache/ccache \
 | 
			
		||||
    --mount=type=bind,source=.git,target=.git \
 | 
			
		||||
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
 | 
			
		||||
    pip install dist/*.whl && \
 | 
			
		||||
    rm -rf dist
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace/
 | 
			
		||||
 | 
			
		||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
 | 
			
		||||
 | 
			
		||||
# install development dependencies (for testing)
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/pip \
 | 
			
		||||
    pip install -e tests/vllm_test_utils
 | 
			
		||||
 | 
			
		||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
 | 
			
		||||
@ -1,4 +1,4 @@
 | 
			
		||||
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
 | 
			
		||||
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
 | 
			
		||||
 | 
			
		||||
COPY ./ /workspace/vllm
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,6 @@
 | 
			
		||||
# default base image
 | 
			
		||||
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
 | 
			
		||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
 | 
			
		||||
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
 | 
			
		||||
 | 
			
		||||
FROM $BASE_IMAGE
 | 
			
		||||
 | 
			
		||||
@ -21,8 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
 | 
			
		||||
WORKDIR ${APP_MOUNT}/vllm
 | 
			
		||||
 | 
			
		||||
RUN python3 -m pip install --upgrade pip
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
 | 
			
		||||
RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 | 
			
		||||
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
 | 
			
		||||
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
 | 
			
		||||
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 | 
			
		||||
RUN python3 -m pip install pytest
 | 
			
		||||
 | 
			
		||||
# uninstall transformers-neuronx package explicitly to avoid version conflict
 | 
			
		||||
@ -34,7 +35,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 | 
			
		||||
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
 | 
			
		||||
 | 
			
		||||
RUN python3 -m pip install -U \
 | 
			
		||||
        'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
 | 
			
		||||
        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
 | 
			
		||||
        -r requirements/neuron.txt
 | 
			
		||||
 | 
			
		||||
ENV VLLM_TARGET_DEVICE neuron
 | 
			
		||||
@ -48,8 +49,6 @@ RUN python3 -m pip install -e tests/vllm_test_utils
 | 
			
		||||
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
 | 
			
		||||
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
 | 
			
		||||
 | 
			
		||||
RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
 | 
			
		||||
 | 
			
		||||
# overwrite entrypoint to run bash script
 | 
			
		||||
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
 | 
			
		||||
 | 
			
		||||
@ -1,41 +1,10 @@
 | 
			
		||||
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
 | 
			
		||||
 | 
			
		||||
###############################################################
 | 
			
		||||
# Stage to build openblas
 | 
			
		||||
###############################################################
 | 
			
		||||
 | 
			
		||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
 | 
			
		||||
 | 
			
		||||
ARG MAX_JOBS
 | 
			
		||||
ARG OPENBLAS_VERSION=0.3.29
 | 
			
		||||
RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
 | 
			
		||||
    && source /opt/rh/gcc-toolset-13/enable \
 | 
			
		||||
    && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
 | 
			
		||||
    && unzip OpenBLAS-$OPENBLAS_VERSION.zip \
 | 
			
		||||
    && cd OpenBLAS-$OPENBLAS_VERSION \
 | 
			
		||||
    &&  make -j${MAX_JOBS} TARGET=POWER9 BINARY=64 USE_OPENMP=1 USE_THREAD=1 NUM_THREADS=120 DYNAMIC_ARCH=1 INTERFACE64=0 \
 | 
			
		||||
    && cd /tmp && touch control
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
###############################################################
 | 
			
		||||
# base stage with dependencies coming from centos mirrors
 | 
			
		||||
###############################################################
 | 
			
		||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS centos-deps-builder
 | 
			
		||||
RUN  microdnf install -y dnf && \ 
 | 
			
		||||
     dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
 | 
			
		||||
        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
 | 
			
		||||
        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
 | 
			
		||||
        dnf config-manager --set-enabled crb
 | 
			
		||||
 | 
			
		||||
RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel && \
 | 
			
		||||
    dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-24.el9.noarch 
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
###############################################################
 | 
			
		||||
# base stage with basic dependencies
 | 
			
		||||
###############################################################
 | 
			
		||||
 | 
			
		||||
FROM centos-deps-builder AS base-builder
 | 
			
		||||
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base-builder
 | 
			
		||||
 | 
			
		||||
ARG PYTHON_VERSION=3.12
 | 
			
		||||
ARG OPENBLAS_VERSION=0.3.29
 | 
			
		||||
@ -51,27 +20,29 @@ ENV UV_LINK_MODE=copy
 | 
			
		||||
# Note: A symlink for libatomic.so is created for gcc-13 (linker fails to find libatomic otherwise - reqd. for sentencepiece)
 | 
			
		||||
# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
 | 
			
		||||
#       when `--jobs=<N>` is passed with podman build command
 | 
			
		||||
 | 
			
		||||
COPY --from=openblas-builder /tmp/control /dev/null
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
 | 
			
		||||
    dnf install -y openssl-devel \
 | 
			
		||||
RUN microdnf install -y openssl-devel dnf \
 | 
			
		||||
    && dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
 | 
			
		||||
        https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
 | 
			
		||||
        https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
 | 
			
		||||
    && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
 | 
			
		||||
    && dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
 | 
			
		||||
    && dnf config-manager --set-enabled crb \
 | 
			
		||||
    && dnf install -y \
 | 
			
		||||
       git tar gcc-toolset-13 automake libtool \
 | 
			
		||||
       git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
 | 
			
		||||
       pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
 | 
			
		||||
       libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
 | 
			
		||||
       harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
 | 
			
		||||
       libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel \
 | 
			
		||||
       freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel \
 | 
			
		||||
       harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel \
 | 
			
		||||
       python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
 | 
			
		||||
    && dnf clean all \
 | 
			
		||||
    && PREFIX=/usr/local make -C /openblas install \
 | 
			
		||||
    && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
 | 
			
		||||
    && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
 | 
			
		||||
    && python -m pip install -U pip uv \
 | 
			
		||||
    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
 | 
			
		||||
    && uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python cmake ninja cython scikit_build_core scikit_build \
 | 
			
		||||
    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
 | 
			
		||||
    && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
 | 
			
		||||
    && cd /tmp && touch control
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
###############################################################
 | 
			
		||||
# Stage to build torch family
 | 
			
		||||
###############################################################
 | 
			
		||||
@ -81,8 +52,6 @@ FROM base-builder AS torch-builder
 | 
			
		||||
ARG MAX_JOBS
 | 
			
		||||
ARG TORCH_VERSION=2.6.0
 | 
			
		||||
ARG _GLIBCXX_USE_CXX11_ABI=1
 | 
			
		||||
ARG OPENBLAS_VERSION=0.3.29
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    source /opt/rh/gcc-toolset-13/enable &&  \
 | 
			
		||||
    git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
 | 
			
		||||
@ -144,8 +113,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
        .. && \
 | 
			
		||||
    make install -j ${MAX_JOBS:-$(nproc)} && \
 | 
			
		||||
    cd ../../python/ && \
 | 
			
		||||
    uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
 | 
			
		||||
    pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
 | 
			
		||||
    uv pip install -v -r requirements-wheel-build.txt && \
 | 
			
		||||
    PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
 | 
			
		||||
    python setup.py build_ext \
 | 
			
		||||
    --build-type=release --bundle-arrow-cpp \
 | 
			
		||||
@ -158,19 +126,48 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
FROM base-builder AS cv-builder
 | 
			
		||||
 | 
			
		||||
ARG MAX_JOBS
 | 
			
		||||
ARG OPENCV_VERSION=86
 | 
			
		||||
# patch for version 4.11.0.86
 | 
			
		||||
ARG OPENCV_PATCH=97f3f39
 | 
			
		||||
ARG OPENCV_VERSION=84
 | 
			
		||||
ARG ENABLE_HEADLESS=1
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    source /opt/rh/gcc-toolset-13/enable && \
 | 
			
		||||
    git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
 | 
			
		||||
    cd opencv-python && \
 | 
			
		||||
    sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
 | 
			
		||||
    cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
 | 
			
		||||
    uv pip install scikit-build && \    
 | 
			
		||||
    sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \
 | 
			
		||||
    python -m build --wheel --installer=uv --outdir /opencvwheels/
 | 
			
		||||
 | 
			
		||||
###############################################################
 | 
			
		||||
# Stage to build vllm - this stage builds and installs
 | 
			
		||||
# vllm, tensorizer and vllm-tgis-adapter and builds uv cache
 | 
			
		||||
# for transitive dependencies - eg. grpcio
 | 
			
		||||
###############################################################
 | 
			
		||||
 | 
			
		||||
FROM base-builder AS vllmcache-builder
 | 
			
		||||
 | 
			
		||||
COPY --from=torch-builder /tmp/control /dev/null
 | 
			
		||||
COPY --from=arrow-builder /tmp/control /dev/null
 | 
			
		||||
COPY --from=cv-builder /tmp/control /dev/null
 | 
			
		||||
 | 
			
		||||
ARG VLLM_TARGET_DEVICE=cpu
 | 
			
		||||
 | 
			
		||||
# this step installs vllm and populates uv cache
 | 
			
		||||
# with all the transitive dependencies
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
 | 
			
		||||
    --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
 | 
			
		||||
    --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
 | 
			
		||||
    --mount=type=bind,src=.,dst=/src/,rw \
 | 
			
		||||
    source /opt/rh/gcc-toolset-13/enable && \
 | 
			
		||||
    uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
 | 
			
		||||
    sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
 | 
			
		||||
    uv pip install pandas pythran pybind11 && \
 | 
			
		||||
    # sentencepiece.pc is in some pkgconfig inside uv cache
 | 
			
		||||
    export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
 | 
			
		||||
    uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
 | 
			
		||||
    cd /src/ && \
 | 
			
		||||
    uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
 | 
			
		||||
    uv pip install /vllmwheel/*.whl
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
###############################################################
 | 
			
		||||
# Stage to build numactl
 | 
			
		||||
###############################################################
 | 
			
		||||
@ -186,49 +183,6 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
 | 
			
		||||
    && autoreconf -i && ./configure \
 | 
			
		||||
    && make -j ${MAX_JOBS:-$(nproc)}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
###############################################################
 | 
			
		||||
# Stage to build vllm - this stage builds and installs
 | 
			
		||||
# vllm, tensorizer and vllm-tgis-adapter and builds uv cache
 | 
			
		||||
# for transitive dependencies - eg. grpcio
 | 
			
		||||
###############################################################
 | 
			
		||||
 | 
			
		||||
FROM base-builder AS vllmcache-builder
 | 
			
		||||
 | 
			
		||||
COPY --from=torch-builder /tmp/control /dev/null
 | 
			
		||||
COPY --from=arrow-builder /tmp/control /dev/null
 | 
			
		||||
COPY --from=cv-builder /tmp/control /dev/null
 | 
			
		||||
COPY --from=numa-builder /tmp/control /dev/null
 | 
			
		||||
 | 
			
		||||
ARG VLLM_TARGET_DEVICE=cpu
 | 
			
		||||
ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
 | 
			
		||||
 | 
			
		||||
# this step installs vllm and populates uv cache
 | 
			
		||||
# with all the transitive dependencies
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    source /opt/rh/gcc-toolset-13/enable && \
 | 
			
		||||
    git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
 | 
			
		||||
    uv pip install maturin && \
 | 
			
		||||
    uv build --wheel --out-dir /hf_wheels/
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
 | 
			
		||||
    --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
 | 
			
		||||
    --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
 | 
			
		||||
    --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
 | 
			
		||||
    --mount=type=bind,src=.,dst=/src/,rw \
 | 
			
		||||
    source /opt/rh/gcc-toolset-13/enable && \
 | 
			
		||||
    uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
 | 
			
		||||
    sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
 | 
			
		||||
    uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
 | 
			
		||||
    make -C /numactl install && \
 | 
			
		||||
    # sentencepiece.pc is in some pkgconfig inside uv cache
 | 
			
		||||
    export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
 | 
			
		||||
    uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
 | 
			
		||||
    cd /src/ && \
 | 
			
		||||
    uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
 | 
			
		||||
    uv pip install /vllmwheel/*.whl
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
###############################################################
 | 
			
		||||
# Stage to build lapack
 | 
			
		||||
###############################################################
 | 
			
		||||
@ -258,7 +212,6 @@ ENV PATH=${VIRTUAL_ENV}/bin:$PATH
 | 
			
		||||
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
 | 
			
		||||
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
 | 
			
		||||
ENV UV_LINK_MODE=copy
 | 
			
		||||
ENV OMP_NUM_THREADS=16
 | 
			
		||||
 | 
			
		||||
# create artificial dependencies between stages for independent stages to build in parallel
 | 
			
		||||
COPY --from=torch-builder /tmp/control /dev/null
 | 
			
		||||
@ -267,13 +220,11 @@ COPY --from=cv-builder /tmp/control /dev/null
 | 
			
		||||
COPY --from=vllmcache-builder /tmp/control /dev/null
 | 
			
		||||
COPY --from=numa-builder /tmp/control /dev/null
 | 
			
		||||
COPY --from=lapack-builder /tmp/control /dev/null
 | 
			
		||||
COPY --from=openblas-builder /tmp/control /dev/null
 | 
			
		||||
 | 
			
		||||
# install gcc-11, python, openblas, numactl, lapack
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
 | 
			
		||||
    --mount=type=bind,from=lapack-builder,source=/lapack/,target=/lapack/,rw \
 | 
			
		||||
    --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
 | 
			
		||||
    rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
 | 
			
		||||
    microdnf install --nodocs -y \
 | 
			
		||||
    tar findutils openssl \
 | 
			
		||||
@ -285,9 +236,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    && microdnf clean all \
 | 
			
		||||
    && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
 | 
			
		||||
    && python -m pip install -U pip uv --no-cache \
 | 
			
		||||
    && curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
 | 
			
		||||
    && make -C /numactl install \
 | 
			
		||||
    && PREFIX=/usr/local make -C /openblas install \
 | 
			
		||||
    && uv pip install 'cmake<4' \
 | 
			
		||||
    && uv pip install cmake \
 | 
			
		||||
    && cmake --install /lapack/build \
 | 
			
		||||
    && uv pip uninstall cmake
 | 
			
		||||
 | 
			
		||||
@ -296,9 +247,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
 | 
			
		||||
    --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
 | 
			
		||||
    --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
 | 
			
		||||
    --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
 | 
			
		||||
    --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
 | 
			
		||||
    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
 | 
			
		||||
    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl
 | 
			
		||||
 | 
			
		||||
COPY ./ /workspace/vllm
 | 
			
		||||
WORKDIR /workspace/vllm
 | 
			
		||||
Some files were not shown because too many files have changed in this diff.