mirror of https://github.com/vllm-project/vllm.git
synced 2025-11-04 17:34:34 +08:00

Compare commits

1 commit: fix-precom ... reduce_sca

| Author | SHA1 | Date |
|---|---|---|
|  | 3679753af5 |  |
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import os
import sys
@@ -9,12 +8,12 @@ import zipfile
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


def print_top_10_largest_files(zip_file):
    """Print the top 10 largest files in the given zip file."""
    with zipfile.ZipFile(zip_file, "r") as z:
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
@@ -29,18 +28,14 @@ def check_wheel_size(directory):
                wheel_path = os.path.join(root, file_name)
                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                if wheel_size_mb > VLLM_MAX_SIZE_MB:
                    print(
                        f"Not allowed: Wheel {wheel_path} is larger "
                    print(f"Not allowed: Wheel {wheel_path} is larger "
                          f"({wheel_size_mb:.2f} MB) than the limit "
                        f"({VLLM_MAX_SIZE_MB} MB)."
                    )
                          f"({VLLM_MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(
                        f"Wheel {wheel_path} is within the allowed size "
                        f"({wheel_size_mb:.2f} MB)."
                    )
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb:.2f} MB).")
    return 0
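For orientation, a hedged usage sketch of the wheel-size gate changed above. The positional `dist/` argument and the environment override are assumptions inferred from `import sys`, `check_wheel_size(directory)`, and the `VLLM_MAX_SIZE_MB` lookup in the hunk; this diff does not show the argument parsing itself.

```bash
# Hypothetical invocation of the wheel-size check shown above.
# The dist/ argument is an assumption based on check_wheel_size(directory);
# VLLM_MAX_SIZE_MB overrides the 400 MiB default from the script.
VLLM_MAX_SIZE_MB=400 python .buildkite/check-wheel-size.py dist/
```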
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import os
@@ -23,5 +22,5 @@ with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
    )
        template.format(wheel=filename,
                        wheel_html_escaped=filename.replace("+", "%2B")))
@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:

@@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,4 +1,3 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:

@@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.335
  - name: "exact_match,flexible-extract"
    value: 0.323
limit: 1319
num_fewshot: 5

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:

@@ -1,12 +1,11 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.231
    value: 0.233
  - name: "exact_match,flexible-extract"
    value: 0.22
    value: 0.236
limit: 1000
num_fewshot: 5

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:

@@ -1,5 +1,4 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.30
  - name: "exact_match,flexible-extract"
    value: 0.465
limit: 1319
num_fewshot: 5

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.54
  - name: "exact_match,flexible-extract"
    value: 0.59
limit: 1319
num_fewshot: 5

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.47
  - name: "exact_match,flexible-extract"
    value: 0.64
limit: 1319
num_fewshot: 5

@@ -1,4 +1,3 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:

@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml
Meta-Llama-3-8B-QQQ.yaml

@@ -1,6 +1,10 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
Qwen2-1.5B-Instruct-FP8W8.yaml
Meta-Llama-3-8B-QQQ.yaml
@@ -1,44 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )


@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    rel_path = pytestconfig.getoption("--config-list-file")
    return config_dir / rel_path


@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")


def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        rel_path = metafunc.config.getoption("--config-list-file")
        config_list_file = Path(rel_path).resolve()
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip()
                for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)

.buildkite/lm-eval-harness/run-tests.sh (new file, 59 lines)
@@ -0,0 +1,59 @@
#!/bin/bash

usage() {
    echo``
    echo "Runs lm eval harness on GSM8k using vllm and compares to "
    echo "precomputed baseline (measured by HF transformers.)"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
    echo "  -t    - tensor parallel size"
    echo
}

SUCCESS=0

while getopts "c:t:" OPT; do
  case ${OPT} in
    c )
        CONFIG="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

# Parse list of configs.
IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
    LOCAL_SUCCESS=0

    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="

    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
    export LM_EVAL_TP_SIZE=$TP_SIZE
    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?

    if [[ $LOCAL_SUCCESS == 0 ]]; then
        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
    else
        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
    fi

    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))

done

if [ "${SUCCESS}" -eq "0" ]; then
    exit 0
else
    exit 1
fi
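A usage example for the runner above, taken directly from its usage() text and getopts loop (it iterates over the config list and sets LM_EVAL_TEST_DATA_FILE / LM_EVAL_TP_SIZE for each model):

```bash
# Run the small-model suite with tensor parallel size 1
bash .buildkite/lm-eval-harness/run-tests.sh -c configs/models-small.txt -t 1
```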
@@ -1,55 +1,64 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
* export LM_EVAL_TP_SIZE=4
* pytest -s test_lm_eval_correctness.py
"""

import os
from pathlib import Path

import lm_eval
import numpy as np
import numpy
import yaml

RTOL = 0.08
RTOL = 0.05
TEST_DATA_FILE = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
def launch_lm_eval(eval_config):
    trust_remote_code = eval_config.get('trust_remote_code', False)

    model_args = f"pretrained={eval_config['model_name']}," \
                 f"tensor_parallel_size={TP_SIZE}," \
                 f"add_bos_token=true," \
                 f"trust_remote_code={trust_remote_code}"
    )

    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
        batch_size="auto")

    return results


def test_lm_eval_correctness_param(config_filename, tp_size):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))
def test_lm_eval_correctness():
    eval_config = yaml.safe_load(
        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

    results = launch_lm_eval(eval_config, tp_size)
    # Launch eval requests.
    results = launch_lm_eval(eval_config)

    # Confirm scores match ground truth.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
                f"ground_truth={ground_truth} | measured={measured_value}"
            )
            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured_value}')
            success = success and numpy.isclose(
                ground_truth, measured_value, rtol=RTOL)

    # Assert at the end, print all scores even on failure for debugging.
    assert success
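Both invocation styles appear verbatim in the docstring diff above; collected here for reference. Which one applies depends on which side of this compare you are on: the flag-driven form relies on the conftest.py options, the environment-variable form is read directly by the test module.

```bash
# Flag-driven form (handled by conftest.py / pytest_generate_tests)
pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1

# Environment-variable form (read via os.environ in the test module)
export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
export LM_EVAL_TP_SIZE=4
pytest -s test_lm_eval_correctness.py
```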
@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do

### Visualizing the results

The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait till the benchmark finish running.
The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
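A hedged sketch of the flow described above. The script path and working directory below are assumptions; this compare view only shows the script name and, in the diff further down, that it reads `*.json` from the results folder and writes `benchmark_results.md` / `benchmark_results.json`.

```bash
# Assumed location; see the convert-results-json-to-markdown.py diff below
# for the actual inputs (*.json in results_folder) and outputs.
python3 .buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
```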
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json
import os
@@ -66,18 +65,18 @@ def read_markdown(file):


def results_to_json(latency, throughput, serving):
    return json.dumps(
        {
            "latency": latency.to_dict(),
            "throughput": throughput.to_dict(),
            "serving": serving.to_dict(),
        }
    )
    return json.dumps({
        'latency': latency.to_dict(),
        'throughput': throughput.to_dict(),
        'serving': serving.to_dict()
    })


if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file) as f:
            raw_result = json.loads(f.read())

@@ -121,8 +120,7 @@ if __name__ == "__main__":
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
                )
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to raw_result
@@ -155,27 +153,26 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    raw_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )
    raw_results_json = results_to_json(latency_results, throughput_results,
                                       serving_results)

    # remapping the key, for visualization purpose
    if not latency_results.empty:
        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
            columns=latency_column_mapping
        )
        latency_results = latency_results[list(
            latency_column_mapping.keys())].rename(
                columns=latency_column_mapping)
    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)
    if not throughput_results.empty:
        throughput_results = throughput_results[
            list(throughput_results_column_mapping.keys())
        ].rename(columns=throughput_results_column_mapping)
        throughput_results = throughput_results[list(
            throughput_results_column_mapping.keys())].rename(
                columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )
    processed_results_json = results_to_json(latency_results,
                                             throughput_results,
                                             serving_results)

    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
@@ -187,39 +184,38 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
        )
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")

    # get markdown tables
    latency_md_table = tabulate(
        latency_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    throughput_md_table = tabulate(
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
    )
    latency_md_table = tabulate(latency_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    serving_md_table = tabulate(serving_results,
                                headers='keys',
                                tablefmt='pipe',
                                showindex=False)
    throughput_md_table = tabulate(throughput_results,
                                   headers='keys',
                                   tablefmt='pipe',
                                   showindex=False)

    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:
        results = read_markdown(
            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )

        results = read_markdown("../.buildkite/nightly-benchmarks/" +
                                "performance-benchmarks-descriptions.md")
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            benchmarking_results_in_json_string=processed_results_json,
        )
            benchmarking_results_in_json_string=processed_results_json)
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / "benchmark_results.json", "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
            + serving_results.to_dict(orient="records")
        )

        results = latency_results.to_dict(
            orient='records') + throughput_results.to_dict(
                orient='records') + serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

@@ -15,12 +14,15 @@ def main(model, cachedir):

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )
        description="Download and save Hugging Face tokenizer")
    parser.add_argument("--model",
                        type=str,
                        required=True,
                        help="Name of the model")
    parser.add_argument("--cachedir",
                        type=str,
                        required=True,
                        help="Directory to save the tokenizer")

    args = parser.parse_args()
    main(args.model, args.cachedir)
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
@@ -12,33 +11,33 @@ from tabulate import tabulate

def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Parse command line arguments for summary-nightly-results script."
    )
    parser.add_argument(
        "--results-folder",
        description=
        'Parse command line arguments for summary-nightly-results script.')
    parser.add_argument('--results-folder',
                        type=str,
                        required=True,
        help="The folder where the results are stored.",
    )
    parser.add_argument(
        "--description", type=str, required=True, help="Description of the results."
    )
                        help='The folder where the results are stored.')
    parser.add_argument('--description',
                        type=str,
                        required=True,
                        help='Description of the results.')

    args = parser.parse_args()
    return args


def get_perf(df, method, model, metric):

    means = []

    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        target = df['Test name'].str.contains(model)
        target = target & df['Engine'].str.contains(method)
        target = target & df['Test name'].str.contains("qps_" + str(qps))
        filtered_df = df[target]

        if filtered_df.empty:
            means.append(0.0)
            means.append(0.)
        else:
            means.append(filtered_df[metric].values[0])

@@ -46,6 +45,7 @@ def get_perf(df, method, model, metric):


def get_perf_w_std(df, method, model, metric):

    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
@@ -60,8 +60,7 @@ def get_perf_w_std(df, method, model, metric):
    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)"
        )
            df, method, model, "Output Tput (tok/s)")
        mean = mean.tolist()
        std = None

@@ -81,17 +80,18 @@ def main(args):
    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)

    with open(args.description) as f:
        description = f.read()

    description = description.format(nightly_results_benchmarking_table=md_table)
    description = description.format(
        nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)


if __name__ == "__main__":
if __name__ == '__main__':
    args = parse_arguments()
    main(args)
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from lmdeploy.serve.openai.api_client import APIClient
@@ -426,7 +426,7 @@ main() {

  pip install -U transformers

  pip install -r requirements/dev.txt
  pip install -r requirements-dev.txt
  which genai-perf

  # check storage
@@ -10,24 +10,15 @@ set -x
set -o pipefail

check_gpus() {
  if command -v nvidia-smi; then
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  elif command -v amd-smi; then
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
  fi

  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  if command -v nvidia-smi; then
  declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
  elif command -v amd-smi; then
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
  fi
  echo "GPU type is $gpu_type"
}
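The two GPU probes used by `check_gpus()` above, runnable on their own when debugging a benchmarking agent (commands copied from the hunk; output interpretation is the same as in the function):

```bash
# NVIDIA: count GPUs and read the device name (second field only, as above)
nvidia-smi --list-gpus | wc -l
nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}'

# AMD: the amd-smi equivalents used on ROCm runners
amd-smi list | grep 'GPU' | wc -l
amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}'
```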
@@ -99,15 +90,9 @@ kill_gpu_processes() {


  # wait until GPU memory usage smaller than 1GB
  if command -v nvidia-smi; then
  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
  done
  elif command -v amd-smi; then
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
      sleep 1
    done
  fi

  # remove vllm config file
  rm -rf ~/.config/vllm

@@ -376,7 +361,7 @@ main() {
  # get the current IP address, required by benchmark_serving.py
  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
  # turn of the reporting of the status of each request, to clean up the terminal output
  export VLLM_LOGGING_LEVEL="WARNING"
  export VLLM_LOG_LEVEL="WARNING"

  # prepare for benchmarking
  cd benchmarks || exit 1
@@ -1,5 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import datetime
import json
@@ -35,8 +34,10 @@ serving_column_mapping = {
}

if __name__ == "__main__":

    # collect results
    for test_file in results_folder.glob("*.json"):

        with open(test_file) as f:
            raw_result = json.loads(f.read())

@@ -55,16 +56,17 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
        serving_results = serving_results[list(
            serving_column_mapping.keys())].rename(
                columns=serving_column_mapping)

    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table_with_headers = tabulate(serving_results,
                                             headers='keys',
                                             tablefmt='pipe',
                                             showindex=False)
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
    serving_md_table_lines = serving_md_table_with_headers.split('\n')
    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@@ -74,9 +76,10 @@ if __name__ == "__main__":
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")
        f.write('\n')

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")

        results = serving_results.to_dict(orient='records')
        f.write(json.dumps(results))
@@ -64,11 +64,9 @@
            "disable_log_requests": "",
            "tensor_parallel_size": 4,
            "swap_space": 16,
            "speculative_config": {
                "model": "turboderp/Qwama-0.5B-Instruct",
            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
            "num_speculative_tokens": 4,
                "draft_tensor_parallel_size": 1
            }
            "speculative_draft_tensor_parallel_size": 1
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
@@ -1,46 +0,0 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed

[tool.ruff]
line-length = 88

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.ruff.format]
docstring-code-format = true
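For reference, a hedged example of pointing ruff at a standalone config like the local pyproject.toml removed above. `--config` is a standard ruff option, but the exact CI wiring is not shown in this compare, so treat the commands as illustrative only.

```bash
# Lint and format .buildkite/ against the local (now-removed) config
ruff check --config .buildkite/pyproject.toml .buildkite/
ruff format --config .buildkite/pyproject.toml .buildkite/
```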
@@ -1,25 +1,23 @@
steps:
  - label: "Build wheel - CUDA 12.8"
    id: build-wheel-cuda-12-8
  - label: "Build wheel - CUDA 12.4"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.6"
    id: build-wheel-cuda-12-6
  - label: "Build wheel - CUDA 12.1"
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@@ -30,14 +28,13 @@ steps:

  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
    id: build-wheel-cuda-11-8
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
      - "bash .buildkite/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

@@ -47,49 +44,33 @@ steps:

  - label: "Build release image"
    depends_on: block-release-image-build
    id: build-release-image
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Annotate release workflow"
    depends_on:
      - build-release-image
      - build-wheel-cuda-12-8
      - build-wheel-cuda-12-6
      - build-wheel-cuda-11-8
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "bash .buildkite/scripts/annotate-release.sh"

  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: tpu_queue_postmerge
    commands:
      - "yes | docker system prune -a"
      - "git fetch --all"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
    plugins:
      - docker-login#v3.0.0:
          username: vllmbot
          username: vllm
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"

  - input: "Provide Release version here"
    id: input-release-version
    fields:
      - text: "What is the release version?"
        key: release-version
        key: "release-version"

  - block: "Build CPU release image"
    key: block-cpu-release-image-build
@@ -101,22 +82,7 @@ steps:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"

  - block: "Build Neuron release image"
    key: block-neuron-release-image-build
    depends_on: ~

  - label: "Build and publish Neuron release image"
    depends_on: block-neuron-release-image-build
    agents:
      queue: neuron-postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
    env:
      DOCKER_BUILDKIT: "1"

.buildkite/run-amd-test.sh (new executable file, 162 lines)
@@ -0,0 +1,162 @@
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script runs tests inside the corresponding ROCm docker container.
 | 
			
		||||
set -o pipefail
 | 
			
		||||
 | 
			
		||||
# Print ROCm version
 | 
			
		||||
echo "--- Confirming Clean Initial State"
 | 
			
		||||
while true; do
 | 
			
		||||
        sleep 3
 | 
			
		||||
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
 | 
			
		||||
                echo "GPUs state is \"clean\""
 | 
			
		||||
                break
 | 
			
		||||
        fi
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
echo "--- ROCm info"
 | 
			
		||||
rocminfo
 | 
			
		||||
 | 
			
		||||
# cleanup older docker images
 | 
			
		||||
cleanup_docker() {
 | 
			
		||||
  # Get Docker's root directory
 | 
			
		||||
  docker_root=$(docker info -f '{{.DockerRootDir}}')
 | 
			
		||||
  if [ -z "$docker_root" ]; then
 | 
			
		||||
    echo "Failed to determine Docker root directory."
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
  echo "Docker root directory: $docker_root"
 | 
			
		||||
  # Check disk usage of the filesystem where Docker's root directory is located
 | 
			
		||||
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
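  # e.g. df reports "Use%" as "83%"; awk/sed strip it down to the bare number 83 (value shown is illustrative)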
 | 
			
		||||
  # Define the threshold
 | 
			
		||||
  threshold=70
 | 
			
		||||
  if [ "$disk_usage" -gt "$threshold" ]; then
 | 
			
		||||
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
 | 
			
		||||
    # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
    docker image prune -f
 | 
			
		||||
    # Remove unused volumes / force the system prune for old images as well.
 | 
			
		||||
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
 | 
			
		||||
    echo "Docker images and volumes cleanup completed."
 | 
			
		||||
  else
 | 
			
		||||
    echo "Disk usage is below $threshold%. No cleanup needed."
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Call the cleanup docker function
 | 
			
		||||
cleanup_docker
 | 
			
		||||
 | 
			
		||||
echo "--- Resetting GPUs"
 | 
			
		||||
 | 
			
		||||
echo "reset" > /opt/amdgpu/etc/gpu_state
 | 
			
		||||
 | 
			
		||||
while true; do
 | 
			
		||||
        sleep 3
 | 
			
		||||
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
 | 
			
		||||
                echo "GPUs state is \"clean\""
 | 
			
		||||
                break
 | 
			
		||||
        fi
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
echo "--- Pulling container" 
 | 
			
		||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 | 
			
		||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 | 
			
		||||
docker pull "${image_name}"
 | 
			
		||||
 | 
			
		||||
remove_docker_container() {
 | 
			
		||||
   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 | 
			
		||||
}
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
echo "--- Running container"
 | 
			
		||||
 | 
			
		||||
HF_CACHE="$(realpath ~)/huggingface"
 | 
			
		||||
mkdir -p "${HF_CACHE}"
 | 
			
		||||
HF_MOUNT="/root/.cache/huggingface"
 | 
			
		||||
 | 
			
		||||
commands=$@
 | 
			
		||||
echo "Commands:$commands"
 | 
			
		||||
#ignore certain kernels tests
 | 
			
		||||
if [[ $commands == *" kernels "* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/test_attention.py \
 | 
			
		||||
  --ignore=kernels/test_attention_selector.py \
 | 
			
		||||
  --ignore=kernels/test_blocksparse_attention.py \
 | 
			
		||||
  --ignore=kernels/test_causal_conv1d.py \
 | 
			
		||||
  --ignore=kernels/test_cutlass.py \
 | 
			
		||||
  --ignore=kernels/test_encoder_decoder_attn.py \
 | 
			
		||||
  --ignore=kernels/test_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/test_flashinfer.py \
 | 
			
		||||
  --ignore=kernels/test_int8_quant.py \
 | 
			
		||||
  --ignore=kernels/test_machete_gemm.py \
 | 
			
		||||
  --ignore=kernels/test_mamba_ssm.py \
 | 
			
		||||
  --ignore=kernels/test_marlin_gemm.py \
 | 
			
		||||
  --ignore=kernels/test_moe.py \
 | 
			
		||||
  --ignore=kernels/test_prefix_prefill.py \
 | 
			
		||||
  --ignore=kernels/test_rand.py \
 | 
			
		||||
  --ignore=kernels/test_sampler.py \
 | 
			
		||||
  --ignore=kernels/test_cascade_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/test_mamba_mixer2.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#ignore certain Entrypoints tests
 | 
			
		||||
if [[ $commands == *" entrypoints/openai "* ]]; then
 | 
			
		||||
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
 | 
			
		||||
  --ignore=entrypoints/openai/test_accuracy.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_audio.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_encoder_decoder.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_embedding.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_oot_registration.py "}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
PARALLEL_JOB_COUNT=8
 | 
			
		||||
# Check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
 | 
			
		||||
if [[ $commands == *"--shard-id="* ]]; then
 | 
			
		||||
  # assign job count as the number of shards used   
 | 
			
		||||
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
 | 
			
		||||
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
 | 
			
		||||
    # assign shard-id for each shard
 | 
			
		||||
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
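    # e.g. "--num-shards= --shard-id= " becomes "--num-shards=8 --shard-id=3 " for GPU 3 (illustrative values)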
 | 
			
		||||
    echo "Shard ${GPU} commands:$commands_gpu"
 | 
			
		||||
    docker run \
 | 
			
		||||
        --device /dev/kfd --device /dev/dri \
 | 
			
		||||
        --network host \
 | 
			
		||||
        --shm-size=16gb \
 | 
			
		||||
        --rm \
 | 
			
		||||
        -e HIP_VISIBLE_DEVICES="${GPU}" \
 | 
			
		||||
        -e HF_TOKEN \
 | 
			
		||||
        -e AWS_ACCESS_KEY_ID \
 | 
			
		||||
        -e AWS_SECRET_ACCESS_KEY \
 | 
			
		||||
        -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
        -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
        --name "${container_name}_${GPU}" \
 | 
			
		||||
        "${image_name}" \
 | 
			
		||||
        /bin/bash -c "${commands_gpu}" \
 | 
			
		||||
        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
 | 
			
		||||
    PIDS+=($!)
 | 
			
		||||
  done
 | 
			
		||||
  #wait for all processes to finish and collect exit codes
 | 
			
		||||
  for pid in "${PIDS[@]}"; do
 | 
			
		||||
    wait "${pid}"
 | 
			
		||||
    STATUS+=($?)
 | 
			
		||||
  done
 | 
			
		||||
  for st in "${STATUS[@]}"; do
 | 
			
		||||
    if [[ ${st} -ne 0 ]]; then
 | 
			
		||||
      echo "One of the processes failed with $st"
 | 
			
		||||
      exit "${st}"
 | 
			
		||||
    fi
 | 
			
		||||
  done
 | 
			
		||||
else
 | 
			
		||||
  docker run \
 | 
			
		||||
          --device /dev/kfd --device /dev/dri \
 | 
			
		||||
          --network host \
 | 
			
		||||
          --shm-size=16gb \
 | 
			
		||||
          --rm \
 | 
			
		||||
          -e HIP_VISIBLE_DEVICES=0 \
 | 
			
		||||
          -e HF_TOKEN \
 | 
			
		||||
          -e AWS_ACCESS_KEY_ID \
 | 
			
		||||
          -e AWS_SECRET_ACCESS_KEY \
 | 
			
		||||
          -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
          -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
          --name "${container_name}" \
 | 
			
		||||
          "${image_name}" \
 | 
			
		||||
          /bin/bash -c "${commands}"
 | 
			
		||||
fi
 | 
			
		||||
@ -5,8 +5,8 @@
 | 
			
		||||
set -ex
 | 
			
		||||
set -o pipefail
 | 
			
		||||
 | 
			
		||||
# cd 2 levels into the working directory
 | 
			
		||||
cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 | 
			
		||||
# cd into parent directory of this file
 | 
			
		||||
cd "$(dirname "${BASH_SOURCE[0]}")/.."
 | 
			
		||||
 | 
			
		||||
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 | 
			
		||||
 | 
			
		||||
@ -10,4 +10,5 @@ trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
docker build -t cpu-test -f docker/Dockerfile.s390x .
 | 
			
		||||
docker build -t cpu-test -f Dockerfile.ppc64le .
 | 
			
		||||
 | 
			
		||||
							
								
								
									
.buildkite/run-cpu-test.sh (Normal file, 88 lines)
@ -0,0 +1,88 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the CPU docker image and runs the offline inference inside the container.
 | 
			
		||||
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Allow binding to different cores
 | 
			
		||||
CORE_RANGE=${CORE_RANGE:-48-95}
 | 
			
		||||
NUMA_NODE=${NUMA_NODE:-1}
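# Example override (illustrative values): CORE_RANGE=0-47 NUMA_NODE=0 bash .buildkite/run-cpu-test.sh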
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
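# The second image is built with VLLM_CPU_DISABLE_AVX512=true so the AVX2 code path is exercised as well.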
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() { set -e; docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Run the image, setting --shm-size=4g for tensor parallelism.
 | 
			
		||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
 | 
			
		||||
 --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 | 
			
		||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
 | 
			
		||||
 --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 | 
			
		||||
 | 
			
		||||
function cpu_tests() {
 | 
			
		||||
  set -e
 | 
			
		||||
  export NUMA_NODE=$2
 | 
			
		||||
 | 
			
		||||
  # offline inference
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run basic model test
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip install -r vllm/requirements-test.txt
 | 
			
		||||
    pytest -v -s tests/models/decoder_only/language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/embedding/language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/encoder_decoder/language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 | 
			
		||||
 | 
			
		||||
  # Run compressed-tensor test
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 | 
			
		||||
 | 
			
		||||
  # Run AWQ test
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/quantization/test_ipex_quant.py"
 | 
			
		||||
 | 
			
		||||
  # Run chunked-prefill and prefix-cache test
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v -k cpu_model \
 | 
			
		||||
    tests/basic_correctness/test_chunked_prefill.py"  
 | 
			
		||||
 | 
			
		||||
  # online serving
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    export VLLM_CPU_KVCACHE_SPACE=10 
 | 
			
		||||
    export VLLM_CPU_OMP_THREADS_BIND=$1
 | 
			
		||||
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
 | 
			
		||||
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 | 
			
		||||
    python3 benchmarks/benchmark_serving.py \
 | 
			
		||||
      --backend vllm \
 | 
			
		||||
      --dataset-name random \
 | 
			
		||||
      --model facebook/opt-125m \
 | 
			
		||||
      --num-prompts 20 \
 | 
			
		||||
      --endpoint /v1/completions \
 | 
			
		||||
      --tokenizer facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run multi-lora tests
 | 
			
		||||
  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/lora/test_qwen2vl.py"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# All CPU tests are expected to finish in less than 40 minutes.
 | 
			
		||||
export -f cpu_tests
 | 
			
		||||
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
 | 
			
		||||
@ -9,13 +9,11 @@ python3 use_existing_torch.py
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
DOCKER_BUILDKIT=1 docker build . \
 | 
			
		||||
  --file docker/Dockerfile \
 | 
			
		||||
  --target vllm-openai \
 | 
			
		||||
  --platform "linux/arm64" \
 | 
			
		||||
  -t gh200-test \
 | 
			
		||||
  --build-arg max_jobs=66 \
 | 
			
		||||
  --build-arg nvcc_threads=2 \
 | 
			
		||||
  --build-arg RUN_WHEEL_CHECK=false \
 | 
			
		||||
  --build-arg torch_cuda_arch_list="9.0+PTX" \
 | 
			
		||||
  --build-arg vllm_fa_cmake_gpu_arches="90-real"
 | 
			
		||||
 | 
			
		||||
@ -25,6 +23,6 @@ trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Run the image and test offline inference
 | 
			
		||||
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
 | 
			
		||||
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 | 
			
		||||
'
 | 
			
		||||
@ -5,22 +5,20 @@
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
 | 
			
		||||
docker build -t hpu-test-env -f Dockerfile.hpu .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
# certain versions of HPU software stack have a bug that can
 | 
			
		||||
# override the exit code of the script, so we need to use
 | 
			
		||||
# separate remove_docker_containers and remove_docker_containers_and_exit
 | 
			
		||||
# separate remove_docker_container and remove_docker_container_and_exit
 | 
			
		||||
# functions, while other platforms only need one remove_docker_container
 | 
			
		||||
# function.
 | 
			
		||||
EXITCODE=1
 | 
			
		||||
remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
 | 
			
		||||
remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
 | 
			
		||||
trap remove_docker_containers_and_exit EXIT
 | 
			
		||||
remove_docker_containers
 | 
			
		||||
remove_docker_container() { docker rm -f hpu-test || true; }
 | 
			
		||||
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
 | 
			
		||||
trap remove_docker_container_and_exit EXIT
 | 
			
		||||
remove_docker_container
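# EXITCODE is reassigned from $? after the docker run commands below, so the EXIT trap propagates the real test status.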
 | 
			
		||||
 | 
			
		||||
# Run the image and launch offline inference
 | 
			
		||||
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 | 
			
		||||
docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
 | 
			
		||||
 | 
			
		||||
EXITCODE=$?
 | 
			
		||||
@ -3,7 +3,7 @@
 | 
			
		||||
set -euox pipefail
 | 
			
		||||
 | 
			
		||||
if [[ $# -lt 4 ]]; then
 | 
			
		||||
    echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
 | 
			
		||||
    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
@ -11,14 +11,13 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 | 
			
		||||
HF_CACHE="$(realpath ~)/huggingface"
 | 
			
		||||
mkdir -p "${HF_CACHE}"
 | 
			
		||||
HF_MOUNT="/root/.cache/huggingface"
 | 
			
		||||
HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 | 
			
		||||
 | 
			
		||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 | 
			
		||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 | 
			
		||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
 | 
			
		||||
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
 | 
			
		||||
 | 
			
		||||
# Prune old images and containers to save disk space, and only once a day
 | 
			
		||||
# by using a timestamp file in tmp.
 | 
			
		||||
@ -36,7 +35,7 @@ else
 | 
			
		||||
    date "+%s" > /tmp/neuron-docker-build-timestamp
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
docker build -t "${image_name}" -f docker/Dockerfile.neuron .
 | 
			
		||||
docker build -t "${image_name}" -f Dockerfile.neuron .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() {
 | 
			
		||||
@ -45,19 +44,11 @@ remove_docker_container() {
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Run the image
 | 
			
		||||
docker run --rm -it --device=/dev/neuron0 --network bridge \
 | 
			
		||||
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
 | 
			
		||||
       -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
       -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
       -e "HF_TOKEN=${HF_TOKEN}" \
 | 
			
		||||
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
 | 
			
		||||
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
 | 
			
		||||
       --name "${container_name}" \
 | 
			
		||||
       ${image_name} \
 | 
			
		||||
       /bin/bash -c "
 | 
			
		||||
            python3 /workspace/vllm/examples/offline_inference/neuron.py;
 | 
			
		||||
            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
 | 
			
		||||
            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
 | 
			
		||||
                echo 'Running test file: '$f;
 | 
			
		||||
                python3 -m pytest \$f -v --capture=tee-sys;
 | 
			
		||||
            done
 | 
			
		||||
       "
 | 
			
		||||
       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
 | 
			
		||||
							
								
								
									
.buildkite/run-openvino-test.sh (Executable file, 16 lines)
@ -0,0 +1,16 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the OpenVINO docker image and runs the offline inference inside the container.
 | 
			
		||||
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
docker build -t openvino-test -f Dockerfile.openvino .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() { docker rm -f openvino-test || true; }
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Run the image and launch offline inference
 | 
			
		||||
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m
 | 
			
		||||
							
								
								
									
.buildkite/run-tpu-test.sh (Executable file, 26 lines)
@ -0,0 +1,26 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -e
 | 
			
		||||
 | 
			
		||||
# Build the docker image.
 | 
			
		||||
docker build -f Dockerfile.tpu -t vllm-tpu .
 | 
			
		||||
 | 
			
		||||
# Set up cleanup.
 | 
			
		||||
remove_docker_container() { docker rm -f tpu-test || true; }
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# For HF_TOKEN.
 | 
			
		||||
source /etc/environment
 | 
			
		||||
# Run a simple end-to-end example.
 | 
			
		||||
docker run --privileged --net host --shm-size=16G -it \
 | 
			
		||||
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
 | 
			
		||||
    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
 | 
			
		||||
    && python3 -m pip install pytest \
 | 
			
		||||
    && python3 -m pip install lm_eval[api]==0.4.4 \
 | 
			
		||||
    && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
 | 
			
		||||
    && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 | 
			
		||||
    && python3 /workspace/vllm/tests/tpu/test_compilation.py \
 | 
			
		||||
    && python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
 | 
			
		||||
    && python3 /workspace/vllm/examples/offline_inference/tpu.py"
 | 
			
		||||
							
								
								
									
.buildkite/run-xpu-test.sh (Normal file, 19 lines)
@ -0,0 +1,19 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the XPU docker image and runs the offline inference inside the container.
 | 
			
		||||
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
docker build -t xpu-test -f Dockerfile.xpu .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() { docker rm -f xpu-test || true; }
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Run the image and test offline inference and tensor parallelism
 | 
			
		||||
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 | 
			
		||||
'
 | 
			
		||||
@ -1,31 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Get release version and strip leading 'v' if present
 | 
			
		||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
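# e.g. a stored "release-version" of "v0.6.3" becomes "0.6.3" (version shown is illustrative)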
 | 
			
		||||
 | 
			
		||||
if [ -z "$RELEASE_VERSION" ]; then
 | 
			
		||||
  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 | 
			
		||||
To download the wheel:
 | 
			
		||||
\`\`\`
 | 
			
		||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 | 
			
		||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
 | 
			
		||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
 | 
			
		||||
\`\`\`
 | 
			
		||||
 | 
			
		||||
To download and upload the image:
 | 
			
		||||
 | 
			
		||||
\`\`\`
 | 
			
		||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
 | 
			
		||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
 | 
			
		||||
docker tag vllm/vllm-openai vllm/vllm-openai:latest
 | 
			
		||||
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
 | 
			
		||||
docker push vllm/vllm-openai:latest
 | 
			
		||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}
 | 
			
		||||
\`\`\`
 | 
			
		||||
EOF 
 | 
			
		||||
@ -1,17 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
# Usage: ./ci_clean_log.sh ci.log
 | 
			
		||||
# This script strips timestamps and color codes from CI log files.
 | 
			
		||||
 | 
			
		||||
# Check if argument is given
 | 
			
		||||
if [ $# -lt 1 ]; then
 | 
			
		||||
    echo "Usage: $0 ci.log"
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
INPUT_FILE="$1"
 | 
			
		||||
 | 
			
		||||
# Strip timestamps
 | 
			
		||||
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
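# e.g. "[2024-05-01T12:34:56Z] Building wheel..." becomes "Building wheel..." (sample line for illustration)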
 | 
			
		||||
 | 
			
		||||
# Strip colorization
 | 
			
		||||
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
 | 
			
		||||
@ -1,245 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script runs tests inside the corresponding ROCm docker container.
 | 
			
		||||
set -o pipefail
 | 
			
		||||
 | 
			
		||||
# Export Python path
 | 
			
		||||
export PYTHONPATH=".."
 | 
			
		||||
 | 
			
		||||
# Print ROCm version
 | 
			
		||||
echo "--- Confirming Clean Initial State"
 | 
			
		||||
while true; do
 | 
			
		||||
        sleep 3
 | 
			
		||||
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
 | 
			
		||||
                echo "GPUs state is \"clean\""
 | 
			
		||||
                break
 | 
			
		||||
        fi
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
echo "--- ROCm info"
 | 
			
		||||
rocminfo
 | 
			
		||||
 | 
			
		||||
# cleanup older docker images
 | 
			
		||||
cleanup_docker() {
 | 
			
		||||
  # Get Docker's root directory
 | 
			
		||||
  docker_root=$(docker info -f '{{.DockerRootDir}}')
 | 
			
		||||
  if [ -z "$docker_root" ]; then
 | 
			
		||||
    echo "Failed to determine Docker root directory."
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
  echo "Docker root directory: $docker_root"
 | 
			
		||||
  # Check disk usage of the filesystem where Docker's root directory is located
 | 
			
		||||
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
 | 
			
		||||
  # Define the threshold
 | 
			
		||||
  threshold=70
 | 
			
		||||
  if [ "$disk_usage" -gt "$threshold" ]; then
 | 
			
		||||
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
 | 
			
		||||
    # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
    docker image prune -f
 | 
			
		||||
    # Remove unused volumes / force the system prune for old images as well.
 | 
			
		||||
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
 | 
			
		||||
    echo "Docker images and volumes cleanup completed."
 | 
			
		||||
  else
 | 
			
		||||
    echo "Disk usage is below $threshold%. No cleanup needed."
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Call the cleanup docker function
 | 
			
		||||
cleanup_docker
 | 
			
		||||
 | 
			
		||||
echo "--- Resetting GPUs"
 | 
			
		||||
 | 
			
		||||
echo "reset" > /opt/amdgpu/etc/gpu_state
 | 
			
		||||
 | 
			
		||||
while true; do
 | 
			
		||||
        sleep 3
 | 
			
		||||
        if grep -q clean /opt/amdgpu/etc/gpu_state; then
 | 
			
		||||
                echo "GPUs state is \"clean\""
 | 
			
		||||
                break
 | 
			
		||||
        fi
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
echo "--- Pulling container" 
 | 
			
		||||
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
 | 
			
		||||
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 | 
			
		||||
docker pull "${image_name}"
 | 
			
		||||
 | 
			
		||||
remove_docker_container() {
 | 
			
		||||
   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
 | 
			
		||||
}
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
echo "--- Running container"
 | 
			
		||||
 | 
			
		||||
HF_CACHE="$(realpath ~)/huggingface"
 | 
			
		||||
mkdir -p "${HF_CACHE}"
 | 
			
		||||
HF_MOUNT="/root/.cache/huggingface"
 | 
			
		||||
 | 
			
		||||
commands=$@
 | 
			
		||||
echo "Commands:$commands"
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
 | 
			
		||||
  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *"pytest -v -s lora"* ]]; then
 | 
			
		||||
  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#ignore certain kernels tests
 | 
			
		||||
if [[ $commands == *" kernels/core"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/core/test_fused_quant_layernorm.py \
 | 
			
		||||
  --ignore=kernels/core/test_permute_cols.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/attention"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/attention/stest_attention_selector.py \
 | 
			
		||||
  --ignore=kernels/attention/test_blocksparse_attention.py \
 | 
			
		||||
  --ignore=kernels/attention/test_encoder_decoder_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_attention_selector.py \
 | 
			
		||||
  --ignore=kernels/attention/test_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_flashinfer.py \
 | 
			
		||||
  --ignore=kernels/attention/test_prefix_prefill.py \
 | 
			
		||||
  --ignore=kernels/attention/test_cascade_flash_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_mha_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_lightning_attn.py \
 | 
			
		||||
  --ignore=kernels/attention/test_attention.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/quantization"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/quantization/test_int8_quant.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_aqlm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_machete_mm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_block_fp8.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_block_int8.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_marlin_gemm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
 | 
			
		||||
  --ignore=kernels/quantization/test_int8_kernel.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/mamba"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/mamba/test_mamba_mixer2.py \
 | 
			
		||||
  --ignore=kernels/mamba/test_causal_conv1d.py \
 | 
			
		||||
  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [[ $commands == *" kernels/moe"* ]]; then
 | 
			
		||||
  commands="${commands} \
 | 
			
		||||
  --ignore=kernels/moe/test_moe.py \
 | 
			
		||||
  --ignore=kernels/moe/test_cutlass_moe.py \
 | 
			
		||||
  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#ignore certain Entrypoints/openai tests
 | 
			
		||||
if [[ $commands == *" entrypoints/openai "* ]]; then
 | 
			
		||||
  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
 | 
			
		||||
  --ignore=entrypoints/openai/test_audio.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_shutdown.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_completion.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_sleep.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_models.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_lora_adapters.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_root_path.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_tokenization.py \
 | 
			
		||||
  --ignore=entrypoints/openai/test_prompt_validation.py "}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#ignore certain Entrypoints/llm tests
 | 
			
		||||
if [[ $commands == *" entrypoints/llm "* ]]; then
 | 
			
		||||
  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
 | 
			
		||||
  --ignore=entrypoints/llm/test_chat.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_accuracy.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_init.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
 | 
			
		||||
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#Obsolete currently
 | 
			
		||||
##ignore certain Entrypoints/llm tests
 | 
			
		||||
#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
 | 
			
		||||
#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
 | 
			
		||||
#fi
 | 
			
		||||
 | 
			
		||||
# --ignore=entrypoints/openai/test_encoder_decoder.py \
 | 
			
		||||
# --ignore=entrypoints/openai/test_embedding.py \
 | 
			
		||||
# --ignore=entrypoints/openai/test_oot_registration.py
 | 
			
		||||
# --ignore=entrypoints/openai/test_accuracy.py \
 | 
			
		||||
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
PARALLEL_JOB_COUNT=8
 | 
			
		||||
MYPYTHONPATH=".."
 | 
			
		||||
 | 
			
		||||
# Check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs.
 | 
			
		||||
if [[ $commands == *"--shard-id="* ]]; then
 | 
			
		||||
  # assign job count as the number of shards used   
 | 
			
		||||
  commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
 | 
			
		||||
  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
 | 
			
		||||
    # assign shard-id for each shard
 | 
			
		||||
    commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
 | 
			
		||||
    echo "Shard ${GPU} commands:$commands_gpu"
 | 
			
		||||
    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
 | 
			
		||||
    docker run \
 | 
			
		||||
        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
 | 
			
		||||
        --network=host \
 | 
			
		||||
        --shm-size=16gb \
 | 
			
		||||
        --rm \
 | 
			
		||||
        -e HIP_VISIBLE_DEVICES="${GPU}" \
 | 
			
		||||
        -e HF_TOKEN \
 | 
			
		||||
        -e AWS_ACCESS_KEY_ID \
 | 
			
		||||
        -e AWS_SECRET_ACCESS_KEY \
 | 
			
		||||
        -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
        -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
        -e "PYTHONPATH=${MYPYTHONPATH}" \
 | 
			
		||||
        --name "${container_name}_${GPU}" \
 | 
			
		||||
        "${image_name}" \
 | 
			
		||||
        /bin/bash -c "${commands_gpu}" \
 | 
			
		||||
        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
 | 
			
		||||
    PIDS+=($!)
 | 
			
		||||
  done
 | 
			
		||||
  #wait for all processes to finish and collect exit codes
 | 
			
		||||
  for pid in "${PIDS[@]}"; do
 | 
			
		||||
    wait "${pid}"
 | 
			
		||||
    STATUS+=($?)
 | 
			
		||||
  done
 | 
			
		||||
  for st in "${STATUS[@]}"; do
 | 
			
		||||
    if [[ ${st} -ne 0 ]]; then
 | 
			
		||||
      echo "One of the processes failed with $st"
 | 
			
		||||
      exit "${st}"
 | 
			
		||||
    fi
 | 
			
		||||
  done
 | 
			
		||||
else
 | 
			
		||||
  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
 | 
			
		||||
  docker run \
 | 
			
		||||
          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
 | 
			
		||||
          --network=host \
 | 
			
		||||
          --shm-size=16gb \
 | 
			
		||||
          --rm \
 | 
			
		||||
          -e HIP_VISIBLE_DEVICES=0 \
 | 
			
		||||
          -e HF_TOKEN \
 | 
			
		||||
          -e AWS_ACCESS_KEY_ID \
 | 
			
		||||
          -e AWS_SECRET_ACCESS_KEY \
 | 
			
		||||
          -v "${HF_CACHE}:${HF_MOUNT}" \
 | 
			
		||||
          -e "HF_HOME=${HF_MOUNT}" \
 | 
			
		||||
          -e "PYTHONPATH=${MYPYTHONPATH}" \
 | 
			
		||||
          --name "${container_name}" \
 | 
			
		||||
          "${image_name}" \
 | 
			
		||||
          /bin/bash -c "${commands}"
 | 
			
		||||
fi
 | 
			
		||||
@ -1,49 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the CPU docker image and runs the offline inference inside the container.
 | 
			
		||||
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() {
 | 
			
		||||
  if [[ -n "$container_id" ]]; then
 | 
			
		||||
      podman stop --all -t0
 | 
			
		||||
      podman rm -f "$container_id" || true
 | 
			
		||||
  fi
 | 
			
		||||
  podman system prune -f
 | 
			
		||||
}
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
 | 
			
		||||
 | 
			
		||||
# Run the image
 | 
			
		||||
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
 | 
			
		||||
 | 
			
		||||
function cpu_tests() {
 | 
			
		||||
 | 
			
		||||
  # offline inference
 | 
			
		||||
  podman exec -it "$container_id" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run basic model test
 | 
			
		||||
  podman exec -it "$container_id" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
 | 
			
		||||
    pip install sentence-transformers datamodel_code_generator
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
 | 
			
		||||
    pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
 | 
			
		||||
    pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
 | 
			
		||||
    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# All CPU tests are expected to finish in less than 40 minutes.
 | 
			
		||||
 | 
			
		||||
export container_id
 | 
			
		||||
export -f cpu_tests
 | 
			
		||||
timeout 40m bash -c cpu_tests
 | 
			
		||||
 | 
			
		||||
@ -1,101 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the CPU docker image and runs the offline inference inside the container.
 | 
			
		||||
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
# Allow binding to different cores
 | 
			
		||||
CORE_RANGE=${CORE_RANGE:-48-95}
 | 
			
		||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
 | 
			
		||||
NUMA_NODE=${NUMA_NODE:-1}
 | 
			
		||||
 | 
			
		||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    set -e; 
 | 
			
		||||
    docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; 
 | 
			
		||||
}
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
 | 
			
		||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 | 
			
		||||
 | 
			
		||||
# Run the image, setting --shm-size=4g for tensor parallelism.
 | 
			
		||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
 | 
			
		||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 | 
			
		||||
 | 
			
		||||
function cpu_tests() {
 | 
			
		||||
  set -e
 | 
			
		||||
  export NUMA_NODE=$2
 | 
			
		||||
 | 
			
		||||
  # list packages
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip list"
 | 
			
		||||
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pip list"
 | 
			
		||||
 | 
			
		||||
  # offline inference
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run basic model test
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
 | 
			
		||||
    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/language/generation -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/language/pooling -m cpu_model
 | 
			
		||||
    pytest -v -s tests/models/multimodal/generation \
 | 
			
		||||
                --ignore=tests/models/multimodal/generation/test_mllama.py \
 | 
			
		||||
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
 | 
			
		||||
                -m cpu_model"
 | 
			
		||||
 | 
			
		||||
  # Run compressed-tensor test
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
 | 
			
		||||
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 | 
			
		||||
 | 
			
		||||
  # Run AWQ test
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    VLLM_USE_V1=0 pytest -s -v \
 | 
			
		||||
    tests/quantization/test_ipex_quant.py"
 | 
			
		||||
 | 
			
		||||
  # Run chunked-prefill and prefix-cache test
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v -k cpu_model \
 | 
			
		||||
    tests/basic_correctness/test_chunked_prefill.py"  
 | 
			
		||||
 | 
			
		||||
  # online serving
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & 
 | 
			
		||||
    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 | 
			
		||||
    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
 | 
			
		||||
      --backend vllm \
 | 
			
		||||
      --dataset-name random \
 | 
			
		||||
      --model facebook/opt-125m \
 | 
			
		||||
      --num-prompts 20 \
 | 
			
		||||
      --endpoint /v1/completions \
 | 
			
		||||
      --tokenizer facebook/opt-125m"
 | 
			
		||||
 | 
			
		||||
  # Run multi-lora tests
 | 
			
		||||
  docker exec cpu-test-"$NUMA_NODE" bash -c "
 | 
			
		||||
    set -e
 | 
			
		||||
    pytest -s -v \
 | 
			
		||||
    tests/lora/test_qwen2vl.py"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# All CPU tests are expected to finish in less than 40 minutes.
 | 
			
		||||
export -f cpu_tests
 | 
			
		||||
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
 | 
			
		||||
@ -1,185 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -xu
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    docker rm -f tpu-test || true; 
 | 
			
		||||
    docker rm -f vllm-tpu || true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Remove the container that might not be cleaned up in the previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Build the docker image.
 | 
			
		||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
 | 
			
		||||
 | 
			
		||||
# Set up cleanup.
 | 
			
		||||
cleanup_docker() {
 | 
			
		||||
  # Get Docker's root directory
 | 
			
		||||
  docker_root=$(docker info -f '{{.DockerRootDir}}')
 | 
			
		||||
  if [ -z "$docker_root" ]; then
 | 
			
		||||
    echo "Failed to determine Docker root directory."
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
  echo "Docker root directory: $docker_root"
 | 
			
		||||
  # Check disk usage of the filesystem where Docker's root directory is located
 | 
			
		||||
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
 | 
			
		||||
  # Define the threshold
 | 
			
		||||
  threshold=70
 | 
			
		||||
  if [ "$disk_usage" -gt "$threshold" ]; then
 | 
			
		||||
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
 | 
			
		||||
    # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
    docker image prune -f
 | 
			
		||||
    # Remove unused volumes / force the system prune for old images as well.
 | 
			
		||||
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
 | 
			
		||||
    echo "Docker images and volumes cleanup completed."
 | 
			
		||||
  else
 | 
			
		||||
    echo "Disk usage is below $threshold%. No cleanup needed."
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
cleanup_docker
 | 
			
		||||
 | 
			
		||||
# For HF_TOKEN.
 | 
			
		||||
source /etc/environment
 | 
			
		||||
 | 
			
		||||
docker run --privileged --net host --shm-size=16G -it \
 | 
			
		||||
    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
 | 
			
		||||
    vllm-tpu /bin/bash -c '
 | 
			
		||||
set -e # Exit immediately if a command exits with a non-zero status.
 | 
			
		||||
set -u # Treat unset variables as an error.
 | 
			
		||||
 | 
			
		||||
echo "--- Starting script inside Docker container ---"
 | 
			
		||||
 | 
			
		||||
# Create results directory
 | 
			
		||||
RESULTS_DIR=$(mktemp -d)
 | 
			
		||||
# If mktemp fails, set -e will cause the script to exit.
 | 
			
		||||
echo "Results will be stored in: $RESULTS_DIR"
 | 
			
		||||
 | 
			
		||||
# Install dependencies
 | 
			
		||||
echo "--- Installing Python dependencies ---"
 | 
			
		||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
 | 
			
		||||
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
 | 
			
		||||
    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
 | 
			
		||||
echo "--- Python dependencies installed ---"
 | 
			
		||||
export VLLM_USE_V1=1
 | 
			
		||||
export VLLM_XLA_CHECK_RECOMPILATION=1
 | 
			
		||||
export VLLM_XLA_CACHE_PATH=
 | 
			
		||||
echo "Using VLLM V1"
 | 
			
		||||
 | 
			
		||||
echo "--- Hardware Information ---"
 | 
			
		||||
tpu-info
 | 
			
		||||
echo "--- Starting Tests ---"
 | 
			
		||||
set +e
 | 
			
		||||
overall_script_exit_code=0
 | 
			
		||||
 | 
			
		||||
# --- Test Definitions ---
 | 
			
		||||
# If a test fails, this function will print logs and will not cause the main script to exit.
 | 
			
		||||
run_test() {
 | 
			
		||||
    local test_num=$1
 | 
			
		||||
    local test_name=$2
 | 
			
		||||
    local test_command=$3
 | 
			
		||||
    local log_file="$RESULTS_DIR/test_${test_num}.log"
 | 
			
		||||
    local actual_exit_code
 | 
			
		||||
 | 
			
		||||
    echo "--- TEST_$test_num: Running $test_name ---"
 | 
			
		||||
    
 | 
			
		||||
    # Execute the test command.
 | 
			
		||||
    eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
 | 
			
		||||
    actual_exit_code=$?
 | 
			
		||||
 | 
			
		||||
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
 | 
			
		||||
    echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
 | 
			
		||||
 | 
			
		||||
    if [ "$actual_exit_code" -ne 0 ]; then
 | 
			
		||||
        echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
 | 
			
		||||
        echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
 | 
			
		||||
        if [ -f "$log_file" ]; then
 | 
			
		||||
            cat "$log_file" >&2
 | 
			
		||||
        else
 | 
			
		||||
            echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
 | 
			
		||||
        fi
 | 
			
		||||
        echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
 | 
			
		||||
        return "$actual_exit_code" # Return the failure code
 | 
			
		||||
    else
 | 
			
		||||
        echo "TEST_$test_num ($test_name) PASSED."
 | 
			
		||||
        return 0 # Return success
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# Helper function to call run_test and update the overall script exit code
 | 
			
		||||
run_and_track_test() {
 | 
			
		||||
    local test_num_arg="$1"
 | 
			
		||||
    local test_name_arg="$2"
 | 
			
		||||
    local test_command_arg="$3"
 | 
			
		||||
 | 
			
		||||
    # Run the test
 | 
			
		||||
    run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
 | 
			
		||||
    local test_specific_exit_code=$?
 | 
			
		||||
 | 
			
		||||
    # If the test failed, set the overall script exit code to 1
 | 
			
		||||
    if [ "$test_specific_exit_code" -ne 0 ]; then
 | 
			
		||||
        # No need for extra echo here, run_test already logged the failure.
 | 
			
		||||
        overall_script_exit_code=1
 | 
			
		||||
    fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
# --- Actual Test Execution ---
 | 
			
		||||
run_and_track_test 0 "test_perf.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
 | 
			
		||||
run_and_track_test 1 "test_compilation.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
 | 
			
		||||
run_and_track_test 2 "test_basic.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
 | 
			
		||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
 | 
			
		||||
run_and_track_test 4 "test_quantization_accuracy.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
 | 
			
		||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
 | 
			
		||||
    "python3 /workspace/vllm/examples/offline_inference/tpu.py"
 | 
			
		||||
run_and_track_test 6 "test_tpu_model_runner.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
 | 
			
		||||
run_and_track_test 7 "test_sampler.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
 | 
			
		||||
run_and_track_test 8 "test_topk_topp_sampler.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
 | 
			
		||||
run_and_track_test 9 "test_multimodal.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
 | 
			
		||||
run_and_track_test 10 "test_pallas.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
 | 
			
		||||
run_and_track_test 11 "test_struct_output_generate.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 | 
			
		||||
run_and_track_test 12 "test_moe_pallas.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 | 
			
		||||
run_and_track_test 13 "test_lora.py" \
 | 
			
		||||
    "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
 | 
			
		||||
run_and_track_test 14 "test_tpu_qkv_linear.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 | 
			
		||||
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
 | 
			
		||||
    "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
 | 
			
		||||
 | 
			
		||||
# After all tests have been attempted, exit with the overall status.
 | 
			
		||||
if [ "$overall_script_exit_code" -ne 0 ]; then
 | 
			
		||||
    echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
 | 
			
		||||
else
 | 
			
		||||
    echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
 | 
			
		||||
fi
 | 
			
		||||
exit "$overall_script_exit_code"
 | 
			
		||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
 | 
			
		||||
 | 
			
		||||
# Capture the exit code of the docker run command
 | 
			
		||||
DOCKER_RUN_EXIT_CODE=$?
 | 
			
		||||
 | 
			
		||||
# The trap will run for cleanup.
 | 
			
		||||
# Exit the main script with the Docker run command's exit code.
 | 
			
		||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
 | 
			
		||||
    echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
 | 
			
		||||
    exit "$DOCKER_RUN_EXIT_CODE"
 | 
			
		||||
else
 | 
			
		||||
    echo "Docker run command completed successfully."
 | 
			
		||||
    exit 0
 | 
			
		||||
fi
 | 
			
		||||
# TODO: This test fails because it uses RANDOM_SEED sampling
 | 
			
		||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
 | 
			
		||||
@ -1,31 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# This script builds the XPU docker image and runs the offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
 | 
			
		||||
set -ex
 | 
			
		||||
 | 
			
		||||
image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
 | 
			
		||||
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 | 
			
		||||
 | 
			
		||||
# Try building the docker image
 | 
			
		||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
 | 
			
		||||
 | 
			
		||||
# Setup cleanup
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
  docker rm -f "${container_name}" || true; 
 | 
			
		||||
  docker image rm -f "${image_name}" || true;
 | 
			
		||||
  docker system prune -f || true;
 | 
			
		||||
}
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Run the image and test offline inference/tensor parallel
 | 
			
		||||
docker run \
 | 
			
		||||
    --device /dev/dri \
 | 
			
		||||
    -v /dev/dri/by-path:/dev/dri/by-path \
 | 
			
		||||
    --entrypoint="" \
 | 
			
		||||
    --name "${container_name}" \
 | 
			
		||||
    "${image_name}" \
 | 
			
		||||
    sh -c '
 | 
			
		||||
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
 | 
			
		||||
    VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 | 
			
		||||
'
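# The two generate.py invocations above are the actual sanity check: the first exercises
# single-device inference and the second exercises 2-way tensor parallelism (-tp 2)
# inside the freshly built image.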
 | 
			
		||||
@ -1,18 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
# Usage: ./rerun_test.sh path/to/test.py::test_name
 | 
			
		||||
 | 
			
		||||
# Check if an argument is given
 | 
			
		||||
if [ $# -lt 1 ]; then
 | 
			
		||||
    echo "Usage: $0 path/to/test.py::test_name"
 | 
			
		||||
    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
TEST=$1
 | 
			
		||||
COUNT=1
 | 
			
		||||
 | 
			
		||||
while pytest -sv "$TEST"; do
 | 
			
		||||
    COUNT=$((COUNT + 1))
 | 
			
		||||
    echo "RUN NUMBER ${COUNT}"
 | 
			
		||||
done
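# The loop keeps re-running the test until the first failing run, which makes it a
# convenient way to reproduce flaky failures; each passing run bumps COUNT before the
# next attempt, so the last printed RUN NUMBER shows how far the test got.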
 | 
			
		||||
@ -1,24 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -euo pipefail
 | 
			
		||||
 | 
			
		||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
 | 
			
		||||
if [ -z "$docker_root" ]; then
 | 
			
		||||
  echo "Failed to determine Docker root directory."
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
echo "Docker root directory: $docker_root"
 | 
			
		||||
# Check disk usage of the filesystem where Docker's root directory is located
 | 
			
		||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
 | 
			
		||||
# Define the threshold
 | 
			
		||||
threshold=70
 | 
			
		||||
if [ "$disk_usage" -gt "$threshold" ]; then
 | 
			
		||||
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
 | 
			
		||||
  # Remove dangling images (those that are not tagged and not used by any container)
 | 
			
		||||
  docker image prune -f
 | 
			
		||||
  # Remove unused volumes and force a system prune of images older than 72 hours as well.
 | 
			
		||||
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
 | 
			
		||||
  echo "Docker images and volumes cleanup completed."
 | 
			
		||||
else
 | 
			
		||||
  echo "Disk usage is below $threshold%. No cleanup needed."
 | 
			
		||||
fi
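# Note: `docker system prune --force --filter "until=72h" --all` also removes unused
# images and build cache created more than 72 hours ago, which is intended to keep
# long-running CI hosts under the disk-usage threshold between runs.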
 | 
			
		||||
@ -1,14 +0,0 @@
 | 
			
		||||
# Environment config
 | 
			
		||||
TEST_NAME=llama8b
 | 
			
		||||
CONTAINER_NAME=vllm-tpu
 | 
			
		||||
 | 
			
		||||
# vllm config
 | 
			
		||||
MODEL=meta-llama/Llama-3.1-8B-Instruct
 | 
			
		||||
MAX_NUM_SEQS=512
 | 
			
		||||
MAX_NUM_BATCHED_TOKENS=512
 | 
			
		||||
TENSOR_PARALLEL_SIZE=1
 | 
			
		||||
MAX_MODEL_LEN=2048
 | 
			
		||||
DOWNLOAD_DIR=/mnt/disks/persist
 | 
			
		||||
EXPECTED_THROUGHPUT=8.0
 | 
			
		||||
INPUT_LEN=1800
 | 
			
		||||
OUTPUT_LEN=128
 | 
			
		||||
@ -1,102 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
if [ ! -f "$1" ]; then
 | 
			
		||||
  echo "Error: The env file '$1' does not exist."
 | 
			
		||||
  exit 1  # Exit the script with a non-zero status to indicate an error
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
ENV_FILE=$1
 | 
			
		||||
 | 
			
		||||
# For testing on local vm, use `set -a` to export all variables
 | 
			
		||||
source /etc/environment
 | 
			
		||||
source $ENV_FILE
 | 
			
		||||
 | 
			
		||||
remove_docker_container() { 
 | 
			
		||||
    docker rm -f tpu-test || true; 
 | 
			
		||||
    docker rm -f vllm-tpu || true;
 | 
			
		||||
    docker rm -f $CONTAINER_NAME || true;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
trap remove_docker_container EXIT
 | 
			
		||||
 | 
			
		||||
# Remove containers that might not have been cleaned up in a previous run.
 | 
			
		||||
remove_docker_container
 | 
			
		||||
 | 
			
		||||
# Build docker image.
 | 
			
		||||
# TODO: build the image outside the script and share it with other
# TPU tests if the build time is too long.
 | 
			
		||||
DOCKER_BUILDKIT=1 docker build \
 | 
			
		||||
  --build-arg max_jobs=16 \
 | 
			
		||||
  --build-arg USE_SCCACHE=1 \
 | 
			
		||||
  --build-arg GIT_REPO_CHECK=0 \
 | 
			
		||||
  --tag vllm/vllm-tpu-bm \
 | 
			
		||||
  --progress plain -f docker/Dockerfile.tpu .
 | 
			
		||||
 | 
			
		||||
LOG_ROOT=$(mktemp -d)
 | 
			
		||||
# If mktemp fails, set -e will cause the script to exit.
 | 
			
		||||
echo "Results will be stored in: $LOG_ROOT"
 | 
			
		||||
 | 
			
		||||
if [ -z "$HF_TOKEN" ]; then
 | 
			
		||||
  echo "Error: HF_TOKEN is not set or is empty."  
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
# Make sure mounted disk or dir exists
 | 
			
		||||
if [ ! -d "$DOWNLOAD_DIR" ]; then
 | 
			
		||||
    echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
 | 
			
		||||
    exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
echo "Run model $MODEL"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
echo "starting docker...$CONTAINER_NAME"
 | 
			
		||||
echo    
 | 
			
		||||
docker run \
 | 
			
		||||
 -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
 | 
			
		||||
 --env-file $ENV_FILE \
 | 
			
		||||
 -e HF_TOKEN="$HF_TOKEN" \
 | 
			
		||||
 -e TARGET_COMMIT=$BUILDKITE_COMMIT \
 | 
			
		||||
 -e MODEL=$MODEL \
 | 
			
		||||
 -e WORKSPACE=/workspace \
 | 
			
		||||
 --name $CONTAINER_NAME \
 | 
			
		||||
 -d \
 | 
			
		||||
 --privileged \
 | 
			
		||||
 --network host \
 | 
			
		||||
 -v /dev/shm:/dev/shm \
 | 
			
		||||
 vllm/vllm-tpu-bm tail -f /dev/null
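# `tail -f /dev/null` keeps the detached container alive so the benchmark can be
# started afterwards with `docker exec`.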
 | 
			
		||||
 | 
			
		||||
echo "run script..."
 | 
			
		||||
echo
 | 
			
		||||
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
 | 
			
		||||
 | 
			
		||||
echo "copy result back..."
 | 
			
		||||
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
 | 
			
		||||
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
 | 
			
		||||
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG" 
 | 
			
		||||
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
 | 
			
		||||
 | 
			
		||||
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
 | 
			
		||||
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
 | 
			
		||||
 | 
			
		||||
if [ "$BUILDKITE" = "true" ]; then
 | 
			
		||||
  echo "Running inside Buildkite"
 | 
			
		||||
  buildkite-agent artifact upload "$VLLM_LOG" 
 | 
			
		||||
  buildkite-agent artifact upload "$BM_LOG"
 | 
			
		||||
else
 | 
			
		||||
  echo "Not running inside Buildkite"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
#
# Compare the throughput with EXPECTED_THROUGHPUT
# and assert that it meets the expectation.
#
 | 
			
		||||
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
 | 
			
		||||
  echo "Failed to get the throughput"
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
 | 
			
		||||
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
 | 
			
		||||
  exit 1
 | 
			
		||||
fi
 | 
			
		||||
@ -1,94 +0,0 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -euo pipefail
 | 
			
		||||
 | 
			
		||||
VLLM_LOG="$WORKSPACE/vllm_log.txt"
 | 
			
		||||
BM_LOG="$WORKSPACE/bm_log.txt"
 | 
			
		||||
 | 
			
		||||
if [ -n "$TARGET_COMMIT" ]; then
 | 
			
		||||
  head_hash=$(git rev-parse HEAD)
 | 
			
		||||
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
 | 
			
		||||
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
echo "model: $MODEL"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# create a log folder
 | 
			
		||||
#
 | 
			
		||||
mkdir "$WORKSPACE/log"
 | 
			
		||||
 | 
			
		||||
# TODO: Move to image building.
 | 
			
		||||
pip install pandas
 | 
			
		||||
pip install datasets
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# create sonnet_4x
 | 
			
		||||
#
 | 
			
		||||
echo "Create sonnet_4x.txt"
 | 
			
		||||
echo "" > benchmarks/sonnet_4x.txt
 | 
			
		||||
for _ in {1..4}
 | 
			
		||||
 do
 | 
			
		||||
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# start vllm service in the background
 | 
			
		||||
#
 | 
			
		||||
echo "lanching vllm..."
 | 
			
		||||
echo "logging to $VLLM_LOG"
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
VLLM_USE_V1=1 vllm serve $MODEL \
 | 
			
		||||
 --seed 42 \
 | 
			
		||||
 --disable-log-requests \
 | 
			
		||||
 --max-num-seqs $MAX_NUM_SEQS \
 | 
			
		||||
 --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
 | 
			
		||||
 --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
 | 
			
		||||
 --no-enable-prefix-caching \
 | 
			
		||||
 --download_dir $DOWNLOAD_DIR \
 | 
			
		||||
 --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &
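# The server is launched in the background with stdout/stderr redirected to $VLLM_LOG;
# the polling loop below watches that file to decide when the server is ready.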
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
echo "wait for 20 minutes.."
 | 
			
		||||
echo
 | 
			
		||||
# sleep 1200
 | 
			
		||||
# wait for up to 20 minutes...
 | 
			
		||||
for i in {1..120}; do
 | 
			
		||||
    # TODO: detect other types of errors.
 | 
			
		||||
    if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
 | 
			
		||||
        echo "Detected RuntimeError, exiting."
 | 
			
		||||
        exit 1
 | 
			
		||||
    elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
 | 
			
		||||
        echo "Application started"
 | 
			
		||||
        break
 | 
			
		||||
    else
 | 
			
		||||
        echo "wait for 10 seconds..."
 | 
			
		||||
        sleep 10
 | 
			
		||||
    fi
 | 
			
		||||
done
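# The loop polls the log every 10 seconds for up to 120 attempts (about 20 minutes) and
# stops early either on a detected RuntimeError or on "Application startup complete".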
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# run test
 | 
			
		||||
#
 | 
			
		||||
echo "run benchmark test..."
 | 
			
		||||
echo "logging to $BM_LOG"
 | 
			
		||||
echo
 | 
			
		||||
python benchmarks/benchmark_serving.py \
 | 
			
		||||
    --backend vllm \
 | 
			
		||||
    --model $MODEL  \
 | 
			
		||||
    --dataset-name sonnet \
 | 
			
		||||
    --dataset-path benchmarks/sonnet_4x.txt \
 | 
			
		||||
    --sonnet-input-len $INPUT_LEN \
 | 
			
		||||
    --sonnet-output-len $OUTPUT_LEN \
 | 
			
		||||
    --ignore-eos > "$BM_LOG"
 | 
			
		||||
 | 
			
		||||
echo "completed..."
 | 
			
		||||
echo
 | 
			
		||||
 | 
			
		||||
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
 | 
			
		||||
echo "throughput: $throughput"
 | 
			
		||||
echo
 | 
			
		||||
@ -8,7 +8,6 @@
 | 
			
		||||
# Documentation
 | 
			
		||||
# label(str): the name of the test. emoji allowed.
 | 
			
		||||
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# torch_nightly(bool): whether to run this on the vllm-against-torch-nightly pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
 | 
			
		||||
# command(str): the single command to run for tests. incompatible with commands.
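# A minimal sketch of a step using the fields documented above (the label, paths, and
# command are illustrative only, not an actual pipeline entry):
#
#   - label: Example Test # 5min
#     fast_check: true
#     optional: false
#     source_file_dependencies:
#     - vllm/
#     commands:
#     - pytest -v -s example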
 | 
			
		||||
@ -32,17 +31,17 @@ steps:
 | 
			
		||||
##### fast check tests  #####
 | 
			
		||||
 | 
			
		||||
- label: Documentation Build # 2min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/test_docs"
 | 
			
		||||
  working_dir: "/vllm-workspace/test_docs/docs"
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  no_gpu: True
 | 
			
		||||
  commands:
 | 
			
		||||
  - pip install -r ../requirements/docs.txt
 | 
			
		||||
  # TODO: add `--strict` once warnings in docstrings are fixed
 | 
			
		||||
  - mkdocs build
 | 
			
		||||
  - pip install -r requirements-docs.txt
 | 
			
		||||
  - SPHINXOPTS=\"-W\" make html
 | 
			
		||||
  # Check API reference (if it fails, you may have missing mock imports)
 | 
			
		||||
  - grep \"sig sig-object py\" build/html/api/inference_params.html
 | 
			
		||||
 | 
			
		||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/mq_llm_engine
 | 
			
		||||
@ -58,13 +57,11 @@ steps:
 | 
			
		||||
  - pytest -v -s async_engine # AsyncLLMEngine
 | 
			
		||||
  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
 | 
			
		||||
  - pytest -v -s test_inputs.py
 | 
			
		||||
  - pytest -v -s test_outputs.py
 | 
			
		||||
  - pytest -v -s multimodal
 | 
			
		||||
  - pytest -v -s test_utils.py # Utils
 | 
			
		||||
  - pytest -v -s worker # Worker
 | 
			
		||||
 | 
			
		||||
- label: Python-only Installation Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - tests/standalone_tests/python_only_compile.sh
 | 
			
		||||
  - setup.py
 | 
			
		||||
@ -72,9 +69,8 @@ steps:
 | 
			
		||||
  - bash standalone_tests/python_only_compile.sh
 | 
			
		||||
 | 
			
		||||
- label: Basic Correctness Test # 30min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  #mirror_hardwares: [amd]
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/basic_correctness/test_basic_correctness
 | 
			
		||||
@ -82,14 +78,12 @@ steps:
 | 
			
		||||
  - tests/basic_correctness/test_preemption
 | 
			
		||||
  - tests/basic_correctness/test_cumem.py
 | 
			
		||||
  commands:
 | 
			
		||||
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
			
		||||
  - pytest -v -s basic_correctness/test_cumem.py
 | 
			
		||||
  - pytest -v -s basic_correctness/test_basic_correctness.py
 | 
			
		||||
  - pytest -v -s basic_correctness/test_cpu_offload.py
 | 
			
		||||
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 | 
			
		||||
 | 
			
		||||
- label: Chunked Prefill Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/basic_correctness/test_chunked_prefill
 | 
			
		||||
@ -98,7 +92,7 @@ steps:
 | 
			
		||||
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 | 
			
		||||
 | 
			
		||||
- label: Core Test # 10min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/core
 | 
			
		||||
@ -108,10 +102,9 @@ steps:
 | 
			
		||||
  - pytest -v -s core
 | 
			
		||||
 | 
			
		||||
- label: Entrypoints Test # 40min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/entrypoints/llm
 | 
			
		||||
@ -119,58 +112,43 @@ steps:
 | 
			
		||||
  - tests/entrypoints/test_chat_utils
 | 
			
		||||
  - tests/entrypoints/offline_mode
 | 
			
		||||
  commands:
 | 
			
		||||
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
			
		||||
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
 | 
			
		||||
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
 | 
			
		||||
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
 | 
			
		||||
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
 | 
			
		||||
  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
 | 
			
		||||
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
 | 
			
		||||
  - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
 | 
			
		||||
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
 | 
			
		||||
  - pytest -v -s entrypoints/test_chat_utils.py
 | 
			
		||||
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 | 
			
		||||
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 | 
			
		||||
 | 
			
		||||
- label: Distributed Tests (4 GPUs) # 10min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 4
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/distributed/
 | 
			
		||||
  - vllm/core/
 | 
			
		||||
  - tests/distributed/test_utils
 | 
			
		||||
  - tests/distributed/test_pynccl
 | 
			
		||||
  - tests/distributed/test_events
 | 
			
		||||
  - tests/spec_decode/e2e/test_integration_dist_tp4
 | 
			
		||||
  - tests/compile/test_basic_correctness
 | 
			
		||||
  - examples/offline_inference/rlhf.py
 | 
			
		||||
  - examples/offline_inference/rlhf_colocate.py
 | 
			
		||||
  - tests/examples/offline_inference/data_parallel.py
 | 
			
		||||
  - tests/v1/test_async_llm_dp.py
 | 
			
		||||
  - tests/v1/engine/test_engine_core_client.py
 | 
			
		||||
  commands:
 | 
			
		||||
  # test with tp=2 and external_dp=2
 | 
			
		||||
  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
 | 
			
		||||
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
 | 
			
		||||
  # test with tp=2 and pp=2
 | 
			
		||||
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
 | 
			
		||||
  # test with internal dp
 | 
			
		||||
  - python3 ../examples/offline_inference/data_parallel.py
 | 
			
		||||
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
 | 
			
		||||
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
 | 
			
		||||
  - VLLM_USE_V1=1 python3 ../examples/offline_inference/data_parallel.py
 | 
			
		||||
  - pytest -v -s distributed/test_utils.py
 | 
			
		||||
  - pytest -v -s compile/test_basic_correctness.py
 | 
			
		||||
  - pytest -v -s distributed/test_pynccl.py
 | 
			
		||||
  - pytest -v -s distributed/test_events.py
 | 
			
		||||
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
 | 
			
		||||
  # TODO: create a dedicated test section for multi-GPU example tests
 | 
			
		||||
  # when we have multiple distributed example tests
 | 
			
		||||
  - pushd ../examples/offline_inference
 | 
			
		||||
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
 | 
			
		||||
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
 | 
			
		||||
  - popd
 | 
			
		||||
  - python3 ../examples/offline_inference/rlhf.py
 | 
			
		||||
  - RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/rlhf_colocate.py
 | 
			
		||||
 | 
			
		||||
- label: Metrics, Tracing Test # 10min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  num_gpus: 2
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/metrics
 | 
			
		||||
@ -178,17 +156,17 @@ steps:
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s metrics
 | 
			
		||||
  - "pip install \
 | 
			
		||||
      'opentelemetry-sdk>=1.26.0' \
 | 
			
		||||
      'opentelemetry-api>=1.26.0' \
 | 
			
		||||
      'opentelemetry-exporter-otlp>=1.26.0' \
 | 
			
		||||
      'opentelemetry-semantic-conventions-ai>=0.4.1'"
 | 
			
		||||
      'opentelemetry-sdk>=1.26.0,<1.27.0' \
 | 
			
		||||
      'opentelemetry-api>=1.26.0,<1.27.0' \
 | 
			
		||||
      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
 | 
			
		||||
      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
 | 
			
		||||
  - pytest -v -s tracing
 | 
			
		||||
 | 
			
		||||
##### fast check tests  #####
 | 
			
		||||
#####  1 GPU test  #####
 | 
			
		||||
 | 
			
		||||
- label: Regression Test # 5min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/test_regression
 | 
			
		||||
@ -198,7 +176,7 @@ steps:
 | 
			
		||||
  working_dir: "/vllm-workspace/tests" # optional
 | 
			
		||||
 | 
			
		||||
- label: Engine Test # 10min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/engine
 | 
			
		||||
@ -206,41 +184,34 @@ steps:
 | 
			
		||||
  - tests/test_sequence
 | 
			
		||||
  - tests/test_config
 | 
			
		||||
  - tests/test_logger
 | 
			
		||||
  - tests/test_vllm_port
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
 | 
			
		||||
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
 | 
			
		||||
  # OOM in the CI unless we run this separately
 | 
			
		||||
  - pytest -v -s tokenization
 | 
			
		||||
 | 
			
		||||
- label: V1 Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  #mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
    - vllm/
 | 
			
		||||
    - tests/v1
 | 
			
		||||
  commands:
 | 
			
		||||
    # split the test to avoid interference
 | 
			
		||||
    - pytest -v -s v1/core
 | 
			
		||||
    - pytest -v -s v1/engine
 | 
			
		||||
    - pytest -v -s v1/entrypoints
 | 
			
		||||
    - pytest -v -s v1/sample
 | 
			
		||||
    - pytest -v -s v1/worker
 | 
			
		||||
    - pytest -v -s v1/structured_output
 | 
			
		||||
    - pytest -v -s v1/spec_decode
 | 
			
		||||
    - pytest -v -s v1/kv_connector/unit
 | 
			
		||||
    - pytest -v -s v1/test_serial_utils.py
 | 
			
		||||
    - pytest -v -s v1/test_utils.py
 | 
			
		||||
    - pytest -v -s v1/test_oracle.py
 | 
			
		||||
    - pytest -v -s v1/test_metrics_reader.py
 | 
			
		||||
    - VLLM_USE_V1=1 pytest -v -s v1/core
 | 
			
		||||
    - VLLM_USE_V1=1 pytest -v -s v1/engine
 | 
			
		||||
    - VLLM_USE_V1=1 pytest -v -s v1/sample
 | 
			
		||||
    - VLLM_USE_V1=1 pytest -v -s v1/worker
 | 
			
		||||
    - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
 | 
			
		||||
    - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
 | 
			
		||||
    # TODO: accuracy does not match, whether or not
    # VLLM_USE_FLASHINFER_SAMPLER is set, on H100.
 | 
			
		||||
    - pytest -v -s v1/e2e
 | 
			
		||||
    - VLLM_USE_V1=1 pytest -v -s v1/e2e
 | 
			
		||||
    # Integration test for streaming correctness (requires special branch).
 | 
			
		||||
    - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
 | 
			
		||||
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 | 
			
		||||
 | 
			
		||||
- label: Examples Test # 25min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/examples"
 | 
			
		||||
  #mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/entrypoints
 | 
			
		||||
  - examples/
 | 
			
		||||
@ -251,20 +222,17 @@ steps:
 | 
			
		||||
    - python3 offline_inference/basic/chat.py
 | 
			
		||||
    - python3 offline_inference/prefix_caching.py
 | 
			
		||||
    - python3 offline_inference/llm_engine_example.py
 | 
			
		||||
    - python3 offline_inference/audio_language.py --seed 0
 | 
			
		||||
    - python3 offline_inference/vision_language.py --seed 0
 | 
			
		||||
    - python3 offline_inference/vision_language_embedding.py --seed 0
 | 
			
		||||
    - python3 offline_inference/vision_language_multi_image.py --seed 0
 | 
			
		||||
    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 | 
			
		||||
    - python3 offline_inference/vision_language.py
 | 
			
		||||
    - python3 offline_inference/vision_language_multi_image.py
 | 
			
		||||
    - python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 | 
			
		||||
    - python3 offline_inference/encoder_decoder.py
 | 
			
		||||
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
 | 
			
		||||
    - python3 offline_inference/basic/classify.py
 | 
			
		||||
    - python3 offline_inference/basic/embed.py
 | 
			
		||||
    - python3 offline_inference/basic/score.py
 | 
			
		||||
    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 | 
			
		||||
    - python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 | 
			
		||||
 | 
			
		||||
- label: Prefix Caching Test # 9min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/prefix_caching
 | 
			
		||||
@ -272,7 +240,6 @@ steps:
 | 
			
		||||
    - pytest -v -s prefix_caching
 | 
			
		||||
 | 
			
		||||
- label: Samplers Test # 36min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/model_executor/layers
 | 
			
		||||
  - vllm/sampling_metadata.py
 | 
			
		||||
@ -282,8 +249,18 @@ steps:
 | 
			
		||||
    - pytest -v -s samplers
 | 
			
		||||
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 | 
			
		||||
 | 
			
		||||
- label: LogitsProcessor Test # 5min
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/model_executor/layers
 | 
			
		||||
  - vllm/model_executor/guided_decoding
 | 
			
		||||
  - tests/test_logits_processor
 | 
			
		||||
  - tests/model_executor/test_guided_processors
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s test_logits_processor.py
 | 
			
		||||
    - pytest -v -s model_executor/test_guided_processors.py
 | 
			
		||||
 | 
			
		||||
- label: Speculative decoding tests # 40min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/spec_decode
 | 
			
		||||
  - tests/spec_decode
 | 
			
		||||
@ -294,30 +271,15 @@ steps:
 | 
			
		||||
    - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 | 
			
		||||
 | 
			
		||||
- label: LoRA Test %N # 15min each
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/lora
 | 
			
		||||
  - tests/lora
 | 
			
		||||
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
 | 
			
		||||
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
 | 
			
		||||
  parallelism: 4
 | 
			
		||||
 | 
			
		||||
- label: PyTorch Compilation Unit Tests
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
    - vllm/
 | 
			
		||||
    - tests/compile
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s compile/test_pass_manager.py
 | 
			
		||||
    - pytest -v -s compile/test_fusion.py
 | 
			
		||||
    - pytest -v -s compile/test_fusion_attn.py
 | 
			
		||||
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
 | 
			
		||||
    - pytest -v -s compile/test_sequence_parallelism.py
 | 
			
		||||
    - pytest -v -s compile/test_async_tp.py
 | 
			
		||||
 | 
			
		||||
- label: PyTorch Fullgraph Smoke Test # 9min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/compile
 | 
			
		||||
@ -326,127 +288,60 @@ steps:
 | 
			
		||||
  # these tests need to be separated, cannot combine
 | 
			
		||||
  - pytest -v -s compile/piecewise/test_simple.py
 | 
			
		||||
  - pytest -v -s compile/piecewise/test_toy_llama.py
 | 
			
		||||
  - pytest -v -s compile/piecewise/test_full_cudagraph.py
 | 
			
		||||
 | 
			
		||||
- label: PyTorch Fullgraph Test # 18min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/compile
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s compile/test_full_graph.py
 | 
			
		||||
 | 
			
		||||
- label: Kernels Core Operation Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
- label: Kernels Test %N # 1h each
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/
 | 
			
		||||
  - tests/kernels/core
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s kernels/core
 | 
			
		||||
 | 
			
		||||
- label: Kernels Attention Test %N
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/attention/
 | 
			
		||||
  - vllm/attention
 | 
			
		||||
  - vllm/v1/attention
 | 
			
		||||
  - tests/kernels/attention
 | 
			
		||||
  - tests/kernels
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
 | 
			
		||||
  parallelism: 2
 | 
			
		||||
 | 
			
		||||
- label: Kernels Quantization Test %N
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/quantization/
 | 
			
		||||
  - vllm/model_executor/layers/quantization
 | 
			
		||||
  - tests/kernels/quantization
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s kernels/quantization  --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
 | 
			
		||||
  parallelism: 2
 | 
			
		||||
 | 
			
		||||
- label: Kernels MoE Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/moe/
 | 
			
		||||
  - tests/kernels/moe
 | 
			
		||||
  - vllm/model_executor/layers/fused_moe/
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s kernels/moe
 | 
			
		||||
 | 
			
		||||
- label: Kernels Mamba Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/mamba/
 | 
			
		||||
  - tests/kernels/mamba
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s kernels/mamba
 | 
			
		||||
    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
 | 
			
		||||
  parallelism: 4
 | 
			
		||||
 | 
			
		||||
- label: Tensorizer Test # 11min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  soft_fail: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/model_executor/model_loader
 | 
			
		||||
  - tests/tensorizer_loader
 | 
			
		||||
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
 | 
			
		||||
  commands:
 | 
			
		||||
    - apt-get update && apt-get install -y curl libsodium23
 | 
			
		||||
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
			
		||||
    - pytest -v -s tensorizer_loader
 | 
			
		||||
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 | 
			
		||||
 | 
			
		||||
- label: Model Executor Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  soft_fail: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/model_executor
 | 
			
		||||
  - tests/model_executor
 | 
			
		||||
  commands:
 | 
			
		||||
    - apt-get update && apt-get install -y curl libsodium23
 | 
			
		||||
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
			
		||||
    - pytest -v -s model_executor
 | 
			
		||||
 | 
			
		||||
- label: Benchmarks # 9min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  working_dir: "/vllm-workspace/.buildkite"
 | 
			
		||||
  mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - benchmarks/
 | 
			
		||||
  commands:
 | 
			
		||||
  - bash scripts/run-benchmarks.sh
 | 
			
		||||
  - bash run-benchmarks.sh
 | 
			
		||||
 | 
			
		||||
- label: Benchmarks CLI Test # 10min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/benchmarks/
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s benchmarks/
 | 
			
		||||
 | 
			
		||||
- label: Quantization Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
- label: Quantization Test # 33min
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/
 | 
			
		||||
  - vllm/model_executor/layers/quantization
 | 
			
		||||
  - tests/quantization
 | 
			
		||||
  commands:
 | 
			
		||||
  # temporary install here since we need nightly, will move to requirements/test.in
 | 
			
		||||
  # after torchao 0.12 release
 | 
			
		||||
  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
 | 
			
		||||
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 | 
			
		||||
  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 | 
			
		||||
 | 
			
		||||
- label: LM Eval Small Models # 53min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/
 | 
			
		||||
  - vllm/model_executor/layers/quantization
 | 
			
		||||
  commands:
 | 
			
		||||
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
			
		||||
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 | 
			
		||||
  - bash ./run-tests.sh -c configs/models-small.txt -t 1
 | 
			
		||||
 | 
			
		||||
- label: OpenAI API correctness
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/
 | 
			
		||||
  - vllm/entrypoints/openai/
 | 
			
		||||
@ -455,7 +350,6 @@ steps:
 | 
			
		||||
  - pytest -s entrypoints/openai/correctness/
 | 
			
		||||
 | 
			
		||||
- label: Encoder Decoder tests # 5min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/encoder_decoder
 | 
			
		||||
@ -463,117 +357,97 @@ steps:
 | 
			
		||||
    - pytest -v -s encoder_decoder
 | 
			
		||||
 | 
			
		||||
- label: OpenAI-Compatible Tool Use # 20 min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  fast_check: false
 | 
			
		||||
  mirror_hardwares: [ amd ]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
    - vllm/
 | 
			
		||||
    - tests/tool_use
 | 
			
		||||
    - tests/mistral_tool_use
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s tool_use
 | 
			
		||||
    - pytest -v -s mistral_tool_use
 | 
			
		||||
 | 
			
		||||
#####  models test  #####
 | 
			
		||||
 | 
			
		||||
- label: Basic Models Test # 24min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s models/test_transformers.py
 | 
			
		||||
    - pytest -v -s models/test_registry.py
 | 
			
		||||
    - pytest -v -s models/test_utils.py
 | 
			
		||||
    - pytest -v -s models/test_vision.py
 | 
			
		||||
    - pytest -v -s models/test_initialization.py
 | 
			
		||||
 | 
			
		||||
- label: Language Models Test (Standard)
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
- label: Language Models Test (Standard) # 32min
 | 
			
		||||
  #mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models/language
 | 
			
		||||
  - tests/models/decoder_only/language
 | 
			
		||||
  - tests/models/embedding/language
 | 
			
		||||
  - tests/models/encoder_decoder/language
 | 
			
		||||
  commands:
 | 
			
		||||
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
 | 
			
		||||
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
 | 
			
		||||
    - pip freeze | grep -E 'torch'
 | 
			
		||||
    - pytest -v -s models/language -m core_model
 | 
			
		||||
    - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
 | 
			
		||||
    - pytest -v -s models/embedding/language -m core_model
 | 
			
		||||
 | 
			
		||||
- label: Language Models Test (Extended Generation) # 1hr20min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
- label: Language Models Test (Extended) # 1h10min
 | 
			
		||||
  optional: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models/language/generation
 | 
			
		||||
  - tests/models/decoder_only/language
 | 
			
		||||
  - tests/models/embedding/language
 | 
			
		||||
  - tests/models/encoder_decoder/language
 | 
			
		||||
  commands:
 | 
			
		||||
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
 | 
			
		||||
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
 | 
			
		||||
    - pytest -v -s models/language/generation -m 'not core_model'
 | 
			
		||||
    - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
 | 
			
		||||
    - pytest -v -s models/embedding/language -m 'not core_model'
 | 
			
		||||
 | 
			
		||||
- label: Language Models Test (Extended Pooling)  # 36min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  optional: true
 | 
			
		||||
- label: Multi-Modal Models Test (Standard) # 40min
 | 
			
		||||
  #mirror_hardwares: [amd]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models/language/pooling
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s models/language/pooling -m 'not core_model'
 | 
			
		||||
 | 
			
		||||
- label: Multi-Modal Models Test (Standard)
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  torch_nightly: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models/multimodal
 | 
			
		||||
  - tests/models/decoder_only/audio_language
 | 
			
		||||
  - tests/models/decoder_only/vision_language
 | 
			
		||||
  - tests/models/embedding/vision_language
 | 
			
		||||
  - tests/models/encoder_decoder/audio_language
 | 
			
		||||
  - tests/models/encoder_decoder/vision_language
 | 
			
		||||
  commands:
 | 
			
		||||
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
 | 
			
		||||
    - pip freeze | grep -E 'torch'
 | 
			
		||||
    - pytest -v -s models/multimodal/processing
 | 
			
		||||
    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
 | 
			
		||||
    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 | 
			
		||||
    - pytest -v -s models/multimodal
 | 
			
		||||
    - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
 | 
			
		||||
    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
 | 
			
		||||
    - pytest -v -s models/embedding/vision_language -m core_model
 | 
			
		||||
    - pytest -v -s models/encoder_decoder/audio_language -m core_model
 | 
			
		||||
    - pytest -v -s models/encoder_decoder/language -m core_model
 | 
			
		||||
    - pytest -v -s models/encoder_decoder/vision_language -m core_model
 | 
			
		||||
 | 
			
		||||
- label: Multi-Modal Models Test (Extended) 1
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
- label: Multi-Modal Models Test (Extended) 1 # 48m
 | 
			
		||||
  optional: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models/multimodal
 | 
			
		||||
  - tests/models/decoder_only/audio_language
 | 
			
		||||
  - tests/models/decoder_only/vision_language
 | 
			
		||||
  - tests/models/embedding/vision_language
 | 
			
		||||
  - tests/models/encoder_decoder/vision_language
 | 
			
		||||
  commands:
 | 
			
		||||
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
 | 
			
		||||
    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
 | 
			
		||||
    - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
 | 
			
		||||
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
 | 
			
		||||
    # HACK - run phi3v tests separately to sidestep this transformers bug
 | 
			
		||||
    # https://github.com/huggingface/transformers/issues/34307
 | 
			
		||||
    - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
 | 
			
		||||
    - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
 | 
			
		||||
    - pytest -v -s models/embedding/vision_language -m 'not core_model'
 | 
			
		||||
    - pytest -v -s models/encoder_decoder/language -m 'not core_model'
 | 
			
		||||
    - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
 | 
			
		||||
 | 
			
		||||
- label: Multi-Modal Models Test (Extended) 2
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
- label: Multi-Modal Models Test (Extended) 2 # 38m
 | 
			
		||||
  optional: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models/multimodal
 | 
			
		||||
  - tests/models/decoder_only/vision_language
 | 
			
		||||
  commands:
 | 
			
		||||
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
 | 
			
		||||
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
 | 
			
		||||
 | 
			
		||||
- label: Multi-Modal Models Test (Extended) 3
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  optional: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models/multimodal
 | 
			
		||||
  commands:
 | 
			
		||||
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
 | 
			
		||||
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
 | 
			
		||||
 | 
			
		||||
- label: Quantized Models Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/model_executor/layers/quantization
 | 
			
		||||
  - tests/models/quantization
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s models/quantization
 | 
			
		||||
    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
 | 
			
		||||
 | 
			
		||||
# This test is used only in the PR development phase to test individual models and should never run on main
 | 
			
		||||
- label: Custom Models Test
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  optional: true
 | 
			
		||||
  commands:
 | 
			
		||||
    - echo 'Testing custom models...'
 | 
			
		||||
@ -585,7 +459,6 @@ steps:
 | 
			
		||||
#####  multi gpus test  #####
 | 
			
		||||
 | 
			
		||||
- label: Distributed Comm Ops Test # 7min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 2
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -596,7 +469,6 @@ steps:
 | 
			
		||||
  - pytest -v -s distributed/test_shm_broadcast.py
 | 
			
		||||
 | 
			
		||||
- label: 2 Node Tests (4 GPUs in total) # 16min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 2
 | 
			
		||||
  num_nodes: 2
 | 
			
		||||
@ -615,7 +487,7 @@ steps:
 | 
			
		||||
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
 | 
			
		||||
 | 
			
		||||
- label: Distributed Tests (2 GPUs) # 40min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  #mirror_hardwares: [amd]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 2
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -629,39 +501,34 @@ steps:
 | 
			
		||||
  - vllm/worker/worker.py
 | 
			
		||||
  - vllm/worker/model_runner.py
 | 
			
		||||
  - entrypoints/llm/test_collective_rpc.py
 | 
			
		||||
  - tests/v1/test_async_llm_dp.py
 | 
			
		||||
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
 | 
			
		||||
  - vllm/v1/engine/
 | 
			
		||||
  commands:
 | 
			
		||||
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
 | 
			
		||||
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
 | 
			
		||||
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
 | 
			
		||||
  - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
 | 
			
		||||
  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
 | 
			
		||||
  - pytest -v -s ./compile/test_basic_correctness.py
 | 
			
		||||
  - pytest -v -s ./compile/test_wrapper.py
 | 
			
		||||
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
 | 
			
		||||
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
 | 
			
		||||
  # Avoid importing model tests that cause CUDA reinitialization error
 | 
			
		||||
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
 | 
			
		||||
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
 | 
			
		||||
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
 | 
			
		||||
  # test sequence parallel
 | 
			
		||||
  - pytest -v -s distributed/test_sequence_parallel.py
 | 
			
		||||
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
 | 
			
		||||
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
 | 
			
		||||
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
 | 
			
		||||
  # this test fails consistently.
 | 
			
		||||
  # TODO: investigate and fix
 | 
			
		||||
  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
 | 
			
		||||
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
 | 
			
		||||
  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
 | 
			
		||||
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 | 
			
		||||
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
 | 
			
		||||
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 | 
			
		||||
 | 
			
		||||
- label: Plugin Tests (2 GPUs) # 40min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 2
 | 
			
		||||
  fast_check: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/plugins/
 | 
			
		||||
  - tests/plugins/
 | 
			
		||||
  commands:
 | 
			
		||||
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
 | 
			
		||||
  # begin platform plugin tests, all the code in-between runs on dummy platform
 | 
			
		||||
  - pip install -e ./plugins/vllm_add_dummy_platform
 | 
			
		||||
  - pytest -v -s plugins_tests/test_platform_plugins.py
 | 
			
		||||
  - pip uninstall vllm_add_dummy_platform -y
 | 
			
		||||
@ -672,10 +539,8 @@ steps:
 | 
			
		||||
  - pytest -v -s distributed/test_distributed_oot.py
 | 
			
		||||
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
 | 
			
		||||
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
 | 
			
		||||
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
 | 
			
		||||
 | 
			
		||||
- label: Multi-step Tests (4 GPUs) # 36min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 4
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -696,7 +561,6 @@ steps:
 | 
			
		||||
  - pytest -v -s multi_step/test_correctness_llm.py
 | 
			
		||||
 | 
			
		||||
- label: Pipeline Parallelism Test # 45min
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 4
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -710,7 +574,6 @@ steps:
 | 
			
		||||
  - pytest -v -s distributed/test_pipeline_parallel.py
 | 
			
		||||
 | 
			
		||||
- label: LoRA TP Test (Distributed)
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  num_gpus: 4
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/lora
 | 
			
		||||
@ -719,14 +582,16 @@ steps:
 | 
			
		||||
    # FIXIT: find out which code initializes CUDA before running the test
    # until that is fixed, we need to use spawn to run the test
 | 
			
		||||
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
			
		||||
    # This test runs llama 13B, so it is required to run on 4 GPUs.
 | 
			
		||||
    - pytest -v -s -x lora/test_long_context.py
 | 
			
		||||
    # There is some Tensor Parallelism related processing logic in LoRA that
 | 
			
		||||
    # requires multi-GPU testing for validation.
 | 
			
		||||
    - pytest -v -s -x lora/test_chatglm3_tp.py
 | 
			
		||||
    - pytest -v -s -x lora/test_llama_tp.py
 | 
			
		||||
    - pytest -v -s -x lora/test_minicpmv_tp.py
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
- label: Weight Loading Multiple GPU Test  # 33min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 2
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -736,7 +601,6 @@ steps:
 | 
			
		||||
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 | 
			
		||||
 | 
			
		||||
- label: Weight Loading Multiple GPU Test - Large Models # optional
 | 
			
		||||
  mirror_hardwares: [amdexperimental] 
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 2
 | 
			
		||||
  gpu: a100
 | 
			
		||||
@ -775,4 +639,4 @@ steps:
 | 
			
		||||
  - vllm/model_executor/layers/quantization
 | 
			
		||||
  commands:
 | 
			
		||||
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
			
		||||
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 | 
			
		||||
  - bash ./run-tests.sh -c configs/models-large.txt -t 4
 | 
			
		||||
 | 
			
		||||
@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 | 
			
		||||
if [[ $normal_wheel == *"cu118"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu118, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu118 wheels"
 | 
			
		||||
elif [[ $normal_wheel == *"cu126"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu126, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu126 wheels"
 | 
			
		||||
elif [[ $normal_wheel == *"cu121"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu121, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu121 wheels"
 | 
			
		||||
else
 | 
			
		||||
    # only upload index.html for cu128 wheels (default wheels)
 | 
			
		||||
    # only upload index.html for cu124 wheels (default wheels)
 | 
			
		||||
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
 | 
			
		||||
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 | 
			
		||||
fi
 | 
			
		||||
@ -66,13 +66,12 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 | 
			
		||||
if [[ $normal_wheel == *"cu118"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu118, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu118 wheels"
 | 
			
		||||
elif [[ $normal_wheel == *"cu126"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu126, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu126 wheels"
 | 
			
		||||
elif [[ $normal_wheel == *"cu121"* ]]; then
 | 
			
		||||
    # if $normal_wheel matches cu121, do not upload the index.html
 | 
			
		||||
    echo "Skipping index files for cu121 wheels"
 | 
			
		||||
else
 | 
			
		||||
    # only upload index.html for cu128 wheels (default wheels)
 | 
			
		||||
    # only upload index.html for cu124 wheels (default wheels)
 | 
			
		||||
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
 | 
			
		||||
aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
 | 
			
		||||
							
								
								
									
36  .github/CODEOWNERS  vendored
@ -10,41 +10,27 @@
 | 
			
		||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 | 
			
		||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 | 
			
		||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
 | 
			
		||||
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
 | 
			
		||||
/vllm/model_executor/guided_decoding @mgoin
 | 
			
		||||
/vllm/multimodal @DarkLight1337 @ywang96
 | 
			
		||||
/vllm/vllm_flash_attn @LucasWilkinson
 | 
			
		||||
/vllm/lora @jeejeelee
 | 
			
		||||
/vllm/reasoning @aarnphm
 | 
			
		||||
/vllm/entrypoints @aarnphm
 | 
			
		||||
CMakeLists.txt @tlrmchlsmth
 | 
			
		||||
 | 
			
		||||
# vLLM V1
 | 
			
		||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 | 
			
		||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm
 | 
			
		||||
 | 
			
		||||
# Test ownership
 | 
			
		||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
 | 
			
		||||
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 | 
			
		||||
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 | 
			
		||||
/tests/test_inputs.py @DarkLight1337 @ywang96
 | 
			
		||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
 | 
			
		||||
/tests/models @DarkLight1337 @ywang96
 | 
			
		||||
/tests/multimodal @DarkLight1337 @ywang96
 | 
			
		||||
/tests/prefix_caching @comaniac @KuntaiDu
 | 
			
		||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
 | 
			
		||||
/tests/kernels @tlrmchlsmth @WoosukKwon
 | 
			
		||||
/tests/quantization @mgoin @robertgshaw2-redhat
 | 
			
		||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
 | 
			
		||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
 | 
			
		||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
 | 
			
		||||
/tests/distributed/test_same_node.py @youkaichao
 | 
			
		||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
 | 
			
		||||
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
 | 
			
		||||
/tests/kernels @tlrmchlsmth @WoosukKwon
 | 
			
		||||
/tests/model_executor/test_guided_processors.py @mgoin @russellb
 | 
			
		||||
/tests/models @DarkLight1337 @ywang96
 | 
			
		||||
/tests/multi_step @alexm-redhat @comaniac
 | 
			
		||||
/tests/multimodal @DarkLight1337 @ywang96
 | 
			
		||||
/tests/prefix_caching @comaniac @KuntaiDu
 | 
			
		||||
/tests/quantization @mgoin @robertgshaw2-redhat
 | 
			
		||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
 | 
			
		||||
/tests/test_inputs.py @DarkLight1337 @ywang96
 | 
			
		||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 | 
			
		||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
 | 
			
		||||
/tests/weight_loading @mgoin @youkaichao
 | 
			
		||||
/tests/lora @jeejeelee
 | 
			
		||||
 | 
			
		||||
# Docs
 | 
			
		||||
/docs @hmellor
 | 
			
		||||
mkdocs.yaml @hmellor
 | 
			
		||||
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
 | 
			
		||||
 | 
			
		||||
							
								
								
									
2  .github/ISSUE_TEMPLATE/200-installation.yml  vendored
@ -14,7 +14,7 @@ body:
 | 
			
		||||
    description: |
 | 
			
		||||
      Please run the following and paste the output below.
 | 
			
		||||
      ```sh
 | 
			
		||||
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
 | 
			
		||||
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
 | 
			
		||||
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
 | 
			
		||||
      python collect_env.py
 | 
			
		||||
      ```
 | 
			
		||||
 | 
			
		||||
							
								
								
									
2  .github/ISSUE_TEMPLATE/300-usage.yml  vendored
@ -14,7 +14,7 @@ body:
 | 
			
		||||
    description: |
 | 
			
		||||
      Please run the following and paste the output below.
 | 
			
		||||
      ```sh
 | 
			
		||||
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
 | 
			
		||||
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
 | 
			
		||||
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
 | 
			
		||||
      python collect_env.py
 | 
			
		||||
      ```
 | 
			
		||||
 | 
			
		||||
							
								
								
									
22  .github/ISSUE_TEMPLATE/400-bug-report.yml  vendored
@ -8,30 +8,20 @@ body:
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: >
 | 
			
		||||
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: |
 | 
			
		||||
      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
 | 
			
		||||
      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
 | 
			
		||||
      - Passwords or authentication credentials
 | 
			
		||||
      - Private URLs or endpoints
 | 
			
		||||
      - Personal or confidential data
 | 
			
		||||
      
 | 
			
		||||
      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: Your current environment
 | 
			
		||||
    description: |
 | 
			
		||||
      Please run the following and paste the output below.
 | 
			
		||||
      ```sh
 | 
			
		||||
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
 | 
			
		||||
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
 | 
			
		||||
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
 | 
			
		||||
      python collect_env.py
 | 
			
		||||
      ```
 | 
			
		||||
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
 | 
			
		||||
    value: |
 | 
			
		||||
      <details>
 | 
			
		||||
      <summary>The output of <code>python collect_env.py</code></summary>
 | 
			
		||||
      <summary>The output of `python collect_env.py`</summary>
 | 
			
		||||
 | 
			
		||||
      ```text
 | 
			
		||||
      Your output of `python collect_env.py` here
 | 
			
		||||
@ -85,20 +75,20 @@ body:
 | 
			
		||||
      ```
 | 
			
		||||
 | 
			
		||||
      ```
 | 
			
		||||
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
 | 
			
		||||
      The error message you got, with the full traceback.
 | 
			
		||||
      ```
 | 
			
		||||
  validations:
 | 
			
		||||
    required: true
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: |
 | 
			
		||||
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
 | 
			
		||||
    value: >
 | 
			
		||||
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
 | 
			
		||||
 | 
			
		||||
      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
 | 
			
		||||
 | 
			
		||||
      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
 | 
			
		||||
 | 
			
		||||
      Thanks for reporting 🙏!
 | 
			
		||||
      Thanks for contributing 🎉!
 | 
			
		||||
- type: checkboxes
 | 
			
		||||
  id: askllm
 | 
			
		||||
  attributes:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
69  .github/ISSUE_TEMPLATE/450-ci-failure.yml  vendored
@ -1,69 +0,0 @@
 | 
			
		||||
name: 🧪 CI failure report
 | 
			
		||||
description: Report a failing test.
 | 
			
		||||
title: "[CI Failure]: "
 | 
			
		||||
labels: ["ci-failure"]
 | 
			
		||||
 | 
			
		||||
body:
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: >
 | 
			
		||||
      #### Include the name of the failing Buildkite step and test file in the title.
 | 
			
		||||
- type: input
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: Name of failing test
 | 
			
		||||
    description: |
 | 
			
		||||
      Paste in the fully-qualified name of the failing test from the logs.
 | 
			
		||||
    placeholder: |
 | 
			
		||||
      `path/to/test_file.py::test_name[params]`
 | 
			
		||||
  validations:
 | 
			
		||||
    required: true
 | 
			
		||||
- type: checkboxes
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: Basic information
 | 
			
		||||
    description: Select all items that apply to the failing test.
 | 
			
		||||
    options:
 | 
			
		||||
      - label: Flaky test
 | 
			
		||||
      - label: Can reproduce locally
 | 
			
		||||
      - label: Caused by external libraries (e.g. bug in `transformers`)
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: 🧪 Describe the failing test
 | 
			
		||||
    description: |
 | 
			
		||||
      Please provide a clear and concise description of the failing test.
 | 
			
		||||
    placeholder: |
 | 
			
		||||
      A clear and concise description of the failing test.
 | 
			
		||||
  
 | 
			
		||||
      ```
 | 
			
		||||
      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
 | 
			
		||||
      ```
 | 
			
		||||
  validations:
 | 
			
		||||
    required: true
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: 📝 History of failing test
 | 
			
		||||
    description: |
 | 
			
		||||
      Since when did the test start to fail?
 | 
			
		||||
      You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
 | 
			
		||||
 | 
			
		||||
      If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
 | 
			
		||||
 | 
			
		||||
      - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
 | 
			
		||||
 | 
			
		||||
      - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
 | 
			
		||||
 | 
			
		||||
      - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
 | 
			
		||||
    placeholder: |
 | 
			
		||||
      Approximate timeline and/or problematic PRs
 | 
			
		||||
 | 
			
		||||
      A link to the Buildkite analytics of the failing test (if available)
 | 
			
		||||
  validations:
 | 
			
		||||
    required: true
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: CC List.
 | 
			
		||||
    description: >
 | 
			
		||||
      The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: >
 | 
			
		||||
      Thanks for reporting 🙏!
 | 
			
		||||
							
								
								
									
2  .github/ISSUE_TEMPLATE/600-new-model.yml  vendored
@ -9,7 +9,7 @@ body:
 | 
			
		||||
    value: >
 | 
			
		||||
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 | 
			
		||||
 | 
			
		||||
      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
 | 
			
		||||
      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: The model to consider.
 | 
			
		||||
 | 
			
		||||
@ -35,7 +35,7 @@ body:
 | 
			
		||||
    description: |
 | 
			
		||||
      Please run the following and paste the output below.
 | 
			
		||||
      ```sh
 | 
			
		||||
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
 | 
			
		||||
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
 | 
			
		||||
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
 | 
			
		||||
      python collect_env.py
 | 
			
		||||
      ```
 | 
			
		||||
 | 
			
		||||
							
								
								
									
28  .github/ISSUE_TEMPLATE/800-misc-discussion.yml  vendored  Normal file
@ -0,0 +1,28 @@
 | 
			
		||||
name: 🎲 Misc/random discussions that do not fit into the above categories.
 | 
			
		||||
description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
 | 
			
		||||
title: "[Misc]: "
 | 
			
		||||
labels: ["misc"]
 | 
			
		||||
 | 
			
		||||
body:
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: >
 | 
			
		||||
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
 | 
			
		||||
- type: textarea
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: Anything you want to discuss about vllm.
 | 
			
		||||
    description: >
 | 
			
		||||
      Anything you want to discuss about vllm.
 | 
			
		||||
  validations:
 | 
			
		||||
    required: true
 | 
			
		||||
- type: markdown
 | 
			
		||||
  attributes:
 | 
			
		||||
    value: >
 | 
			
		||||
      Thanks for contributing 🎉!
 | 
			
		||||
- type: checkboxes
 | 
			
		||||
  id: askllm
 | 
			
		||||
  attributes:
 | 
			
		||||
    label: Before submitting a new issue...
 | 
			
		||||
    options:
 | 
			
		||||
      - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
 | 
			
		||||
        required: true
 | 
			
		||||
							
								
								
									
4  .github/ISSUE_TEMPLATE/config.yml  vendored
@ -1,5 +1 @@
 | 
			
		||||
blank_issues_enabled: false
 | 
			
		||||
contact_links:
 | 
			
		||||
  - name: Questions
 | 
			
		||||
    url: https://discuss.vllm.ai
 | 
			
		||||
    about: Ask questions and discuss with other vLLM community members
 | 
			
		||||
 | 
			
		||||
							
								
								
									
18  .github/PULL_REQUEST_TEMPLATE.md  vendored
@ -1,18 +1,6 @@
 | 
			
		||||
## Essential Elements of an Effective PR Description Checklist
 | 
			
		||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
 | 
			
		||||
- [ ] The test plan, such as providing test command.
 | 
			
		||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
 | 
			
		||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
 | 
			
		||||
FILL IN THE PR DESCRIPTION HERE
 | 
			
		||||
 | 
			
		||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
 | 
			
		||||
 | 
			
		||||
## Purpose
 | 
			
		||||
 | 
			
		||||
## Test Plan
 | 
			
		||||
 | 
			
		||||
## Test Result
 | 
			
		||||
 | 
			
		||||
## (Optional) Documentation Update
 | 
			
		||||
FIX #xxxx (*link existing issues this PR will resolve*)
 | 
			
		||||
 | 
			
		||||
<!--- pyml disable-next-line no-emphasis-as-heading -->
 | 
			
		||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
 | 
			
		||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
 | 
			
		||||
 | 
			
		||||
							
								
								
									
2  .github/dependabot.yml  vendored
@ -23,7 +23,7 @@ updates:
 | 
			
		||||
      - dependency-name: "lm-format-enforcer"
 | 
			
		||||
      - dependency-name: "gguf"
 | 
			
		||||
      - dependency-name: "compressed-tensors"
 | 
			
		||||
      - dependency-name: "ray[cgraph]" # Ray Compiled Graph
 | 
			
		||||
      - dependency-name: "ray[adag]"
 | 
			
		||||
      - dependency-name: "lm-eval"
 | 
			
		||||
    groups:
 | 
			
		||||
      minor-update:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
139  .github/mergify.yml  vendored
@ -19,7 +19,7 @@ pull_request_rules:
 | 
			
		||||
      - files~=\.buildkite/
 | 
			
		||||
      - files~=^cmake/
 | 
			
		||||
      - files=CMakeLists.txt
 | 
			
		||||
      - files~=^docker/Dockerfile
 | 
			
		||||
      - files~=^Dockerfile
 | 
			
		||||
      - files~=^requirements.*\.txt
 | 
			
		||||
      - files=setup.py
 | 
			
		||||
  actions:
 | 
			
		||||
@ -36,87 +36,15 @@ pull_request_rules:
 | 
			
		||||
      add:
 | 
			
		||||
        - frontend
 | 
			
		||||
 | 
			
		||||
- name: label-llama
 | 
			
		||||
  description: Automatically apply llama label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^examples/.*llama.*\.py
 | 
			
		||||
      - files~=^tests/.*llama.*\.py
 | 
			
		||||
      - files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
 | 
			
		||||
      - files~=^vllm/model_executor/models/.*llama.*\.py
 | 
			
		||||
      - files~=^vllm/transformers_utils/configs/.*llama.*\.py
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - llama
 | 
			
		||||
 | 
			
		||||
- name: label-multi-modality
 | 
			
		||||
  description: Automatically apply multi-modality label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^vllm/multimodal/
 | 
			
		||||
      - files~=^tests/multimodal/
 | 
			
		||||
      - files~=^tests/models/multimodal/
 | 
			
		||||
      - files~=^tests/models/*/audio_language/
 | 
			
		||||
      - files~=^tests/models/*/vision_language/
 | 
			
		||||
      - files=tests/models/test_vision.py
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - multi-modality
 | 
			
		||||
 | 
			
		||||
- name: label-qwen
 | 
			
		||||
  description: Automatically apply qwen label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^examples/.*qwen.*\.py
 | 
			
		||||
      - files~=^tests/.*qwen.*\.py
 | 
			
		||||
      - files~=^vllm/model_executor/models/.*qwen.*\.py
 | 
			
		||||
      - files~=^vllm/reasoning/.*qwen.*\.py
 | 
			
		||||
      - title~=(?i)Qwen
 | 
			
		||||
      - body~=(?i)Qwen
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - qwen
 | 
			
		||||
 | 
			
		||||
- name: label-rocm
 | 
			
		||||
  description: Automatically apply rocm label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^csrc/rocm/
 | 
			
		||||
      - files~=^docker/Dockerfile.rocm
 | 
			
		||||
      - files~=^requirements/rocm.*\.txt
 | 
			
		||||
      - files~=^vllm/attention/backends/rocm.*\.py
 | 
			
		||||
      - files~=^vllm/attention/ops/rocm.*\.py
 | 
			
		||||
      - files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
 | 
			
		||||
      - files~=^vllm/v1/attention/backends/mla/rocm.*\.py
 | 
			
		||||
      - files~=^tests/kernels/.*_rocm.*\.py
 | 
			
		||||
      - files=vllm/platforms/rocm.py
 | 
			
		||||
      - title~=(?i)AMD
 | 
			
		||||
      - title~=(?i)ROCm
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - rocm
 | 
			
		||||
 | 
			
		||||
- name: label-structured-output
 | 
			
		||||
  description: Automatically apply structured-output label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^benchmarks/structured_schemas/
 | 
			
		||||
      - files=benchmarks/benchmark_serving_structured_output.py
 | 
			
		||||
      - files=benchmarks/run_structured_output_benchmark.sh
 | 
			
		||||
      - files=docs/features/structured_outputs.md
 | 
			
		||||
      - files=examples/offline_inference/structured_outputs.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
 | 
			
		||||
      - files~=^vllm/model_executor/guided_decoding/
 | 
			
		||||
      - files=tests/model_executor/test_guided_processors.py
 | 
			
		||||
      - files=tests/entrypoints/llm/test_guided_generate.py
 | 
			
		||||
      - files~=^tests/v1/structured_output/
 | 
			
		||||
      - files=tests/v1/entrypoints/llm/test_guided_generate.py
 | 
			
		||||
      - files~=^vllm/v1/structured_output/
 | 
			
		||||
      - files=benchmarks/benchmark_serving_guided.py
 | 
			
		||||
      - files=benchmarks/benchmark_guided.py
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
@ -145,56 +73,6 @@ pull_request_rules:
 | 
			
		||||
      add:
 | 
			
		||||
        - v1
 | 
			
		||||
 | 
			
		||||
- name: label-tpu
 | 
			
		||||
  description: Automatically apply tpu label
 | 
			
		||||
  # Keep this list in sync with `label-tpu-remove` conditions
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=tpu.py
 | 
			
		||||
      - files~=_tpu
 | 
			
		||||
      - files~=tpu_
 | 
			
		||||
      - files~=/tpu/
 | 
			
		||||
      - files~=pallas
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - tpu
 | 
			
		||||
 | 
			
		||||
- name: label-tpu-remove
 | 
			
		||||
  description: Automatically remove tpu label
 | 
			
		||||
  # Keep this list in sync with `label-tpu` conditions
 | 
			
		||||
  conditions:
 | 
			
		||||
    - and:
 | 
			
		||||
      - -files~=tpu.py
 | 
			
		||||
      - -files~=_tpu
 | 
			
		||||
      - -files~=tpu_
 | 
			
		||||
      - -files~=/tpu/
 | 
			
		||||
      - -files~=pallas
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      remove:
 | 
			
		||||
        - tpu
 | 
			
		||||
 | 
			
		||||
- name: label-tool-calling
 | 
			
		||||
  description: Automatically add tool-calling label
 | 
			
		||||
  conditions:
 | 
			
		||||
    - or:
 | 
			
		||||
      - files~=^tests/tool_use/
 | 
			
		||||
      - files~=^tests/mistral_tool_use/
 | 
			
		||||
      - files~=^tests/entrypoints/openai/tool_parsers/
 | 
			
		||||
      - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
 | 
			
		||||
      - files~=^vllm/entrypoints/openai/tool_parsers/
 | 
			
		||||
      - files=docs/features/tool_calling.md
 | 
			
		||||
      - files~=^examples/tool_chat_*
 | 
			
		||||
      - files=examples/offline_inference/chat_with_tools.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
 | 
			
		||||
      - files=examples/online_serving/openai_chat_completion_client_with_tools.py
 | 
			
		||||
  actions:
 | 
			
		||||
    label:
 | 
			
		||||
      add:
 | 
			
		||||
        - tool-calling
 | 
			
		||||
 | 
			
		||||
- name: ping author on conflicts and add 'needs-rebase' label
 | 
			
		||||
  conditions:
 | 
			
		||||
      - conflict
 | 
			
		||||
@ -210,17 +88,6 @@ pull_request_rules:
 | 
			
		||||
 | 
			
		||||
       https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
 | 
			
		||||
 | 
			
		||||
- name: assign reviewer for tensorizer changes
 | 
			
		||||
  conditions:
 | 
			
		||||
      - files~=^vllm/model_executor/model_loader/tensorizer.py
 | 
			
		||||
      - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
 | 
			
		||||
      - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
 | 
			
		||||
      - files~=^tests/tensorizer_loader/
 | 
			
		||||
  actions:
 | 
			
		||||
    assign:
 | 
			
		||||
      users:
 | 
			
		||||
        - "sangstar"
 | 
			
		||||
 | 
			
		||||
- name: remove 'needs-rebase' label when conflict is resolved
 | 
			
		||||
  conditions:
 | 
			
		||||
      - -conflict
 | 
			
		||||
 | 
			
		||||
							
								
								
									
2  .github/scripts/cleanup_pr_body.sh  vendored
@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
 | 
			
		||||
 | 
			
		||||
# Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
 | 
			
		||||
python3 - <<EOF
 | 
			
		||||
import regex as re
 | 
			
		||||
import re
 | 
			
		||||
 | 
			
		||||
with open("${NEW}", "r") as file:
 | 
			
		||||
    content = file.read()
 | 
			
		||||
 | 
			
		||||
							
								
								
									
2  .github/workflows/add_label_automerge.yml  vendored
@ -1,6 +1,4 @@
 | 
			
		||||
name: Add label on auto-merge enabled
 | 
			
		||||
permissions:
 | 
			
		||||
    pull-requests: write
 | 
			
		||||
on:
 | 
			
		||||
    pull_request_target:
 | 
			
		||||
        types:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
7  .github/workflows/cleanup_pr_body.yml  vendored
@ -20,12 +20,7 @@ jobs:
 | 
			
		||||
        with:
 | 
			
		||||
          python-version: '3.12'
 | 
			
		||||
 | 
			
		||||
      - name: Install Python dependencies
 | 
			
		||||
        run: |
 | 
			
		||||
          python3 -m pip install --upgrade pip
 | 
			
		||||
          python3 -m pip install regex
 | 
			
		||||
 | 
			
		||||
      - name: Update PR description
 | 
			
		||||
        env:
 | 
			
		||||
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 | 
			
		||||
        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
 | 
			
		||||
        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
 | 
			
		||||
 | 
			
		||||
							
								
								
									
5  .github/workflows/lint-and-deploy.yaml  vendored
@ -2,9 +2,6 @@ name: Lint and Deploy Charts
 | 
			
		||||
 | 
			
		||||
on: pull_request
 | 
			
		||||
 | 
			
		||||
permissions:
 | 
			
		||||
  contents: read
 | 
			
		||||
 | 
			
		||||
jobs:
 | 
			
		||||
  lint-and-deploy:
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
@ -53,7 +50,7 @@ jobs:
 | 
			
		||||
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
 | 
			
		||||
 | 
			
		||||
      - name: Build the Docker image vllm cpu
 | 
			
		||||
        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
 | 
			
		||||
        run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
 | 
			
		||||
 | 
			
		||||
      - name: Configuration of docker images, network and namespace for the kind cluster
 | 
			
		||||
        run: |
 | 
			
		||||
 | 
			
		||||
							
								
								
									
3  .github/workflows/pre-commit.yml  vendored
@ -5,9 +5,6 @@ on:
 | 
			
		||||
  push:
 | 
			
		||||
    branches: [main]
 | 
			
		||||
 | 
			
		||||
permissions:
 | 
			
		||||
  contents: read
 | 
			
		||||
 | 
			
		||||
jobs:
 | 
			
		||||
  pre-commit:
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
 | 
			
		||||
							
								
								
									
4  .github/workflows/publish.yml  vendored
@ -39,7 +39,7 @@ jobs:
 | 
			
		||||
            const script = require('.github/workflows/scripts/create_release.js')
 | 
			
		||||
            await script(github, context, core)
 | 
			
		||||
 | 
			
		||||
  # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. 
 | 
			
		||||
  # NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow. 
 | 
			
		||||
  # wheel:
 | 
			
		||||
  #   name: Build Wheel
 | 
			
		||||
  #   runs-on: ${{ matrix.os }}
 | 
			
		||||
@ -50,7 +50,7 @@ jobs:
 | 
			
		||||
  #     matrix:
 | 
			
		||||
  #         os: ['ubuntu-20.04']
 | 
			
		||||
  #         python-version: ['3.9', '3.10', '3.11', '3.12']
 | 
			
		||||
  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
 | 
			
		||||
  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements-cuda.txt.
 | 
			
		||||
  #         cuda-version: ['11.8', '12.1']
 | 
			
		||||
 | 
			
		||||
  #   steps:
 | 
			
		||||
 | 
			
		||||
							
								
								
									
2  .github/workflows/reminder_comment.yml  vendored
@ -1,6 +1,4 @@
 | 
			
		||||
name: PR Reminder Comment Bot
 | 
			
		||||
permissions:
 | 
			
		||||
  pull-requests: write
 | 
			
		||||
on:
 | 
			
		||||
  pull_request_target:
 | 
			
		||||
    types: [opened]
 | 
			
		||||
 | 
			
		||||
							
								
								
									
2  .github/workflows/scripts/build.sh  vendored
@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
 | 
			
		||||
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 | 
			
		||||
 | 
			
		||||
# Install requirements
 | 
			
		||||
$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
 | 
			
		||||
$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
 | 
			
		||||
 | 
			
		||||
# Limit the number of parallel jobs to avoid OOM
 | 
			
		||||
export MAX_JOBS=1
 | 
			
		||||
 | 
			
		||||
							
								
								
									
2  .github/workflows/scripts/create_release.js  vendored
@ -1,4 +1,4 @@
 | 
			
		||||
// Uses GitHub's API to create the release and wait for result.
 | 
			
		||||
// Uses Github's API to create the release and wait for result.
 | 
			
		||||
// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
 | 
			
		||||
 | 
			
		||||
module.exports = async (github, context, core) => {
 | 
			
		||||
 | 
			
		||||
							
								
								
									
12  .gitignore  vendored
@ -2,7 +2,7 @@
 | 
			
		||||
/vllm/_version.py
 | 
			
		||||
 | 
			
		||||
# vllm-flash-attn built from source
 | 
			
		||||
vllm/vllm_flash_attn/*
 | 
			
		||||
vllm/vllm_flash_attn/
 | 
			
		||||
 | 
			
		||||
# Byte-compiled / optimized / DLL files
 | 
			
		||||
__pycache__/
 | 
			
		||||
@ -77,6 +77,10 @@ instance/
 | 
			
		||||
# Scrapy stuff:
 | 
			
		||||
.scrapy
 | 
			
		||||
 | 
			
		||||
# Sphinx documentation
 | 
			
		||||
docs/_build/
 | 
			
		||||
docs/source/getting_started/examples/
 | 
			
		||||
 | 
			
		||||
# PyBuilder
 | 
			
		||||
.pybuilder/
 | 
			
		||||
target/
 | 
			
		||||
@ -146,7 +150,6 @@ venv.bak/
 | 
			
		||||
 | 
			
		||||
# mkdocs documentation
 | 
			
		||||
/site
 | 
			
		||||
docs/examples
 | 
			
		||||
 | 
			
		||||
# mypy
 | 
			
		||||
.mypy_cache/
 | 
			
		||||
@ -194,11 +197,8 @@ _build/
 | 
			
		||||
hip_compat.h
 | 
			
		||||
 | 
			
		||||
# Benchmark dataset
 | 
			
		||||
benchmarks/**/*.json
 | 
			
		||||
benchmarks/*.json
 | 
			
		||||
 | 
			
		||||
# Linting
 | 
			
		||||
actionlint
 | 
			
		||||
shellcheck*/
 | 
			
		||||
 | 
			
		||||
# Ignore moe/marlin_moe gen code
 | 
			
		||||
csrc/moe/marlin_moe_wna16/kernel_*
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,3 @@
 | 
			
		||||
default_install_hook_types:
 | 
			
		||||
  - pre-commit
 | 
			
		||||
  - commit-msg
 | 
			
		||||
default_stages:
 | 
			
		||||
  - pre-commit # Run locally
 | 
			
		||||
  - manual # Run in CI
 | 
			
		||||
@ -11,46 +8,44 @@ repos:
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: yapf
 | 
			
		||||
    args: [--in-place, --verbose]
 | 
			
		||||
    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
 | 
			
		||||
    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 | 
			
		||||
    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 | 
			
		||||
- repo: https://github.com/astral-sh/ruff-pre-commit
 | 
			
		||||
  rev: v0.11.7
 | 
			
		||||
  rev: v0.9.3
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: ruff
 | 
			
		||||
    args: [--output-format, github, --fix]
 | 
			
		||||
  - id: ruff-format
 | 
			
		||||
    files: ^(.buildkite|benchmarks|examples)/.*
 | 
			
		||||
- repo: https://github.com/crate-ci/typos
 | 
			
		||||
  rev: v1.32.0
 | 
			
		||||
- repo: https://github.com/codespell-project/codespell
 | 
			
		||||
  rev: v2.4.0
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: typos
 | 
			
		||||
  - id: codespell
 | 
			
		||||
    additional_dependencies: ['tomli']
 | 
			
		||||
    args: ['--toml', 'pyproject.toml']
 | 
			
		||||
- repo: https://github.com/PyCQA/isort
 | 
			
		||||
  rev: 6.0.1
 | 
			
		||||
  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: isort
 | 
			
		||||
- repo: https://github.com/pre-commit/mirrors-clang-format
 | 
			
		||||
  rev: v20.1.3
 | 
			
		||||
  rev: v19.1.7
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: clang-format
 | 
			
		||||
    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
 | 
			
		||||
    types_or: [c++, cuda]
 | 
			
		||||
    args: [--style=file, --verbose]
 | 
			
		||||
- repo: https://github.com/jackdewinter/pymarkdown
 | 
			
		||||
  rev: v0.9.29
 | 
			
		||||
  rev: v0.9.27
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: pymarkdown
 | 
			
		||||
    exclude: '.*\.inc\.md'
 | 
			
		||||
    args: [fix]
 | 
			
		||||
- repo: https://github.com/rhysd/actionlint
 | 
			
		||||
  rev: v1.7.7
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: actionlint
 | 
			
		||||
- repo: https://github.com/astral-sh/uv-pre-commit
 | 
			
		||||
  rev: 0.6.17
 | 
			
		||||
  rev: 0.6.2
 | 
			
		||||
  hooks:
 | 
			
		||||
    - id: pip-compile
 | 
			
		||||
      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
 | 
			
		||||
      files: ^requirements/test\.(in|txt)$
 | 
			
		||||
      args: [requirements-test.in, -o, requirements-test.txt]
 | 
			
		||||
      files: ^requirements-test\.(in|txt)$
 | 
			
		||||
- repo: local
 | 
			
		||||
  hooks:
 | 
			
		||||
  - id: mypy-local
 | 
			
		||||
@ -58,7 +53,7 @@ repos:
 | 
			
		||||
    entry: tools/mypy.sh 0 "local"
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
 | 
			
		||||
    additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
 | 
			
		||||
    stages: [pre-commit] # Don't run in CI
 | 
			
		||||
  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
 | 
			
		||||
    name: Run mypy for Python 3.9
 | 
			
		||||
@ -104,8 +99,8 @@ repos:
 | 
			
		||||
    args:
 | 
			
		||||
      - -c
 | 
			
		||||
      - |
 | 
			
		||||
        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
 | 
			
		||||
          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
 | 
			
		||||
        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
 | 
			
		||||
          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
 | 
			
		||||
        fi
 | 
			
		||||
    language: system
 | 
			
		||||
    verbose: true
 | 
			
		||||
@ -124,32 +119,6 @@ repos:
 | 
			
		||||
    language: system
 | 
			
		||||
    always_run: true
 | 
			
		||||
    pass_filenames: false
 | 
			
		||||
  - id: update-dockerfile-graph
 | 
			
		||||
    name: Update Dockerfile dependency graph
 | 
			
		||||
    entry: tools/update-dockerfile-graph.sh
 | 
			
		||||
    language: script
 | 
			
		||||
  - id: enforce-import-regex-instead-of-re
 | 
			
		||||
    name: Enforce import regex as re
 | 
			
		||||
    entry: python tools/enforce_regex_import.py
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
    pass_filenames: false
 | 
			
		||||
    additional_dependencies: [regex]
 | 
			
		||||
  # forbid directly import triton
 | 
			
		||||
  - id: forbid-direct-triton-import
 | 
			
		||||
    name: "Forbid direct 'import triton'"
 | 
			
		||||
    entry: python tools/check_triton_import.py
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
    pass_filenames: false
 | 
			
		||||
    additional_dependencies: [regex]
 | 
			
		||||
  - id: check-pickle-imports
 | 
			
		||||
    name: Prevent new pickle/cloudpickle imports
 | 
			
		||||
    entry: python tools/check_pickle_imports.py
 | 
			
		||||
    language: python
 | 
			
		||||
    types: [python]
 | 
			
		||||
    pass_filenames: false
 | 
			
		||||
    additional_dependencies: [pathspec, regex]
 | 
			
		||||
  # Keep `suggestion` last
 | 
			
		||||
  - id: suggestion
 | 
			
		||||
    name: Suggestion
 | 
			
		||||
 | 
			
		||||
@ -8,10 +8,14 @@ build:
 | 
			
		||||
  tools:
 | 
			
		||||
    python: "3.12"
 | 
			
		||||
 | 
			
		||||
mkdocs:
 | 
			
		||||
  configuration: mkdocs.yaml
 | 
			
		||||
sphinx:
 | 
			
		||||
  configuration: docs/source/conf.py
 | 
			
		||||
  fail_on_warning: true
 | 
			
		||||
 | 
			
		||||
# If using Sphinx, optionally build your docs in additional formats such as PDF
 | 
			
		||||
formats: []
 | 
			
		||||
 | 
			
		||||
# Optionally declare the Python requirements required to build your docs
 | 
			
		||||
python:
 | 
			
		||||
  install:
 | 
			
		||||
    - requirements: requirements/docs.txt
 | 
			
		||||
    - requirements: docs/requirements-docs.txt
 | 
			
		||||
 | 
			
		||||
							
								
								
									
317  CMakeLists.txt  Normal file → Executable file
@ -15,6 +15,7 @@ project(vllm_extensions LANGUAGES CXX)
 | 
			
		||||
 | 
			
		||||
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 | 
			
		||||
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 | 
			
		||||
 | 
			
		||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 | 
			
		||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
 | 
			
		||||
 | 
			
		||||
@ -23,17 +24,17 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 | 
			
		||||
# Suppress potential warnings about unused manually-specified variables
 | 
			
		||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
 | 
			
		||||
 | 
			
		||||
# Prevent installation of dependencies (cutlass) by default.
 | 
			
		||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Supported python versions.  These versions will be searched in order, the
 | 
			
		||||
# first match will be selected.  These should be kept in sync with setup.py.
 | 
			
		||||
#
 | 
			
		||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 | 
			
		||||
 | 
			
		||||
# Supported NVIDIA architectures.
 | 
			
		||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 | 
			
		||||
 | 
			
		||||
# Supported AMD GPU architectures.
 | 
			
		||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 | 
			
		||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Supported/expected torch versions for CUDA/ROCm.
 | 
			
		||||
@ -43,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 | 
			
		||||
#
 | 
			
		||||
# Note: the CUDA torch version is derived from pyproject.toml and various
 | 
			
		||||
# requirements.txt files and should be kept consistent.  The ROCm torch
 | 
			
		||||
# versions are derived from docker/Dockerfile.rocm
 | 
			
		||||
# versions are derived from Dockerfile.rocm
 | 
			
		||||
#
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Try to find python package with an executable that exactly matches
 | 
			
		||||
@ -79,15 +80,6 @@ endif()
 | 
			
		||||
#
 | 
			
		||||
find_package(Torch REQUIRED)
 | 
			
		||||
 | 
			
		||||
# Supported NVIDIA architectures.
 | 
			
		||||
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 | 
			
		||||
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
 | 
			
		||||
   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
 | 
			
		||||
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 | 
			
		||||
else()
 | 
			
		||||
  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Forward the non-CUDA device extensions to external CMake scripts.
 | 
			
		||||
#
 | 
			
		||||
@ -182,6 +174,9 @@ include(FetchContent)
 | 
			
		||||
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 | 
			
		||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Set rocm version dev int.
 | 
			
		||||
#
 | 
			
		||||
if(VLLM_GPU_LANG STREQUAL "HIP")
 | 
			
		||||
  #
 | 
			
		||||
  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
 | 
			
		||||
@ -189,6 +184,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
 | 
			
		||||
  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
 | 
			
		||||
  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  #
 | 
			
		||||
  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
 | 
			
		||||
  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
 | 
			
		||||
@ -231,35 +227,28 @@ endif()
 | 
			
		||||
#
 | 
			
		||||
 | 
			
		||||
set(VLLM_EXT_SRC
 | 
			
		||||
  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
 | 
			
		||||
  "csrc/mamba/causal_conv1d/causal_conv1d.cu"
 | 
			
		||||
  "csrc/cache_kernels.cu"
 | 
			
		||||
  "csrc/attention/paged_attention_v1.cu"
 | 
			
		||||
  "csrc/attention/paged_attention_v2.cu"
 | 
			
		||||
  "csrc/attention/merge_attn_states.cu"
 | 
			
		||||
  "csrc/attention/vertical_slash_index.cu"
 | 
			
		||||
  "csrc/pos_encoding_kernels.cu"
 | 
			
		||||
  "csrc/activation_kernels.cu"
 | 
			
		||||
  "csrc/layernorm_kernels.cu"
 | 
			
		||||
  "csrc/layernorm_quant_kernels.cu"
 | 
			
		||||
  "csrc/sampler.cu"
 | 
			
		||||
  "csrc/cuda_view.cu"
 | 
			
		||||
  "csrc/quantization/gptq/q_gemm.cu"
 | 
			
		||||
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
 | 
			
		||||
  "csrc/quantization/fp8/common.cu"
 | 
			
		||||
  "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
 | 
			
		||||
  "csrc/quantization/gguf/gguf_kernel.cu"
 | 
			
		||||
  "csrc/quantization/activation_kernels.cu"
 | 
			
		||||
  "csrc/cuda_utils_kernels.cu"
 | 
			
		||||
  "csrc/prepare_inputs/advance_step.cu"
 | 
			
		||||
  "csrc/custom_all_reduce.cu"
 | 
			
		||||
  "csrc/torch_bindings.cpp")
 | 
			
		||||
 | 
			
		||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
 | 
			
		||||
 | 
			
		||||
  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
 | 
			
		||||
  set(CUTLASS_REVISION "v3.9.2" CACHE STRING "CUTLASS revision to use")
 | 
			
		||||
  # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
 | 
			
		||||
  # Please keep this in sync with FetchContent_Declare line below.
 | 
			
		||||
  set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
 | 
			
		||||
 | 
			
		||||
  # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
 | 
			
		||||
  if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
 | 
			
		||||
@ -277,7 +266,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
        cutlass
 | 
			
		||||
        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
 | 
			
		||||
        # Please keep this in sync with CUTLASS_REVISION line above.
 | 
			
		||||
        GIT_TAG ${CUTLASS_REVISION}
 | 
			
		||||
        GIT_TAG v3.8.0
 | 
			
		||||
        GIT_PROGRESS TRUE
 | 
			
		||||
 | 
			
		||||
        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
 | 
			
		||||
@ -289,16 +278,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  FetchContent_MakeAvailable(cutlass)
 | 
			
		||||
 | 
			
		||||
  list(APPEND VLLM_EXT_SRC
 | 
			
		||||
    "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
 | 
			
		||||
    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
 | 
			
		||||
    "csrc/quantization/aqlm/gemm_kernels.cu"
 | 
			
		||||
    "csrc/quantization/awq/gemm_kernels.cu"
 | 
			
		||||
    "csrc/custom_all_reduce.cu"
 | 
			
		||||
    "csrc/permute_cols.cu"
 | 
			
		||||
    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
 | 
			
		||||
    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
 | 
			
		||||
    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
 | 
			
		||||
    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
 | 
			
		||||
    "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
 | 
			
		||||
    "csrc/cutlass_extensions/common.cpp"
 | 
			
		||||
    "csrc/attention/mla/cutlass_mla_entry.cu")
 | 
			
		||||
    "csrc/cutlass_extensions/common.cpp")
 | 
			
		||||
 | 
			
		||||
  set_gencode_flags_for_srcs(
 | 
			
		||||
    SRCS "${VLLM_EXT_SRC}"
 | 
			
		||||
@ -307,55 +297,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
  # Only build Marlin kernels if we are building for at least some compatible archs.
  # Keep building Marlin for 9.0 as there are some group sizes and shapes that
  # are not supported by Machete yet.
  # 9.0 for latest bf16 atomicAdd PTX
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
  if (MARLIN_ARCHS)

    #
    # For the Marlin kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
    set(MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
    file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)

    message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
    message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")

    if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=$PYTHONPATH
          ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
        RESULT_VARIABLE marlin_generation_result
        OUTPUT_VARIABLE marlin_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
      )

      if (NOT marlin_generation_result EQUAL 0)
        message(FATAL_ERROR "Marlin generation failed."
                            " Result: \"${marlin_generation_result}\""
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
      else()
        set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
            CACHE STRING "Last run Marlin generate script hash" FORCE)
        message(STATUS "Marlin generation completed successfully.")
      endif()
    else()
      message(STATUS "Marlin generation script has not changed, skipping generation.")
    endif()

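The hash-and-skip logic above only reruns the kernel generator when generate_kernels.py itself changes, caching the last-seen script hash in the CMake cache. A rough Python equivalent of that caching pattern (a hypothetical sketch, not part of the build; the stamp-file name is made up for illustration) looks like this:

import hashlib
import json
import subprocess
import sys
from pathlib import Path

GEN_SCRIPT = Path("csrc/quantization/gptq_marlin/generate_kernels.py")  # generator, as in the CMake above
STAMP = Path("build/.marlin_gen_hash.json")  # stands in for the cached MARLIN_GEN_SCRIPT_HASH entry

def file_md5(path: Path) -> str:
    # Same idea as CMake's file(MD5 ...): fingerprint the generator script.
    return hashlib.md5(path.read_bytes()).hexdigest()

def maybe_generate() -> None:
    current = file_md5(GEN_SCRIPT)
    previous = json.loads(STAMP.read_text())["hash"] if STAMP.exists() else None
    if previous == current:
        print("generation script has not changed, skipping generation")
        return
    result = subprocess.run([sys.executable, str(GEN_SCRIPT)])
    if result.returncode != 0:
        raise SystemExit("kernel generation failed")
    STAMP.parent.mkdir(parents=True, exist_ok=True)
    STAMP.write_text(json.dumps({"hash": current}))

if __name__ == "__main__":
    maybe_generate()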
    file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
      CUDA_ARCHS "${MARLIN_ARCHS}")

    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

    set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
@@ -372,89 +317,43 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                   " in CUDA target architectures")
  endif()

  # Only build AllSpark kernels if we are building for at least some compatible archs.
  cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
  if (ALLSPARK_ARCHS)
    set(ALLSPARK_SRCS
       "csrc/quantization/gptq_allspark/allspark_repack.cu"
       "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
    set_gencode_flags_for_srcs(
      SRCS "${ALLSPARK_SRCS}"
      CUDA_ARCHS "${ALLSPARK_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
    message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
  else()
    message(STATUS "Not building AllSpark kernels as no compatible archs found"
                   " in CUDA target architectures")
  endif()


  set(SCALED_MM_3X_ARCHS)
  # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
  # CUDA 12.0 or later
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
  # CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
    set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
    # Let scaled_mm_c2x know it doesn't need to build these arches
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running FP8 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()

  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
  # require CUDA 12.8 or later
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
    set(SRCS
      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
    # Let scaled_mm_c2x know it doesn't need to build these arches
    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                     "later if you intend on running FP8 quantized models on "
                     "Blackwell.")
    else()
      message(STATUS "Not building scaled_mm_c3x_sm100 as no compatible archs found "
                     "in CUDA target architectures")
    endif()
    # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
    # build any 3x kernels
    set(SCALED_MM_3X_ARCHS)
  endif()

  #
  # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
  # kernels for the remaining archs that are not already built for 3x.
  # (Build 8.9 for FP8)
  cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
    "7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
  # subtract out the archs that are already built for 3x
  list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
  if (SCALED_MM_2X_ARCHS)
@@ -479,18 +378,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # 2:4 Sparse Kernels

  # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
  # require CUDA 12.2 or later (and only work on Hopper).
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
  # require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
      message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                     "if you intend on running FP8 sparse quantized models on Hopper.")
@@ -505,9 +403,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
    )
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${FP4_ARCHS}")
@@ -520,52 +417,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    set(FP4_ARCHS)
  endif()

  # CUTLASS MLA Archs and flags
  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
    set(SRCS
      "csrc/attention/mla/cutlass_mla_kernels.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${MLA_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
    # Add MLA-specific include directories only to MLA source files
    set_source_files_properties(${SRCS}
      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
  else()
    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
    # clear MLA_ARCHS
    set(MLA_ARCHS)
  endif()

  # CUTLASS MoE kernels

  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
  # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
  # if it's possible to compile MoE kernels that use its output.
  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
                     "if you intend on running FP8 quantized MoE models on Hopper.")
    else()
      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
                     "in CUDA target architectures")
    endif()
  endif()

  #
  # Machete kernels

@@ -647,7 +498,6 @@ define_gpu_extension_target(
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

@@ -666,71 +516,28 @@ set(VLLM_MOE_EXT_SRC
  "csrc/moe/moe_align_sum_kernels.cu"
  "csrc/moe/topk_softmax_kernels.cu")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
endif()

set_gencode_flags_for_srcs(
  SRCS "${VLLM_MOE_EXT_SRC}"
  CUDA_ARCHS "${CUDA_ARCHS}")

if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(VLLM_MOE_WNA16_SRC
    "csrc/moe/moe_wna16.cu")

  set_gencode_flags_for_srcs(
    SRCS "${VLLM_MOE_WNA16_SRC}"
    CUDA_ARCHS "${CUDA_ARCHS}")

  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
  # 9.0 for latest bf16 atomicAdd PTX
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
  if (MARLIN_MOE_ARCHS)
    set(MARLIN_MOE_SRC
        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
        "csrc/moe/marlin_moe_ops.cu")

    #
    # For the Marlin MOE kernels we automatically generate sources for various
    # preselected input type pairs and schedules.
    # Generate sources:
    set(MOE_MARLIN_GEN_SCRIPT
      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)

    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")

    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
      execute_process(
        COMMAND ${CMAKE_COMMAND} -E env
        PYTHONPATH=$PYTHONPATH
          ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
        RESULT_VARIABLE moe_marlin_generation_result
        OUTPUT_VARIABLE moe_marlin_generation_output
        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
      )

      if (NOT moe_marlin_generation_result EQUAL 0)
        message(FATAL_ERROR "Marlin MOE generation failed."
                            " Result: \"${moe_marlin_generation_result}\""
                            "\nCheck the log for details: "
                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
      else()
        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
        message(STATUS "Marlin MOE generation completed successfully.")
      endif()
    else()
      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
    endif()

    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
    set_gencode_flags_for_srcs(
      SRCS "${MOE_WNAA16_MARLIN_SRC}"
      SRCS "${MARLIN_MOE_SRC}"
      CUDA_ARCHS "${MARLIN_MOE_ARCHS}")

    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
    message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
  else()
    message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@@ -738,17 +545,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  endif()
endif()

if(VLLM_GPU_LANG STREQUAL "CUDA")
  set(MOE_PERMUTE_SRC
      "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
      "csrc/moe/moe_permute_unpermute_op.cu")

  set_gencode_flags_for_srcs(
    SRCS "${MOE_PERMUTE_SRC}"
    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")

  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
endif()
message(STATUS "Enabling moe extension.")
define_gpu_extension_target(
  _moe_C
@@ -757,8 +553,6 @@ define_gpu_extension_target(
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

@@ -768,7 +562,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
  #
  set(VLLM_ROCM_EXT_SRC
    "csrc/rocm/torch_bindings.cpp"
    "csrc/rocm/skinny_gemms.cu"
    "csrc/rocm/attention.cu")

  define_gpu_extension_target(
@@ -785,7 +578,5 @@ endif()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
    include(cmake/external_projects/flashmla.cmake)

    # vllm-flash-attn should be last as it overwrites some CMake functions
    include(cmake/external_projects/vllm_flash_attn.cmake)
endif ()

@@ -1,3 +1,3 @@
# Contributing to vLLM

You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).

@@ -2,14 +2,14 @@
# to run the OpenAI compatible server.

# Please update any changes made here to
# docs/contributing/dockerfile/dockerfile.md and
# docs/assets/contributing/dockerfile-stages-dependency.png
# docs/source/contributing/dockerfile/dockerfile.md and
# docs/source/assets/contributing/dockerfile-stages-dependency.png

ARG CUDA_VERSION=12.8.1
ARG CUDA_VERSION=12.4.1
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.8.1
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
@@ -19,10 +19,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl sudo \
    && for i in 1 2 3; do \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
    done \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
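The `for i in 1 2 3; do ... done` wrapper in the hunk above retries the occasionally flaky deadsnakes PPA registration up to three times with a 5-second pause between attempts. The same pattern, expressed as a small Python helper (an illustrative sketch only, not something the image actually runs):

import subprocess
import time

def run_with_retries(cmd: list[str], attempts: int = 3, delay_s: float = 5.0) -> None:
    """Run a command, retrying a fixed number of times before giving up."""
    for attempt in range(1, attempts + 1):
        if subprocess.run(cmd).returncode == 0:
            return
        if attempt < attempts:
            print(f"Attempt {attempt} failed, retrying in {delay_s:.0f}s...")
            time.sleep(delay_s)
    raise RuntimeError(f"command failed after {attempts} attempts: {cmd}")

# Example mirroring the Dockerfile step:
# run_with_retries(["add-apt-repository", "-y", "ppa:deadsnakes/ppa"])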
@@ -34,11 +31,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
@@ -63,21 +55,19 @@ WORKDIR /workspace
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121";  \
    fi

COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/cuda.txt \
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
    uv pip install --system -r requirements-cuda.txt

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
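TORCH_CUDA_ARCH_LIST above pins the GPU architectures the extensions are compiled for. To confirm which architectures an installed torch build actually supports inside the image (a quick check, assuming torch is importable and, for the second line, that a GPU is visible):

import torch

# Architectures compiled into the installed PyTorch build, e.g. ['sm_80', 'sm_90'].
print(torch.cuda.get_arch_list())
# The compute capability of the device actually present at runtime, if any.
if torch.cuda.is_available():
    print(torch.cuda.get_device_capability(0))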
@@ -89,21 +79,15 @@ FROM base AS build
ARG TARGETPLATFORM

# install build dependencies
COPY requirements/build.txt requirements/build.txt

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
COPY requirements-build.txt requirements-build.txt

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
    uv pip install --system -r requirements-build.txt

COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# max jobs used by Ninja to build extensions
ARG max_jobs=2
@@ -140,9 +124,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git  \
    if [ "$USE_SCCACHE" != "1" ]; then \
        # Clean any existing CMake artifacts
        rm -rf .deps && \
        mkdir -p .deps && \
        python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
    fi

@@ -162,35 +143,23 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
#################### DEV IMAGE ####################
FROM base as dev

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Workaround for #17068
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"

COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/dev.txt \
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
    uv pip install --system -r requirements-dev.txt
#################### DEV IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.8.1
ARG CUDA_VERSION=12.4.1
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM

SHELL ["/bin/bash", "-c"]

RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment

@@ -200,10 +169,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && apt-get update -y \
    && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && for i in 1 2 3; do \
        add-apt-repository -y ppa:deadsnakes/ppa && break || \
        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
    done \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update -y \
    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
@@ -215,11 +181,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
RUN --mount=type=cache,target=/root/.cache/uv \
    python3 -m pip install uv

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
@@ -232,61 +193,40 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 "torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319";  \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu128 --pre pytorch_triton==3.3.0+gitab727c40; \
        uv pip install --system --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
    fi

# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system dist/*.whl --verbose \
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
    uv pip install --system dist/*.whl --verbose

# If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a'
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout v0.2.6.post1
# $ python -m flashinfer.aot
# $ python -m build --no-isolation --wheel
# $ ls -la dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun  9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
# $ rm -rf build
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
# $ ls dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl

RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
    # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
    if [[ "$CUDA_VERSION" == 12.8* ]]; then \
        uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl; \
    else \
        export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a' && \
        git clone https://github.com/flashinfer-ai/flashinfer.git --single-branch --branch v0.2.6.post1 --recursive && \
        # Needed to build AOT kernels
        (cd flashinfer && \
            python3 -m flashinfer.aot && \
            uv pip install --system --no-build-isolation . \
        ) && \
        rm -rf flashinfer; \
    fi \
    uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl ; \
fi
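Note that the FlashInfer wheel URL above percent-encodes the '+' in the local version tag (...%2Bcu128torch2.7...), since a literal '+' in a URL path can be mangled by some CDNs and front ends. A small standard-library illustration of that encoding:

from urllib.parse import quote, unquote

filename = "flashinfer_python-0.2.6.post1+cu128torch2.7-cp39-abi3-linux_x86_64.whl"
encoded = quote(filename, safe="")  # '+' becomes '%2B'; unreserved characters stay as-is
print(encoded)
print(unquote(encoded) == filename)  # True: decoding restores the original wheel name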
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .

RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
uv pip list

# Even when we build Flashinfer with AOT mode, there's still
# Although we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/build.txt \
    --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
    uv pip install --system -r requirements-build.txt

#################### vLLM installation IMAGE ####################

@@ -297,21 +237,9 @@ FROM vllm-base AS test

ADD . /vllm-workspace/

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"

# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
    if [ "$CUDA_MAJOR" -ge 12 ]; then \
        uv pip install --system -r requirements/dev.txt; \
    fi
    uv pip install --system -r requirements-dev.txt

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
@@ -330,26 +258,19 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
# will not be imported by other tests
RUN mkdir test_docs
RUN mv docs test_docs/
RUN cp -r examples test_docs/
RUN mv vllm test_docs/
RUN mv mkdocs.yaml test_docs/
#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
ARG TARGETPLATFORM

# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    else \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.3' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
        uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
    fi

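The pins above ('modelscope!=1.15.0', 'bitsandbytes>=0.45.3', 'timm==0.9.10') are standard PEP 440 version specifiers. If you need to evaluate such a constraint programmatically (for example in a CI guard), the packaging library from PyPI handles it directly; a minimal sketch:

from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=0.45.3")
print(Version("0.45.2") in spec)  # False: too old for this constraint
print(Version("0.46.0") in spec)  # True

# Exclusions work the same way.
print(Version("1.15.0") in SpecifierSet("!=1.15.0"))  # False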
ENV VLLM_USAGE_SOURCE production-docker-image
@@ -26,18 +26,18 @@ WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
    pip install -r requirements/build.txt
    pip install -r requirements-build.txt

FROM cpu-test-arm AS build

WORKDIR /workspace/vllm

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
    --mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
    pip install -v -r requirements/cpu.txt
    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt

COPY . .
ARG GIT_REPO_CHECK=0

Dockerfile.cpu (new file, 69 lines)
@@ -0,0 +1,69 @@
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.

FROM ubuntu:22.04 AS cpu-test-1

ENV CCACHE_DIR=/root/.cache/ccache

ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

RUN --mount=type=cache,target=/var/cache/apt \
    apt-get update -y \
    && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
# intel-openmp provides additional performance improvement vs. openmp
# tcmalloc provides better memory allocation efficiency, e.g. holding memory in caches to speed up access of commonly-used objects.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install intel-openmp==2025.0.1

ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"

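LD_PRELOAD above swaps in tcmalloc and Intel OpenMP for every process in the image. A quick way to confirm the preload actually took effect inside a running container (a Linux-only sketch; the library names match the paths assumed in this Dockerfile):

import os

# LD_PRELOAD as seen by the current process.
print(os.environ.get("LD_PRELOAD", "<not set>"))

# Inspect this process's memory map for the preloaded libraries.
with open("/proc/self/maps") as maps:
    loaded = maps.read()
print("tcmalloc loaded:", "libtcmalloc_minimal" in loaded)
print("libiomp5 loaded:", "libiomp5" in loaded)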
RUN echo 'ulimit -c 0' >> ~/.bashrc

RUN pip install intel_extension_for_pytorch==2.5.0

WORKDIR /workspace

ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
    pip install --upgrade pip && \
    pip install -r requirements-build.txt

FROM cpu-test-1 AS build

WORKDIR /workspace/vllm

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
    pip install -v -r requirements-cpu.txt

COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,source=.git,target=.git \
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
    pip install dist/*.whl && \
    rm -rf dist

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -e tests/vllm_test_utils

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -1,10 +1,10 @@
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements/hpu.txt
RUN pip install -v -r requirements-hpu.txt

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
@@ -1,6 +1,6 @@
# default base image
# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"

FROM $BASE_IMAGE

@@ -21,8 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest

# uninstall transformers-neuronx package explicitly to avoid version conflict
@@ -34,8 +35,8 @@ RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

RUN python3 -m pip install -U \
        'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
        -r requirements/neuron.txt
        'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
        -r requirements-neuron.txt

ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
@@ -48,8 +49,6 @@ RUN python3 -m pip install -e tests/vllm_test_utils
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps

RUN python3 -m pip install sentencepiece transformers==4.48.0 -U

# overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
Some files were not shown because too many files have changed in this diff.