Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-25 09:54:38 +08:00)

Compare commits: gpu_ids2...optimize-p

4 commits (SHA1): d5bf492f16, 8c7bab79f5, 1936d7bab0, 996cf2de5c

| @ -1,53 +1,36 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import os | ||||
| import sys | ||||
| import zipfile | ||||
|  | ||||
| # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB | ||||
| # Note that we have 400 MiB quota, please use it wisely. | ||||
| # See https://github.com/pypi/support/issues/3792 . | ||||
| # Please also sync the value with the one in Dockerfile. | ||||
| VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400)) | ||||
| MAX_SIZE_MB = 200 | ||||
|  | ||||
|  | ||||
| def print_top_10_largest_files(zip_file): | ||||
|     """Print the top 10 largest files in the given zip file.""" | ||||
|     with zipfile.ZipFile(zip_file, "r") as z: | ||||
|     with zipfile.ZipFile(zip_file, 'r') as z: | ||||
|         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] | ||||
|         file_sizes.sort(key=lambda x: x[1], reverse=True) | ||||
|         for f, size in file_sizes[:10]: | ||||
|             print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.") | ||||
|             print(f"{f}: {size/(1024*1024)} MBs uncompressed.") | ||||
|  | ||||
|  | ||||
| def check_wheel_size(directory): | ||||
|     """Check the size of .whl files in the given directory.""" | ||||
|     for root, _, files in os.walk(directory): | ||||
|         for file_name in files: | ||||
|             if file_name.endswith(".whl"): | ||||
|                 wheel_path = os.path.join(root, file_name) | ||||
|                 wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024) | ||||
|                 if wheel_size_mb > VLLM_MAX_SIZE_MB: | ||||
|         for f in files: | ||||
|             if f.endswith(".whl"): | ||||
|                 wheel_path = os.path.join(root, f) | ||||
|                 wheel_size = os.path.getsize(wheel_path) | ||||
|                 wheel_size_mb = wheel_size / (1024 * 1024) | ||||
|                 if wheel_size_mb > MAX_SIZE_MB: | ||||
|                     print( | ||||
|                         f"Not allowed: Wheel {wheel_path} is larger " | ||||
|                         f"({wheel_size_mb:.2f} MB) than the limit " | ||||
|                         f"({VLLM_MAX_SIZE_MB} MB)." | ||||
|                     ) | ||||
|                         f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) " | ||||
|                         f"compare to the allowed size ({MAX_SIZE_MB} MB).") | ||||
|                     print_top_10_largest_files(wheel_path) | ||||
|                     return 1 | ||||
|                 else: | ||||
|                     print( | ||||
|                         f"Wheel {wheel_path} is within the allowed size " | ||||
|                         f"({wheel_size_mb:.2f} MB)." | ||||
|                     ) | ||||
|                     print(f"Wheel {wheel_path} is within the allowed size " | ||||
|                           f"({wheel_size_mb} MB).") | ||||
|     return 0 | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     if len(sys.argv) < 2: | ||||
|         print("Usage: python check-wheel-size.py <directory>") | ||||
|         sys.exit(1) | ||||
|  | ||||
|     directory = sys.argv[1] | ||||
|     sys.exit(check_wheel_size(directory)) | ||||
|     import sys | ||||
|     sys.exit(check_wheel_size(sys.argv[1])) | ||||
|  | ||||
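For reference, a local run of this check might look like the following (a sketch only; the `.buildkite/check-wheel-size.py` path and the `dist/` wheel directory are assumptions, and `VLLM_MAX_SIZE_MB` applies only to the newer variant of the script shown above):

```bash
# Check every .whl under dist/ against the size limit, optionally overriding it.
VLLM_MAX_SIZE_MB=400 python .buildkite/check-wheel-size.py dist/
```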
							
								
								
									
.buildkite/download-images.sh (new file, 18 lines)
							| @ -0,0 +1,18 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
| set -o pipefail | ||||
|  | ||||
| (which wget && which curl) || (apt-get update && apt-get install -y wget curl) | ||||
|  | ||||
| # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/ | ||||
| mkdir -p images | ||||
| cd images | ||||
| wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt | ||||
| wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt | ||||
| wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt | ||||
| wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt | ||||
| wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg | ||||
| wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg | ||||
|  | ||||
| cd - | ||||
| @ -1,27 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import argparse | ||||
| import os | ||||
|  | ||||
| template = """<!DOCTYPE html> | ||||
| <html> | ||||
|     <body> | ||||
|     <h1>Links for vLLM</h1/> | ||||
|         <a href="../{wheel_html_escaped}">{wheel}</a><br/> | ||||
|     </body> | ||||
| </html> | ||||
| """ | ||||
|  | ||||
| parser = argparse.ArgumentParser() | ||||
| parser.add_argument("--wheel", help="The wheel path.", required=True) | ||||
| args = parser.parse_args() | ||||
|  | ||||
| filename = os.path.basename(args.wheel) | ||||
|  | ||||
| with open("index.html", "w") as f: | ||||
|     print(f"Generated index.html for {args.wheel}") | ||||
|     # cloudfront requires escaping the '+' character | ||||
|     f.write( | ||||
|         template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B")) | ||||
|     ) | ||||
| @ -1,13 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 | ||||
| model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.671 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.664 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| trust_remote_code: True | ||||
| @ -1,12 +0,0 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 | ||||
| model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.905 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.905 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 | ||||
| model_name: "meta-llama/Meta-Llama-3-70B-Instruct" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.892 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.892 | ||||
| limit: 250 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.752 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.754 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.753 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.753 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.755 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.755 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 | ||||
| model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.753 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.753 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.764 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.764 | ||||
| limit: 250 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.728 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.728 | ||||
| limit: 250 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.758 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.759 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 | ||||
| model_name: "meta-llama/Meta-Llama-3-8B-Instruct" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.756 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.752 | ||||
| limit: 250 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 | ||||
| model_name: "HandH1998/QQQ-Llama-3-8b-g128" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.419 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.416 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,11 +0,0 @@ | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1 | ||||
| model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.335 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.323 | ||||
| limit: 1319 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 | ||||
| model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.356 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.358 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "mgoin/Minitron-4B-Base-FP8" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.231 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.22 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 | ||||
| model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.86 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.86 | ||||
| limit: 250 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 | ||||
| model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.624 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.624 | ||||
| limit: 250 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 | ||||
| model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.616 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.632 | ||||
| limit: 250 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1 | ||||
| model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.30 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.465 | ||||
| limit: 1319 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.578 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.585 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 | ||||
| model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.593 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.588 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.595 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.582 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 | ||||
| model_name: "Qwen/Qwen2-57B-A14B-Instruct" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.792 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.824 | ||||
| limit: 250 | ||||
| num_fewshot: 5 | ||||
| @ -1,11 +0,0 @@ | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1 | ||||
| model_name: "Qwen/Qwen2.5-1.5B-Instruct" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.54 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.59 | ||||
| limit: 1319 | ||||
| num_fewshot: 5 | ||||
| @ -1,11 +0,0 @@ | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 | ||||
| model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.47 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.64 | ||||
| limit: 1319 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 | ||||
| model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.6353 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.637 | ||||
| limit: null | ||||
| num_fewshot: null  | ||||
| @ -1,6 +0,0 @@ | ||||
| Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml | ||||
| Meta-Llama-3-70B-Instruct.yaml | ||||
| Mixtral-8x7B-Instruct-v0.1.yaml | ||||
| Qwen2-57B-A14-Instruct.yaml | ||||
| DeepSeek-V2-Lite-Chat.yaml | ||||
| Meta-Llama-3-8B-QQQ.yaml | ||||
| @ -1,6 +0,0 @@ | ||||
| Qwen2.5-1.5B-Instruct.yaml | ||||
| Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml | ||||
| Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml | ||||
| Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml | ||||
| Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml | ||||
| Qwen1.5-MoE-W4A16-compressed-tensors.yaml | ||||
| @ -1,44 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| from pathlib import Path | ||||
|  | ||||
| import pytest | ||||
|  | ||||
|  | ||||
| def pytest_addoption(parser): | ||||
|     parser.addoption( | ||||
|         "--config-list-file", | ||||
|         action="store", | ||||
|         help="Path to the file listing model config YAMLs (one per line)", | ||||
|     ) | ||||
|     parser.addoption( | ||||
|         "--tp-size", | ||||
|         action="store", | ||||
|         default="1", | ||||
|         help="Tensor parallel size to use for evaluation", | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @pytest.fixture(scope="session") | ||||
| def config_list_file(pytestconfig, config_dir): | ||||
|     rel_path = pytestconfig.getoption("--config-list-file") | ||||
|     return config_dir / rel_path | ||||
|  | ||||
|  | ||||
| @pytest.fixture(scope="session") | ||||
| def tp_size(pytestconfig): | ||||
|     return pytestconfig.getoption("--tp-size") | ||||
|  | ||||
|  | ||||
| def pytest_generate_tests(metafunc): | ||||
|     if "config_filename" in metafunc.fixturenames: | ||||
|         rel_path = metafunc.config.getoption("--config-list-file") | ||||
|         config_list_file = Path(rel_path).resolve() | ||||
|         config_dir = config_list_file.parent | ||||
|         with open(config_list_file, encoding="utf-8") as f: | ||||
|             configs = [ | ||||
|                 config_dir / line.strip() | ||||
|                 for line in f | ||||
|                 if line.strip() and not line.startswith("#") | ||||
|             ] | ||||
|         metafunc.parametrize("config_filename", configs) | ||||
| @ -1,46 +0,0 @@ | ||||
| #!/bin/bash | ||||
| # We can use this script to compute baseline accuracy on GSM for transformers. | ||||
| # | ||||
| # Make sure you have lm-eval-harness installed: | ||||
| #   pip install lm-eval==0.4.4 | ||||
|  | ||||
| usage() { | ||||
|     echo`` | ||||
|     echo "Runs lm eval harness on GSM8k using huggingface transformers." | ||||
|     echo "This pathway is intended to be used to create baselines for " | ||||
|     echo "our automated nm-test-accuracy workflow" | ||||
|     echo | ||||
|     echo "usage: ${0} <options>" | ||||
|     echo | ||||
|     echo "  -m    - huggingface stub or local directory of the model" | ||||
|     echo "  -b    - batch size to run the evaluation at" | ||||
|     echo "  -l    - limit number of samples to run" | ||||
|     echo "  -f    - number of fewshot samples to use" | ||||
|     echo | ||||
| } | ||||
|  | ||||
| while getopts "m:b:l:f:" OPT; do | ||||
|   case ${OPT} in | ||||
|     m )  | ||||
|         MODEL="$OPTARG" | ||||
|         ;; | ||||
|     b )  | ||||
|         BATCH_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     l )  | ||||
|         LIMIT="$OPTARG" | ||||
|         ;; | ||||
|     f )  | ||||
|         FEWSHOT="$OPTARG" | ||||
|         ;; | ||||
|     \? )  | ||||
|         usage | ||||
|         exit 1 | ||||
|         ;; | ||||
|   esac | ||||
| done | ||||
|  | ||||
| lm_eval --model hf \ | ||||
|   --model_args "pretrained=$MODEL,parallelize=True" \ | ||||
|   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ | ||||
|   --batch_size "$BATCH_SIZE" | ||||
| @ -1,51 +0,0 @@ | ||||
| #!/bin/bash | ||||
| # We can use this script to compute baseline accuracy on GSM for vllm. | ||||
| # We use this for fp8, which HF does not support. | ||||
| # | ||||
| # Make sure you have lm-eval-harness installed: | ||||
| #   pip install lm-eval==0.4.4 | ||||
|  | ||||
| usage() { | ||||
|     echo`` | ||||
|     echo "Runs lm eval harness on GSM8k using huggingface transformers." | ||||
|     echo "This pathway is intended to be used to create baselines for " | ||||
|     echo "our automated nm-test-accuracy workflow" | ||||
|     echo | ||||
|     echo "usage: ${0} <options>" | ||||
|     echo | ||||
|     echo "  -m    - huggingface stub or local directory of the model" | ||||
|     echo "  -b    - batch size to run the evaluation at" | ||||
|     echo "  -l    - limit number of samples to run" | ||||
|     echo "  -f    - number of fewshot samples to use" | ||||
|     echo "  -t    - tensor parallel size to run at" | ||||
|     echo | ||||
| } | ||||
|  | ||||
| while getopts "m:b:l:f:t:" OPT; do | ||||
|   case ${OPT} in | ||||
|     m )  | ||||
|         MODEL="$OPTARG" | ||||
|         ;; | ||||
|     b )  | ||||
|         BATCH_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     l )  | ||||
|         LIMIT="$OPTARG" | ||||
|         ;; | ||||
|     f )  | ||||
|         FEWSHOT="$OPTARG" | ||||
|         ;; | ||||
|     t ) | ||||
|         TP_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     \? )  | ||||
|         usage | ||||
|         exit 1 | ||||
|         ;; | ||||
|   esac | ||||
| done | ||||
|  | ||||
| lm_eval --model vllm \ | ||||
|   --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ | ||||
|   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ | ||||
|   --batch_size "$BATCH_SIZE" | ||||
| @ -1,57 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| """ | ||||
| LM eval harness on model to compare vs HF baseline computed offline. | ||||
| Configs are found in configs/$MODEL.yaml | ||||
|  | ||||
| pytest -s -v test_lm_eval_correctness.py \ | ||||
|     --config-list-file=configs/models-small.txt \ | ||||
|     --tp-size=1 | ||||
| """ | ||||
|  | ||||
| import lm_eval | ||||
| import numpy as np | ||||
| import yaml | ||||
|  | ||||
| RTOL = 0.08 | ||||
|  | ||||
|  | ||||
| def launch_lm_eval(eval_config, tp_size): | ||||
|     trust_remote_code = eval_config.get("trust_remote_code", False) | ||||
|     max_model_len = eval_config.get("max_model_len", 4096) | ||||
|     model_args = ( | ||||
|         f"pretrained={eval_config['model_name']}," | ||||
|         f"tensor_parallel_size={tp_size}," | ||||
|         f"enforce_eager=true," | ||||
|         f"add_bos_token=true," | ||||
|         f"trust_remote_code={trust_remote_code}," | ||||
|         f"max_model_len={max_model_len}" | ||||
|     ) | ||||
|     results = lm_eval.simple_evaluate( | ||||
|         model="vllm", | ||||
|         model_args=model_args, | ||||
|         tasks=[task["name"] for task in eval_config["tasks"]], | ||||
|         num_fewshot=eval_config["num_fewshot"], | ||||
|         limit=eval_config["limit"], | ||||
|         batch_size="auto", | ||||
|     ) | ||||
|     return results | ||||
|  | ||||
|  | ||||
| def test_lm_eval_correctness_param(config_filename, tp_size): | ||||
|     eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) | ||||
|  | ||||
|     results = launch_lm_eval(eval_config, tp_size) | ||||
|  | ||||
|     success = True | ||||
|     for task in eval_config["tasks"]: | ||||
|         for metric in task["metrics"]: | ||||
|             ground_truth = metric["value"] | ||||
|             measured_value = results["results"][task["name"]][metric["name"]] | ||||
|             print( | ||||
|                 f"{task['name']} | {metric['name']}: " | ||||
|                 f"ground_truth={ground_truth} | measured={measured_value}" | ||||
|             ) | ||||
|             success = success and np.isclose(ground_truth, measured_value, rtol=RTOL) | ||||
|  | ||||
|     assert success | ||||
| @ -1,181 +0,0 @@ | ||||
| # vLLM benchmark suite | ||||
|  | ||||
| ## Introduction | ||||
|  | ||||
| This directory contains two sets of benchmarks for vLLM. | ||||
|  | ||||
| - Performance benchmark: benchmarks vLLM's performance under various workloads, so that **developers** can see whether their PR improves or degrades vLLM's performance. | ||||
| - Nightly benchmark: compares vLLM's performance against alternatives (TGI, TRT-LLM and lmdeploy), so that **the public** knows when to choose vLLM. | ||||
|  | ||||
| See the [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and the [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for the latest nightly benchmark results. | ||||
|  | ||||
| ## Performance benchmark quick overview | ||||
|  | ||||
| **Benchmarking Coverage**: latency, throughput and fixed-qps serving on A100 (support for FP8 benchmarks on H100 is coming!) and on Intel® Xeon® Processors, with different models. | ||||
|  | ||||
| **Benchmarking Duration**: about 1 hour. | ||||
|  | ||||
| **For benchmarking developers**: please try your best to keep the benchmarking duration to about 1 hour so that it does not take forever to run. | ||||
|  | ||||
| ## Nightly benchmark quick overview | ||||
|  | ||||
| **Benchmarking Coverage**: fixed-qps serving on A100 (support for FP8 benchmarks on H100 is coming!) with Llama-3 8B, 70B and Mixtral 8x7B. | ||||
|  | ||||
| **Benchmarking engines**: vLLM, TGI, TRT-LLM and lmdeploy. | ||||
|  | ||||
| **Benchmarking Duration**: about 3.5 hours. | ||||
|  | ||||
| ## Trigger the benchmark | ||||
|  | ||||
| The performance benchmark will be triggered when: | ||||
| - A PR is merged into vLLM. | ||||
| - A commit is pushed to a PR that has both the `perf-benchmarks` and `ready` labels. | ||||
|  | ||||
| To manually trigger the benchmark: | ||||
|  | ||||
| ```bash | ||||
| bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
| ``` | ||||
|  | ||||
| Runtime environment variables (see the example after this list): | ||||
| - `ON_CPU`: set to '1' on Intel® Xeon® Processors. Default value is 0. | ||||
| - `SERVING_JSON`: JSON file to use for the serving tests. Default value is an empty string (use the default file). | ||||
| - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is an empty string (use the default file). | ||||
| - `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is an empty string (use the default file). | ||||
| - `REMOTE_HOST`: IP of the remote vLLM service to benchmark. Default value is an empty string. | ||||
| - `REMOTE_PORT`: Port of the remote vLLM service to benchmark. Default value is an empty string. | ||||
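For example, a CPU-only run that overrides the serving test definitions could be launched like this (a minimal sketch; the relative path to the CPU-specific JSON file mentioned in the NOTE further below is illustrative):

```bash
# Run the performance benchmark on an Intel Xeon host, pointing the serving
# tests at the CPU-specific test definitions.
ON_CPU=1 \
SERVING_JSON=tests/serving-tests-cpu.json \
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```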
|  | ||||
| The nightly benchmark will be triggered when: | ||||
| - A commit is pushed to a PR that has both the `perf-benchmarks` and `nightly-benchmarks` labels. | ||||
|  | ||||
| ## Performance benchmark details | ||||
|  | ||||
| See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. | ||||
|  | ||||
| > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. | ||||
|  | ||||
| ### Latency test | ||||
|  | ||||
| Here is an example of one test inside `latency-tests.json`: | ||||
|  | ||||
| ```json | ||||
| [ | ||||
|     { | ||||
|         "test_name": "latency_llama8B_tp1", | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-8B", | ||||
|             "tensor_parallel_size": 1, | ||||
|             "load_format": "dummy", | ||||
|             "num_iters_warmup": 5, | ||||
|             "num_iters": 15 | ||||
|         } | ||||
|     }, | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| In this example: | ||||
|  | ||||
| - The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. | ||||
| - The `parameters` attribute controls the command-line arguments used for `benchmark_latency.py`. Note that you should use an underscore `_` instead of a dash `-` when specifying the arguments; `run-performance-benchmarks.sh` converts the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command-line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` (see the command sketch below). | ||||
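For concreteness, this is roughly the invocation the example above expands to (a sketch only; it assumes `benchmark_latency.py` is run from the directory that contains it):

```bash
# Command-line equivalent of the latency_llama8B_tp1 example above,
# after run-performance-benchmarks.sh converts underscores to dashes.
python3 benchmark_latency.py \
  --model meta-llama/Meta-Llama-3-8B \
  --tensor-parallel-size 1 \
  --load-format dummy \
  --num-iters-warmup 5 \
  --num-iters 15
```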
|  | ||||
| Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. | ||||
|  | ||||
| WARNING: The benchmarking script saves JSON results by itself, so please do not configure the `--output-json` parameter in the JSON file. | ||||
|  | ||||
| ### Throughput test | ||||
|  | ||||
| The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are forwarded to `benchmark_throughput.py`. | ||||
|  | ||||
| Note that, as with the latency tests, the throughput numbers are sensitive to the test parameters -- a slight change in their values might vary the performance numbers by a lot. | ||||
|  | ||||
| ### Serving test | ||||
|  | ||||
| We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: | ||||
|  | ||||
| ```json | ||||
| [ | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp1_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-8B", | ||||
|             "tensor_parallel_size": 1, | ||||
|             "swap_space": 16, | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-8B", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
| ] | ||||
| ``` | ||||
|  | ||||
| Inside this example: | ||||
|  | ||||
| - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. | ||||
| - The `server_parameters` attribute includes the command-line arguments for the vLLM server. | ||||
| - The `client_parameters` attribute includes the command-line arguments for `benchmark_serving.py`. | ||||
| - The `qps_list` attribute controls the list of QPS values to test. It is used to configure the `--request-rate` parameter in `benchmark_serving.py`. | ||||
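As an illustration of how one value from `qps_list` is consumed, the client side of the example above would correspond roughly to the following call for `qps = 4` (a sketch only; the exact flag set is assembled by `run-performance-benchmarks.sh`, and `benchmark_serving.py` is assumed to be run from the directory containing it):

```bash
# Client-side call for the serving_llama8B_tp1_sharegpt example at qps = 4.
# The "inf" entry in qps_list corresponds to an unbounded request rate.
python3 benchmark_serving.py \
  --backend vllm \
  --model meta-llama/Meta-Llama-3-8B \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200 \
  --request-rate 4
```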
|  | ||||
| The number produced by this test is less stable than that of the latency and throughput benchmarks (due to the randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g., a 5% change) is still meaningful. | ||||
|  | ||||
| WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. | ||||
|  | ||||
| ### Visualizing the results | ||||
|  | ||||
| The `convert-results-json-to-markdown.py` script helps you put the benchmarking results into a markdown table by formatting [descriptions.md](performance-benchmarks-descriptions.md) with the real benchmarking results. | ||||
| You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. | ||||
| If you do not see the table, please wait until the benchmark finishes running. | ||||
| The JSON version of the table (together with the JSON version of the benchmark) will also be attached to the markdown file. | ||||
| The raw benchmarking results (in the form of JSON files) are in the `Artifacts` tab of the benchmarking job. | ||||
|  | ||||
| The `compare-json-results.py` script helps compare benchmark-results JSON files converted using `convert-results-json-to-markdown.py`. | ||||
| When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`. | ||||
| `compare-json-results.py` compares two `benchmark_results.json` files and reports the performance ratio, e.g., for output throughput, median TTFT and median TPOT. | ||||
|  | ||||
| Here is an example using the script to compare result_a and result_b without detailed test names: | ||||
| `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name` | ||||
|  | ||||
| |    | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        | | ||||
| |----|----------------------------------------|----------------------------------------|----------| | ||||
| | 0  | 142.633982                             | 156.526018                             | 1.097396 | | ||||
| | 1  | 241.620334                             | 294.018783                             | 1.216863 | | ||||
| | 2  | 218.298905                             | 262.664916                             | 1.203235 | | ||||
| | 3  | 242.743860                             | 299.816190                             | 1.235113 | | ||||
|  | ||||
| Here is an example using the script to compare result_a and result_b with detailed test names: | ||||
| `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` | ||||
| |   | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio        | | ||||
| |---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------| | ||||
| | 0 | serving_llama8B_tp1_sharegpt_qps_1          | 142.633982                             | serving_llama8B_tp1_sharegpt_qps_1          | 156.526018                             | 1.097396 | | ||||
| | 1 | serving_llama8B_tp1_sharegpt_qps_16         | 241.620334                             | serving_llama8B_tp1_sharegpt_qps_16         | 294.018783                             | 1.216863 | | ||||
| | 2 | serving_llama8B_tp1_sharegpt_qps_4          | 218.298905                             | serving_llama8B_tp1_sharegpt_qps_4          | 262.664916                             | 1.203235 | | ||||
| | 3 | serving_llama8B_tp1_sharegpt_qps_inf        | 242.743860                             | serving_llama8B_tp1_sharegpt_qps_inf        | 299.816190                             | 1.235113 | | ||||
| | 4 | serving_llama8B_tp2_random_1024_128_qps_1   | 96.613390                              | serving_llama8B_tp4_random_1024_128_qps_1   | 108.404853                             | 1.122048 | | ||||
|  | ||||
| ## Nightly test details | ||||
|  | ||||
| See [nightly-descriptions.md](nightly-descriptions.md) for a detailed description of the test workload, models and docker containers used for benchmarking other LLM engines. | ||||
|  | ||||
| ### Workflow | ||||
|  | ||||
| - The [nightly-pipeline.yaml](nightly-pipeline.yaml) file specifies the docker containers for the different LLM serving engines. | ||||
| - Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which probes the serving engine of the current container. | ||||
| - `run-nightly-suite.sh` then redirects the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. | ||||
| - Finally, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and upload the results to Buildkite. | ||||
|  | ||||
| ### Nightly tests | ||||
|  | ||||
| In [nightly-tests.json](tests/nightly-tests.json), we include the command-line arguments for the benchmarking commands, together with the benchmarking test cases. The format is very similar to that of the performance benchmark. | ||||
|  | ||||
| ### Docker containers | ||||
|  | ||||
| The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. | ||||
|  | ||||
| WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded because there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. | ||||
|  | ||||
| WARNING: updating `trt-llm` to the latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). | ||||
| @ -1,184 +0,0 @@ | ||||
| steps: | ||||
|   - label: "Wait for container to be ready" | ||||
|     key: wait-for-container-image | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     plugins: | ||||
|     - kubernetes: | ||||
|         podSpec: | ||||
|           containers: | ||||
|           - image: badouralix/curl-jq | ||||
|             command: | ||||
|             - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh | ||||
|   - label: "Cleanup H100" | ||||
|     agents: | ||||
|       queue: H100 | ||||
|     depends_on: ~ | ||||
|     command: docker system prune -a --volumes --force | ||||
|    | ||||
|   - label: "A100" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch == "main" | ||||
|     plugins: | ||||
|     - kubernetes: | ||||
|         podSpec: | ||||
|           priorityClassName: perf-benchmark | ||||
|           containers: | ||||
|           - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT | ||||
|             command: | ||||
|             - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
|             resources: | ||||
|               limits: | ||||
|                 nvidia.com/gpu: 8 | ||||
|             volumeMounts: | ||||
|             - name: devshm | ||||
|               mountPath: /dev/shm | ||||
|             env: | ||||
|             - name: VLLM_USAGE_SOURCE | ||||
|               value: ci-test | ||||
|             - name: HF_TOKEN | ||||
|               valueFrom: | ||||
|                 secretKeyRef: | ||||
|                   name: hf-token-secret | ||||
|                   key: token | ||||
|           nodeSelector: | ||||
|             nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB | ||||
|           volumes: | ||||
|           - name: devshm | ||||
|             emptyDir: | ||||
|               medium: Memory | ||||
|  | ||||
|   - label: "H200" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: H200 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch == "main" | ||||
|     plugins: | ||||
|     - docker#v5.12.0: | ||||
|         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT | ||||
|         command: | ||||
|         - bash | ||||
|         - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
|         mount-buildkite-agent: true | ||||
|         propagate-environment: true | ||||
|         ipc: host | ||||
|         gpus: 4,5,6,7 | ||||
|         volumes: | ||||
|           - /data/benchmark-hf-cache:/root/.cache/huggingface | ||||
|         environment: | ||||
|         - VLLM_USAGE_SOURCE | ||||
|         - HF_TOKEN | ||||
|  | ||||
|   #- block: "Run H100 Benchmark" | ||||
|     #key: block-h100 | ||||
|     #depends_on: ~ | ||||
|  | ||||
|   - label: "H100" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: H100 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch == "main" | ||||
|     plugins: | ||||
|     - docker#v5.12.0: | ||||
|         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT | ||||
|         command: | ||||
|         - bash | ||||
|         - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
|         mount-buildkite-agent: true | ||||
|         propagate-environment: true | ||||
|         ipc: host | ||||
|         gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used | ||||
|         volumes: | ||||
|           - /data/benchmark-hf-cache:/root/.cache/huggingface | ||||
|         environment: | ||||
|         - VLLM_USAGE_SOURCE | ||||
|         - HF_TOKEN | ||||
|  | ||||
|   # Premerge benchmark | ||||
|   - label: "A100" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch != "main" | ||||
|     plugins: | ||||
|     - kubernetes: | ||||
|         podSpec: | ||||
|           priorityClassName: perf-benchmark | ||||
|           containers: | ||||
|           - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT | ||||
|             command: | ||||
|             - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
|             resources: | ||||
|               limits: | ||||
|                 nvidia.com/gpu: 8 | ||||
|             volumeMounts: | ||||
|             - name: devshm | ||||
|               mountPath: /dev/shm | ||||
|             env: | ||||
|             - name: VLLM_USAGE_SOURCE | ||||
|               value: ci-test | ||||
|             - name: HF_TOKEN | ||||
|               valueFrom: | ||||
|                 secretKeyRef: | ||||
|                   name: hf-token-secret | ||||
|                   key: token | ||||
|           nodeSelector: | ||||
|             nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB | ||||
|           volumes: | ||||
|           - name: devshm | ||||
|             emptyDir: | ||||
|               medium: Memory | ||||
|  | ||||
|   - label: "H200" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: H200 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch != "main" | ||||
|     plugins: | ||||
|     - docker#v5.12.0: | ||||
|         image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT | ||||
|         command: | ||||
|         - bash | ||||
|         - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
|         mount-buildkite-agent: true | ||||
|         propagate-environment: true | ||||
|         ipc: host | ||||
|         gpus: 4,5,6,7 | ||||
|         volumes: | ||||
|           - /data/benchmark-hf-cache:/root/.cache/huggingface | ||||
|         environment: | ||||
|         - VLLM_USAGE_SOURCE | ||||
|         - HF_TOKEN | ||||
|  | ||||
|   #- block: "Run H100 Benchmark" | ||||
|     #key: block-h100 | ||||
|     #depends_on: ~ | ||||
|  | ||||
|   - label: "H100" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: H100 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch != "main" | ||||
|     plugins: | ||||
|     - docker#v5.12.0: | ||||
|         image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT | ||||
|         command: | ||||
|         - bash | ||||
|         - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
|         mount-buildkite-agent: true | ||||
|         propagate-environment: true | ||||
|         ipc: host | ||||
|         gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used | ||||
|         volumes: | ||||
|           - /data/benchmark-hf-cache:/root/.cache/huggingface | ||||
|         environment: | ||||
|         - VLLM_USAGE_SOURCE | ||||
|         - HF_TOKEN | ||||
| @ -1,27 +0,0 @@ | ||||
|  | ||||
| ## Description | ||||
|  | ||||
| This file contains the download links for the benchmarking results. | ||||
|  | ||||
| - [benchmarking pipeline](artifact://nightly-pipeline.yaml) | ||||
| - [benchmarking results](artifact://results.zip) | ||||
| - [benchmarking code](artifact://nightly-benchmarks.zip) | ||||
|  | ||||
| Please download the visualization scripts in the post | ||||
|  | ||||
| ## Results reproduction | ||||
|  | ||||
| - Find the docker image we use in the `benchmarking pipeline` | ||||
| - Deploy the docker image, and inside the container: | ||||
|   - Download `nightly-benchmarks.zip`. | ||||
|   - In the same folder, run the following code: | ||||
|  | ||||
|   ```bash | ||||
|   export HF_TOKEN=<your HF token> | ||||
|   apt update | ||||
|   apt install -y git | ||||
|   unzip nightly-benchmarks.zip | ||||
|   VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh | ||||
|   ``` | ||||
|  | ||||
| And the results will be inside `./benchmarks/results`. | ||||
| @ -1,39 +0,0 @@ | ||||
|  | ||||
| # Nightly benchmark | ||||
|  | ||||
| This benchmark aims to: | ||||
|  | ||||
| - Provide performance clarity: show which engine (vLLM, TensorRT-LLM, lmdeploy or SGLang) leads in performance under which workload. | ||||
| - Be reproducible: anyone can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions. | ||||
|  | ||||
| Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. | ||||
|  | ||||
| Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) | ||||
|  | ||||
| ## Setup | ||||
|  | ||||
| - Docker images: | ||||
|   - vLLM: `vllm/vllm-openai:v0.6.2` | ||||
|   - SGLang: `lmsysorg/sglang:v0.3.2-cu121` | ||||
|   - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` | ||||
|   - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` | ||||
|     - *NOTE: we use r24.07 because the current implementation only works with this version. We are going to bump this up.* | ||||
|   - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. | ||||
| - Hardware | ||||
|   - 8x Nvidia A100 GPUs | ||||
| - Workload: | ||||
|   - Dataset | ||||
|     - ShareGPT dataset | ||||
|     - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens) | ||||
|     - Decode-heavy dataset (on average 462 input tokens, 256 output tokens) | ||||
|     - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use. | ||||
|   - Models: llama-3 8B, llama-3 70B. | ||||
|     - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)). | ||||
|   - Average QPS (query per second): 2, 4, 8, 16, 32 and inf. | ||||
|     - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed. | ||||
|   - Evaluation metrics: throughput (higher is better), TTFT (time to first token, lower is better), ITL (inter-token latency, lower is better). | ||||
|  | ||||
| ## Known issues | ||||
|  | ||||
| - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105). | ||||
| - TGI does not support `ignore-eos` flag. | ||||
| @ -1,196 +0,0 @@ | ||||
| common_pod_spec: &common_pod_spec | ||||
|   priorityClassName: perf-benchmark | ||||
|   nodeSelector: | ||||
|     nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB | ||||
|   volumes: | ||||
|     - name: devshm | ||||
|       emptyDir: | ||||
|         medium: Memory | ||||
|     - name: hf-cache | ||||
|       hostPath: | ||||
|         path: /root/.cache/huggingface | ||||
|         type: Directory | ||||
|  | ||||
| common_container_settings: &common_container_settings | ||||
|   command: | ||||
|     - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh | ||||
|   resources: | ||||
|     limits: | ||||
|       nvidia.com/gpu: 8 | ||||
|   volumeMounts: | ||||
|     - name: devshm | ||||
|       mountPath: /dev/shm | ||||
|     - name: hf-cache | ||||
|       mountPath: /root/.cache/huggingface | ||||
|   env: | ||||
|     - name: VLLM_USAGE_SOURCE | ||||
|       value: ci-test | ||||
|     - name: HF_HOME | ||||
|       value: /root/.cache/huggingface | ||||
|     - name: VLLM_SOURCE_CODE_LOC | ||||
|       value: /workspace/build/buildkite/vllm/performance-benchmark | ||||
|     - name: HF_TOKEN | ||||
|       valueFrom: | ||||
|         secretKeyRef: | ||||
|           name: hf-token-secret | ||||
|           key: token | ||||
|  | ||||
| steps: | ||||
|   - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." | ||||
|  | ||||
|  | ||||
|  | ||||
|   - label: "A100 vllm step 10" | ||||
|     priority: 100 | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     plugins: | ||||
|       - kubernetes: | ||||
|           podSpec: | ||||
|             <<: *common_pod_spec | ||||
|             containers: | ||||
|               - image: vllm/vllm-openai:v0.6.2 | ||||
|                 <<: *common_container_settings | ||||
|  | ||||
|  | ||||
|  | ||||
|   - label: "A100 sglang benchmark" | ||||
|     priority: 100 | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     plugins: | ||||
|       - kubernetes: | ||||
|           podSpec: | ||||
|             <<: *common_pod_spec | ||||
|             containers: | ||||
|               - image: lmsysorg/sglang:v0.3.2-cu121 | ||||
|                 <<: *common_container_settings | ||||
|  | ||||
|   - label: "A100 lmdeploy benchmark" | ||||
|     priority: 100 | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     plugins: | ||||
|       - kubernetes: | ||||
|           podSpec: | ||||
|             <<: *common_pod_spec | ||||
|             containers: | ||||
|               - image: openmmlab/lmdeploy:v0.6.1-cu12 | ||||
|                 <<: *common_container_settings | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|   - label: "A100 trt llama-8B" | ||||
|     priority: 100 | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     plugins: | ||||
|       - kubernetes: | ||||
|           podSpec: | ||||
|             <<: *common_pod_spec | ||||
|             containers: | ||||
|               - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 | ||||
|                 <<: *common_container_settings | ||||
|                 env: | ||||
|                   - name: VLLM_USAGE_SOURCE | ||||
|                     value: ci-test | ||||
|                   - name: HF_HOME | ||||
|                     value: /root/.cache/huggingface | ||||
|                   - name: VLLM_SOURCE_CODE_LOC | ||||
|                     value: /workspace/build/buildkite/vllm/performance-benchmark | ||||
|                   - name: HF_TOKEN | ||||
|                     valueFrom: | ||||
|                       secretKeyRef: | ||||
|                         name: hf-token-secret | ||||
|                         key: token | ||||
|                   - name: TEST_SELECTOR | ||||
|                     value: "llama8B" | ||||
|  | ||||
|  | ||||
|   - label: "A100 trt llama-70B" | ||||
|     priority: 100 | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     plugins: | ||||
|       - kubernetes: | ||||
|           podSpec: | ||||
|             <<: *common_pod_spec | ||||
|             containers: | ||||
|               - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 | ||||
|                 <<: *common_container_settings | ||||
|                 env: | ||||
|                   - name: VLLM_USAGE_SOURCE | ||||
|                     value: ci-test | ||||
|                   - name: HF_HOME | ||||
|                     value: /root/.cache/huggingface | ||||
|                   - name: VLLM_SOURCE_CODE_LOC | ||||
|                     value: /workspace/build/buildkite/vllm/performance-benchmark | ||||
|                   - name: HF_TOKEN | ||||
|                     valueFrom: | ||||
|                       secretKeyRef: | ||||
|                         name: hf-token-secret | ||||
|                         key: token | ||||
|                   - name: TEST_SELECTOR | ||||
|                     value: "llama70B" | ||||
|  | ||||
|  | ||||
|   # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image  | ||||
|   # - label: "A100 trt benchmark" | ||||
|   #   priority: 100 | ||||
|   #   agents: | ||||
|   #     queue: A100 | ||||
|   #   plugins: | ||||
|   #     - kubernetes: | ||||
|   #         podSpec: | ||||
|   #           <<: *common_pod_spec | ||||
|   #           containers: | ||||
|   #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3 | ||||
|   #               <<: *common_container_settings | ||||
|  | ||||
|  | ||||
|   # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`. | ||||
|   # - label: "A100 tgi benchmark" | ||||
|   #   priority: 100 | ||||
|   #   agents: | ||||
|   #     queue: A100 | ||||
|   #   plugins: | ||||
|   #     - kubernetes: | ||||
|   #         podSpec: | ||||
|   #           <<: *common_pod_spec | ||||
|   #           containers: | ||||
|   #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0 | ||||
|   #               <<: *common_container_settings | ||||
|          | ||||
|   - wait | ||||
|  | ||||
|   - label: "Collect the results" | ||||
|     priority: 100 | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     plugins: | ||||
|       - kubernetes: | ||||
|           podSpec: | ||||
|             <<: *common_pod_spec | ||||
|             containers: | ||||
|             - image: vllm/vllm-openai:v0.5.0.post1 | ||||
|               command: | ||||
|               - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh | ||||
|               resources: | ||||
|                 limits: | ||||
|                   nvidia.com/gpu: 8 | ||||
|               volumeMounts: | ||||
|               - name: devshm | ||||
|                 mountPath: /dev/shm | ||||
|               env: | ||||
|               - name: VLLM_USAGE_SOURCE | ||||
|                 value: ci-test | ||||
|               - name: VLLM_SOURCE_CODE_LOC | ||||
|                 value: /workspace/build/buildkite/vllm/performance-benchmark | ||||
|               - name: HF_TOKEN | ||||
|                 valueFrom: | ||||
|                   secretKeyRef: | ||||
|                     name: hf-token-secret | ||||
|                     key: token | ||||
|  | ||||
|   - block: ":rocket: check the results!" | ||||
| @ -1,64 +0,0 @@ | ||||
|  | ||||
| ## Latency tests | ||||
|  | ||||
| - Input length: 32 tokens. | ||||
| - Output length: 128 tokens. | ||||
| - Batch size: fixed (8). | ||||
| - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. | ||||
| - CPU Models: llama-3.1 8B. | ||||
| - Evaluation metrics: end-to-end latency (mean, median, p99). | ||||
|  | ||||
| {latency_tests_markdown_table} | ||||
|  | ||||
| ## Throughput tests | ||||
|  | ||||
| - Input length: randomly sample 200 prompts from the ShareGPT dataset (with a fixed random seed). | ||||
| - Output length: the corresponding output length of these 200 prompts. | ||||
| - Batch size: dynamically determined by vllm to achieve maximum throughput. | ||||
| - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. | ||||
| - CPU Models: llama-3.1 8B. | ||||
| - Evaluation metrics: throughput. | ||||
|  | ||||
| {throughput_tests_markdown_table} | ||||
|  | ||||
| ## Serving tests | ||||
|  | ||||
| - Input length: randomly sample 200 prompts from the ShareGPT dataset (with a fixed random seed). | ||||
| - Output length: the corresponding output length of these 200 prompts. | ||||
| - Batch size: dynamically determined by vllm and the arrival pattern of the requests. | ||||
| - **Average QPS (query per second)**: 1, 4, 16, and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed); see the sketch after the table below. | ||||
| - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. | ||||
| - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2. | ||||
| - CPU Models: llama-3.1 8B. | ||||
| - Evaluation metrics: throughput, TTFT (time to first token; mean, median and p99), ITL (inter-token latency; mean, median and p99). | ||||
| - For CPU, we added random-dataset tests that benchmark fixed input/output lengths with 100 prompts. | ||||
|  | ||||
| {serving_tests_markdown_table} | ||||
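|  | ||||
| As a minimal illustration of this arrival pattern (a sketch only; the QPS value and variable names are examples, not part of the benchmark scripts), the Poisson process corresponds to drawing exponential inter-arrival gaps at the target QPS and accumulating them: | ||||
|  | ||||
| ```python | ||||
| import numpy as np | ||||
|  | ||||
| qps = 4.0           # example target rate in queries per second | ||||
| num_requests = 200  # number of prompts sampled from ShareGPT | ||||
|  | ||||
| rng = np.random.default_rng(0)  # fixed seed, matching the fixed-seed setup above | ||||
| gaps = rng.exponential(scale=1.0 / qps, size=num_requests)  # inter-arrival times | ||||
| arrival_times = np.cumsum(gaps)  # absolute send time of each request | ||||
| ``` | ||||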
|  | ||||
| ## Platform Information | ||||
|  | ||||
| {platform_markdown_table} | ||||
|  | ||||
| ## JSON version of the benchmarking tables | ||||
|  | ||||
| This section contains the data from the markdown tables above in JSON format. | ||||
| You can load the benchmarking tables into pandas DataFrames as follows: | ||||
|  | ||||
| ```python | ||||
| import json | ||||
| import pandas as pd | ||||
|  | ||||
| benchmarking_results_json = """The json string""" | ||||
| benchmarking_results = json.loads(benchmarking_results_json) | ||||
| latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"]) | ||||
| throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"]) | ||||
| serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) | ||||
| ``` | ||||
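|  | ||||
| For example, continuing from the snippet above, the serving configurations can be ranked by output throughput (the column name matches the serving table above): | ||||
|  | ||||
| ```python | ||||
| top_serving = serving_results.sort_values("Output Tput (tok/s)", ascending=False) | ||||
| print(top_serving.head()) | ||||
| ``` | ||||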
|  | ||||
| The JSON string for all benchmarking tables: | ||||
|  | ||||
| ```json | ||||
| {benchmarking_results_in_json_string} | ||||
| ``` | ||||
|  | ||||
| You can also check the raw experiment data in the Artifact tab of the Buildkite page. | ||||
| @ -1,66 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| import argparse | ||||
|  | ||||
| import pandas as pd | ||||
|  | ||||
|  | ||||
| def compare_data_columns( | ||||
|     files, name_column, data_column, drop_column, ignore_test_name=False | ||||
| ): | ||||
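|     """Build a side-by-side comparison of `data_column` across the input files. | ||||
|  | ||||
|     For each file, rows missing `drop_column` are dropped, the test-name column | ||||
|     is optionally carried along, and a ratio column is appended that divides each | ||||
|     subsequent file's values by those of the first file (the baseline). | ||||
|     """ | ||||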
|     print("\ncompare_data_column: " + data_column) | ||||
|     frames = [] | ||||
|     compare_frames = [] | ||||
|     for file in files: | ||||
|         data_df = pd.read_json(file) | ||||
|         serving_df = data_df.dropna(subset=[drop_column], ignore_index=True) | ||||
|         if not ignore_test_name: | ||||
|             serving_df = serving_df.rename(columns={name_column: file + "_name"}) | ||||
|             frames.append(serving_df[file + "_name"]) | ||||
|         serving_df = serving_df.rename(columns={data_column: file}) | ||||
|         frames.append(serving_df[file]) | ||||
|         compare_frames.append(serving_df[file]) | ||||
|         if len(compare_frames) >= 2: | ||||
|             # Compare the current file against the first file (the baseline) | ||||
|             ratio_df = compare_frames[1] / compare_frames[0] | ||||
|             frames.append(ratio_df) | ||||
|             compare_frames.pop(1) | ||||
|  | ||||
|     concat_df = pd.concat(frames, axis=1) | ||||
|     return concat_df | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument( | ||||
|         "-f", "--file", action="append", type=str, help="input file name" | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--ignore_test_name", action="store_true", help="ignore_test_name or not" | ||||
|     ) | ||||
|     args = parser.parse_args() | ||||
|     files = args.file | ||||
|     print("comparing : " + ", ".join(files)) | ||||
|  | ||||
|     drop_column = "P99" | ||||
|     name_column = "Test name" | ||||
|     data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] | ||||
|     html_msgs_for_data_cols = [ | ||||
|         "Compare Output Tokens\n", | ||||
|         "Median TTFT\n", | ||||
|         "Median TPOT\n", | ||||
|     ] | ||||
|     ignore_test_name = args.ignore_test_name | ||||
|     with open("perf_comparison.html", "w") as text_file: | ||||
|         for i in range(len(data_cols_to_compare)): | ||||
|             output_df = compare_data_columns( | ||||
|                 files, | ||||
|                 name_column, | ||||
|                 data_cols_to_compare[i], | ||||
|                 drop_column, | ||||
|                 ignore_test_name=ignore_test_name, | ||||
|             ) | ||||
|             print(output_df) | ||||
|             html = output_df.to_html() | ||||
|             text_file.write(html_msgs_for_data_cols[i]) | ||||
|             text_file.write(html) | ||||
| @ -1,268 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import json | ||||
| import os | ||||
| from importlib import util | ||||
| from pathlib import Path | ||||
|  | ||||
| import pandas as pd | ||||
| import psutil | ||||
| from tabulate import tabulate | ||||
|  | ||||
| results_folder = Path("results/") | ||||
|  | ||||
| # latency results and the keys that will be printed into markdown | ||||
| latency_results = [] | ||||
| latency_column_mapping = { | ||||
|     "test_name": "Test name", | ||||
|     "gpu_type": "GPU", | ||||
|     "avg_latency": "Mean latency (ms)", | ||||
|     # "P10": "P10 (s)", | ||||
|     # "P25": "P25 (s)", | ||||
|     "P50": "Median latency (ms)", | ||||
|     # "P75": "P75 (s)", | ||||
|     # "P90": "P90 (s)", | ||||
|     "P99": "P99 latency (ms)", | ||||
| } | ||||
|  | ||||
| # throughput tests and the keys that will be printed into markdown | ||||
| throughput_results = [] | ||||
| throughput_results_column_mapping = { | ||||
|     "test_name": "Test name", | ||||
|     "gpu_type": "GPU", | ||||
|     "num_requests": "# of req.", | ||||
|     "total_num_tokens": "Total # of tokens", | ||||
|     "elapsed_time": "Elapsed time (s)", | ||||
|     "requests_per_second": "Tput (req/s)", | ||||
|     "tokens_per_second": "Tput (tok/s)", | ||||
| } | ||||
|  | ||||
| # serving results and the keys that will be printed into markdown | ||||
| serving_results = [] | ||||
| serving_column_mapping = { | ||||
|     "test_name": "Test name", | ||||
|     "gpu_type": "GPU", | ||||
|     "completed": "# of req.", | ||||
|     "request_throughput": "Tput (req/s)", | ||||
|     "total_token_throughput": "Total Token Tput (tok/s)", | ||||
|     "output_throughput": "Output Tput (tok/s)", | ||||
|     "total_input_tokens": "Total input tokens", | ||||
|     "total_output_tokens": "Total output tokens", | ||||
|     "mean_ttft_ms": "Mean TTFT (ms)", | ||||
|     "median_ttft_ms": "Median TTFT (ms)", | ||||
|     "p99_ttft_ms": "P99 TTFT (ms)", | ||||
|     "mean_tpot_ms": "Mean TPOT (ms)", | ||||
|     "median_tpot_ms": "Median", | ||||
|     "p99_tpot_ms": "P99", | ||||
|     "mean_itl_ms": "Mean ITL (ms)", | ||||
|     "median_itl_ms": "Median ITL (ms)", | ||||
|     "p99_itl_ms": "P99 ITL (ms)", | ||||
| } | ||||
|  | ||||
|  | ||||
| def read_markdown(file): | ||||
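|     """Return the file content plus a trailing newline, or a placeholder string if the file is missing.""" | ||||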
|     if os.path.exists(file): | ||||
|         with open(file) as f: | ||||
|             return f.read() + "\n" | ||||
|     else: | ||||
|         return f"{file} not found.\n" | ||||
|  | ||||
|  | ||||
| def results_to_json(latency, throughput, serving): | ||||
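|     """Serialize the latency, throughput and serving DataFrames into one JSON string.""" | ||||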
|     return json.dumps( | ||||
|         { | ||||
|             "latency": latency.to_dict(), | ||||
|             "throughput": throughput.to_dict(), | ||||
|             "serving": serving.to_dict(), | ||||
|         } | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def get_size_with_unit(bytes, suffix="B"): | ||||
|     """ | ||||
|     Scale a byte count to a human-readable string, | ||||
|     e.g.: | ||||
|         1253656 => '1.20MB' | ||||
|         1253656678 => '1.17GB' | ||||
|     """ | ||||
|     factor = 1024 | ||||
|     for unit in ["", "K", "M", "G", "T", "P"]: | ||||
|         if bytes < factor: | ||||
|             return f"{bytes:.2f}{unit}{suffix}" | ||||
|         bytes /= factor | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     # collect results | ||||
|     for test_file in results_folder.glob("*.json"): | ||||
|         with open(test_file) as f: | ||||
|             raw_result = json.loads(f.read()) | ||||
|  | ||||
|         if "serving" in str(test_file): | ||||
|             # this result is generated via `benchmark_serving.py` | ||||
|  | ||||
|             # attach the benchmarking command to raw_result | ||||
|             try: | ||||
|                 with open(test_file.with_suffix(".commands")) as f: | ||||
|                     command = json.loads(f.read()) | ||||
|             except OSError as e: | ||||
|                 print(e) | ||||
|                 continue | ||||
|  | ||||
|             raw_result.update(command) | ||||
|  | ||||
|             # update the test name of this result | ||||
|             raw_result.update({"test_name": test_file.stem}) | ||||
|  | ||||
|             # add the result to raw_result | ||||
|             serving_results.append(raw_result) | ||||
|             continue | ||||
|  | ||||
|         elif "latency" in f.name: | ||||
|             # this result is generated via `benchmark_latency.py` | ||||
|  | ||||
|             # attach the benchmarking command to raw_result | ||||
|             try: | ||||
|                 with open(test_file.with_suffix(".commands")) as f: | ||||
|                     command = json.loads(f.read()) | ||||
|             except OSError as e: | ||||
|                 print(e) | ||||
|                 continue | ||||
|  | ||||
|             raw_result.update(command) | ||||
|  | ||||
|             # update the test name of this result | ||||
|             raw_result.update({"test_name": test_file.stem}) | ||||
|  | ||||
|             # get different percentiles | ||||
|             for perc in [10, 25, 50, 75, 90, 99]: | ||||
|                 # Multiply 1000 to convert the time unit from s to ms | ||||
|                 raw_result.update( | ||||
|                     {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]} | ||||
|                 ) | ||||
|             raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 | ||||
|  | ||||
|             # add the result to raw_result | ||||
|             latency_results.append(raw_result) | ||||
|             continue | ||||
|  | ||||
|         elif "throughput" in f.name: | ||||
|             # this result is generated via `benchmark_throughput.py` | ||||
|  | ||||
|             # attach the benchmarking command to raw_result | ||||
|             try: | ||||
|                 with open(test_file.with_suffix(".commands")) as f: | ||||
|                     command = json.loads(f.read()) | ||||
|             except OSError as e: | ||||
|                 print(e) | ||||
|                 continue | ||||
|  | ||||
|             raw_result.update(command) | ||||
|  | ||||
|             # update the test name of this result | ||||
|             raw_result.update({"test_name": test_file.stem}) | ||||
|  | ||||
|             # add the result to raw_result | ||||
|             throughput_results.append(raw_result) | ||||
|             continue | ||||
|  | ||||
|         print(f"Skipping {test_file}") | ||||
|  | ||||
|     latency_results = pd.DataFrame.from_dict(latency_results) | ||||
|     serving_results = pd.DataFrame.from_dict(serving_results) | ||||
|     throughput_results = pd.DataFrame.from_dict(throughput_results) | ||||
|  | ||||
|     svmem = psutil.virtual_memory() | ||||
|     platform_data = { | ||||
|         "Physical cores": [psutil.cpu_count(logical=False)], | ||||
|         "Total cores": [psutil.cpu_count(logical=True)], | ||||
|         "Total Memory": [get_size_with_unit(svmem.total)], | ||||
|     } | ||||
|  | ||||
|     if util.find_spec("numa") is not None: | ||||
|         from numa import info | ||||
|  | ||||
|         platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()] | ||||
|  | ||||
|     if util.find_spec("cpuinfo") is not None: | ||||
|         from cpuinfo import get_cpu_info | ||||
|  | ||||
|         platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]] | ||||
|  | ||||
|     platform_results = pd.DataFrame.from_dict( | ||||
|         platform_data, orient="index", columns=["Platform Info"] | ||||
|     ) | ||||
|  | ||||
|     raw_results_json = results_to_json( | ||||
|         latency_results, throughput_results, serving_results | ||||
|     ) | ||||
|  | ||||
|     # remap the column names for nicer display in the markdown tables | ||||
|     if not latency_results.empty: | ||||
|         latency_results = latency_results[list(latency_column_mapping.keys())].rename( | ||||
|             columns=latency_column_mapping | ||||
|         ) | ||||
|     if not serving_results.empty: | ||||
|         serving_results = serving_results[list(serving_column_mapping.keys())].rename( | ||||
|             columns=serving_column_mapping | ||||
|         ) | ||||
|     if not throughput_results.empty: | ||||
|         throughput_results = throughput_results[ | ||||
|             list(throughput_results_column_mapping.keys()) | ||||
|         ].rename(columns=throughput_results_column_mapping) | ||||
|  | ||||
|     processed_results_json = results_to_json( | ||||
|         latency_results, throughput_results, serving_results | ||||
|     ) | ||||
|  | ||||
|     for df in [latency_results, serving_results, throughput_results]: | ||||
|         if df.empty: | ||||
|             continue | ||||
|  | ||||
|         # Sort all dataframes by their respective "Test name" columns | ||||
|         df.sort_values(by="Test name", inplace=True) | ||||
|  | ||||
|         # The GPU type sometimes comes in the format "GPUTYPE\nGPUTYPE\n..."; | ||||
|         # we want to turn it into "8xGPUTYPE". | ||||
|         df["GPU"] = df["GPU"].apply( | ||||
|             lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}" | ||||
|         ) | ||||
|  | ||||
|     # get markdown tables | ||||
|     latency_md_table = tabulate( | ||||
|         latency_results, headers="keys", tablefmt="pipe", showindex=False | ||||
|     ) | ||||
|     serving_md_table = tabulate( | ||||
|         serving_results, headers="keys", tablefmt="pipe", showindex=False | ||||
|     ) | ||||
|     throughput_md_table = tabulate( | ||||
|         throughput_results, headers="keys", tablefmt="pipe", showindex=False | ||||
|     ) | ||||
|     platform_md_table = tabulate( | ||||
|         platform_results, headers="keys", tablefmt="pipe", showindex=True | ||||
|     ) | ||||
|  | ||||
|     # document the result | ||||
|     with open(results_folder / "benchmark_results.md", "w") as f: | ||||
|         results = read_markdown( | ||||
|             "../.buildkite/nightly-benchmarks/" | ||||
|             + "performance-benchmarks-descriptions.md" | ||||
|         ) | ||||
|         results = results.format( | ||||
|             latency_tests_markdown_table=latency_md_table, | ||||
|             throughput_tests_markdown_table=throughput_md_table, | ||||
|             serving_tests_markdown_table=serving_md_table, | ||||
|             platform_markdown_table=platform_md_table, | ||||
|             benchmarking_results_in_json_string=processed_results_json, | ||||
|         ) | ||||
|         f.write(results) | ||||
|  | ||||
|     # document benchmarking results in json | ||||
|     with open(results_folder / "benchmark_results.json", "w") as f: | ||||
|         results = ( | ||||
|             latency_results.to_dict(orient="records") | ||||
|             + throughput_results.to_dict(orient="records") | ||||
|             + serving_results.to_dict(orient="records") | ||||
|         ) | ||||
|         f.write(json.dumps(results)) | ||||
| @ -1,26 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import argparse | ||||
|  | ||||
| from transformers import AutoTokenizer | ||||
|  | ||||
|  | ||||
| def main(model, cachedir): | ||||
|     # Load the tokenizer and save it to the specified directory | ||||
|     tokenizer = AutoTokenizer.from_pretrained(model) | ||||
|     tokenizer.save_pretrained(cachedir) | ||||
|     print(f"Tokenizer saved to {cachedir}") | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     parser = argparse.ArgumentParser( | ||||
|         description="Download and save Hugging Face tokenizer" | ||||
|     ) | ||||
|     parser.add_argument("--model", type=str, required=True, help="Name of the model") | ||||
|     parser.add_argument( | ||||
|         "--cachedir", type=str, required=True, help="Directory to save the tokenizer" | ||||
|     ) | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|     main(args.model, args.cachedir) | ||||
| @ -1,97 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import argparse | ||||
| import json | ||||
| from pathlib import Path | ||||
|  | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from tabulate import tabulate | ||||
|  | ||||
|  | ||||
| def parse_arguments(): | ||||
|     parser = argparse.ArgumentParser( | ||||
|         description="Parse command line arguments for summary-nightly-results script." | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--results-folder", | ||||
|         type=str, | ||||
|         required=True, | ||||
|         help="The folder where the results are stored.", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--description", type=str, required=True, help="Description of the results." | ||||
|     ) | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|     return args | ||||
|  | ||||
|  | ||||
| def get_perf(df, method, model, metric): | ||||
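|     """Collect `metric` for one (engine, model) pair at QPS 2, 4, 8, 16 and inf. | ||||
|  | ||||
|     Returns a numpy array with one entry per QPS value; test cases that are | ||||
|     missing from the results contribute 0.0 so the arrays stay aligned. | ||||
|     """ | ||||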
|     means = [] | ||||
|  | ||||
|     for qps in [2, 4, 8, 16, "inf"]: | ||||
|         target = df["Test name"].str.contains(model) | ||||
|         target = target & df["Engine"].str.contains(method) | ||||
|         target = target & df["Test name"].str.contains("qps_" + str(qps)) | ||||
|         filtered_df = df[target] | ||||
|  | ||||
|         if filtered_df.empty: | ||||
|             means.append(0.0) | ||||
|         else: | ||||
|             means.append(filtered_df[metric].values[0]) | ||||
|  | ||||
|     return np.array(means) | ||||
|  | ||||
|  | ||||
| def get_perf_w_std(df, method, model, metric): | ||||
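|     """Return (mean, std) lists of `metric` across the QPS values. | ||||
|  | ||||
|     For TTFT and ITL the reported std is converted into a standard error using | ||||
|     the number of successful requests; for "Tput" the input and output token | ||||
|     throughputs are summed and no std is reported. | ||||
|     """ | ||||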
|     if metric in ["TTFT", "ITL"]: | ||||
|         mean = get_perf(df, method, model, "Mean " + metric + " (ms)") | ||||
|         mean = mean.tolist() | ||||
|         std = get_perf(df, method, model, "Std " + metric + " (ms)") | ||||
|         if std.mean() == 0: | ||||
|             std = None | ||||
|         success = get_perf(df, method, model, "Successful req.") | ||||
|         if std is not None: | ||||
|             std = std / np.sqrt(success) | ||||
|             std = std.tolist() | ||||
|  | ||||
|     else: | ||||
|         assert metric == "Tput" | ||||
|         mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( | ||||
|             df, method, model, "Output Tput (tok/s)" | ||||
|         ) | ||||
|         mean = mean.tolist() | ||||
|         std = None | ||||
|  | ||||
|     return mean, std | ||||
|  | ||||
|  | ||||
| def main(args): | ||||
|     results_folder = Path(args.results_folder) | ||||
|  | ||||
|     results = [] | ||||
|  | ||||
|     # collect results | ||||
|     for test_file in results_folder.glob("*_nightly_results.json"): | ||||
|         with open(test_file) as f: | ||||
|             results = results + json.loads(f.read()) | ||||
|  | ||||
|     # generate markdown table | ||||
|     df = pd.DataFrame.from_dict(results) | ||||
|  | ||||
|     md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False) | ||||
|  | ||||
|     with open(args.description) as f: | ||||
|         description = f.read() | ||||
|  | ||||
|     description = description.format(nightly_results_benchmarking_table=md_table) | ||||
|  | ||||
|     with open("nightly_results.md", "w") as f: | ||||
|         f.write(description) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     args = parse_arguments() | ||||
|     main(args) | ||||
| @ -1,9 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| from lmdeploy.serve.openai.api_client import APIClient | ||||
|  | ||||
| api_client = APIClient("http://localhost:8000") | ||||
| model_name = api_client.available_models[0] | ||||
|  | ||||
| print(model_name) | ||||
| @ -1,228 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # Currently the FP8 benchmark is NOT enabled. | ||||
|  | ||||
| set -x | ||||
| server_params=$1 | ||||
| common_params=$2 | ||||
|  | ||||
| json2args() { | ||||
|   # transforms the JSON string into command-line args, replacing '_' with '-' | ||||
|   # example: | ||||
|   # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } | ||||
|   # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 | ||||
|   local json_string=$1 | ||||
|   local args=$( | ||||
|     echo "$json_string" | jq -r ' | ||||
|       to_entries | | ||||
|       map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | | ||||
|       join(" ") | ||||
|     ' | ||||
|   ) | ||||
|   echo "$args" | ||||
| } | ||||
|  | ||||
| launch_trt_server() { | ||||
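|   # End-to-end TensorRT-LLM setup: | ||||
|   #   1. convert the HF checkpoint into a TensorRT-LLM checkpoint, | ||||
|   #   2. build the engine with trtllm-build, | ||||
|   #   3. fill in the Triton model repository templates, | ||||
|   #   4. launch the Triton server in the background. | ||||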
|  | ||||
|   model_path=$(echo "$common_params" | jq -r '.model') | ||||
|   model_name="${model_path#*/}" | ||||
|   model_type=$(echo "$server_params" | jq -r '.model_type') | ||||
|   model_dtype=$(echo "$server_params" | jq -r '.model_dtype') | ||||
|   model_tp_size=$(echo "$common_params" | jq -r '.tp') | ||||
|   max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') | ||||
|   max_input_len=$(echo "$server_params" | jq -r '.max_input_len') | ||||
|   max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len') | ||||
|   max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens') | ||||
|   trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') | ||||
|  | ||||
|   # create model caching directory | ||||
|   cd ~ | ||||
|   rm -rf models | ||||
|   mkdir -p models | ||||
|   cd models | ||||
|   models_dir=$(pwd) | ||||
|   trt_model_path=${models_dir}/${model_name}-trt-ckpt | ||||
|   trt_engine_path=${models_dir}/${model_name}-trt-engine | ||||
|  | ||||
|   # clone tensorrt backend | ||||
|   cd / | ||||
|   rm -rf tensorrtllm_backend | ||||
|   git clone https://github.com/triton-inference-server/tensorrtllm_backend.git | ||||
|   git lfs install | ||||
|   cd tensorrtllm_backend | ||||
|   git checkout "$trt_llm_version" | ||||
|   git submodule update --init --recursive | ||||
|  | ||||
|   # build trtllm engine | ||||
|   cd /tensorrtllm_backend | ||||
|   cd "./tensorrt_llm/examples/${model_type}" | ||||
|   python3 convert_checkpoint.py \ | ||||
|     --model_dir "${model_path}" \ | ||||
|     --dtype "${model_dtype}" \ | ||||
|     --tp_size "${model_tp_size}" \ | ||||
|     --output_dir "${trt_model_path}" | ||||
|   trtllm-build \ | ||||
|     --checkpoint_dir "${trt_model_path}" \ | ||||
|     --use_fused_mlp \ | ||||
|     --reduce_fusion disable \ | ||||
|     --workers 8 \ | ||||
|     --gpt_attention_plugin "${model_dtype}" \ | ||||
|     --gemm_plugin "${model_dtype}" \ | ||||
|     --tp_size "${model_tp_size}" \ | ||||
|     --max_batch_size "${max_batch_size}" \ | ||||
|     --max_input_len "${max_input_len}" \ | ||||
|     --max_seq_len "${max_seq_len}" \ | ||||
|     --max_num_tokens "${max_num_tokens}" \ | ||||
|     --output_dir "${trt_engine_path}" | ||||
|  | ||||
|   # handle triton protobuf files and launch triton server | ||||
|   cd /tensorrtllm_backend | ||||
|   mkdir triton_model_repo | ||||
|   cp -r all_models/inflight_batcher_llm/* triton_model_repo/ | ||||
|   cd triton_model_repo | ||||
|   rm -rf ./tensorrt_llm/1/* | ||||
|   cp -r "${trt_engine_path}"/* ./tensorrt_llm/1 | ||||
|   python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false | ||||
|   python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5" | ||||
|   python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false" | ||||
|   python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size" | ||||
|   python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1" | ||||
|   cd /tensorrtllm_backend | ||||
|   python3 scripts/launch_triton_server.py \ | ||||
|     --world_size="${model_tp_size}" \ | ||||
|     --model_repo=/tensorrtllm_backend/triton_model_repo & | ||||
|  | ||||
| } | ||||
|  | ||||
| launch_tgi_server() { | ||||
|   model=$(echo "$common_params" | jq -r '.model') | ||||
|   tp=$(echo "$common_params" | jq -r '.tp') | ||||
|   port=$(echo "$common_params" | jq -r '.port') | ||||
|   server_args=$(json2args "$server_params") | ||||
|  | ||||
|   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then | ||||
|     echo "Key 'fp8' exists in common params." | ||||
|     server_command="/tgi-entrypoint.sh \ | ||||
|                 --model-id $model \ | ||||
|                 --num-shard $tp \ | ||||
|                 --port $port \ | ||||
|                 --quantize fp8 \ | ||||
|                 $server_args" | ||||
|   else | ||||
|     echo "Key 'fp8' does not exist in common params." | ||||
|     server_command="/tgi-entrypoint.sh \ | ||||
|                 --model-id $model \ | ||||
|                 --num-shard $tp \ | ||||
|                 --port $port \ | ||||
|                 $server_args" | ||||
|   fi | ||||
|  | ||||
|   echo "Server command: $server_command" | ||||
|   eval "$server_command" & | ||||
|  | ||||
| } | ||||
|  | ||||
| launch_lmdeploy_server() { | ||||
|   model=$(echo "$common_params" | jq -r '.model') | ||||
|   tp=$(echo "$common_params" | jq -r '.tp') | ||||
|   port=$(echo "$common_params" | jq -r '.port') | ||||
|   server_args=$(json2args "$server_params") | ||||
|  | ||||
|   server_command="lmdeploy serve api_server $model \ | ||||
|     --tp $tp \ | ||||
|     --server-port $port \ | ||||
|     $server_args" | ||||
|  | ||||
|   # run the server | ||||
|   echo "Server command: $server_command" | ||||
|   bash -c "$server_command" & | ||||
| } | ||||
|  | ||||
| launch_sglang_server() { | ||||
|  | ||||
|   model=$(echo "$common_params" | jq -r '.model') | ||||
|   tp=$(echo "$common_params" | jq -r '.tp') | ||||
|   port=$(echo "$common_params" | jq -r '.port') | ||||
|   server_args=$(json2args "$server_params") | ||||
|  | ||||
|   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then | ||||
|     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." | ||||
|     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') | ||||
|     server_command="python3 \ | ||||
|         -m sglang.launch_server \ | ||||
|         --tp $tp \ | ||||
|         --model-path $model \ | ||||
|         --port $port \ | ||||
|         $server_args" | ||||
|   else | ||||
|     echo "Key 'fp8' does not exist in common params." | ||||
|     server_command="python3 \ | ||||
|         -m sglang.launch_server \ | ||||
|         --tp $tp \ | ||||
|         --model-path $model \ | ||||
|         --port $port \ | ||||
|         $server_args" | ||||
|   fi | ||||
|  | ||||
|   # run the server | ||||
|   echo "Server command: $server_command" | ||||
|   eval "$server_command" & | ||||
| } | ||||
|  | ||||
| launch_vllm_server() { | ||||
|  | ||||
|   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') | ||||
|  | ||||
|   model=$(echo "$common_params" | jq -r '.model') | ||||
|   tp=$(echo "$common_params" | jq -r '.tp') | ||||
|   port=$(echo "$common_params" | jq -r '.port') | ||||
|   server_args=$(json2args "$server_params") | ||||
|  | ||||
|   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then | ||||
|     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." | ||||
|     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') | ||||
|     server_command="python3 \ | ||||
|         -m vllm.entrypoints.openai.api_server \ | ||||
|         -tp $tp \ | ||||
|         --model $model \ | ||||
|         --port $port \ | ||||
|         $server_args" | ||||
|   else | ||||
|     echo "Key 'fp8' does not exist in common params." | ||||
|     server_command="python3 \ | ||||
|         -m vllm.entrypoints.openai.api_server \ | ||||
|         -tp $tp \ | ||||
|         --model $model \ | ||||
|         --port $port \ | ||||
|         $server_args" | ||||
|   fi | ||||
|  | ||||
|   # run the server | ||||
|   echo "Server command: $server_command" | ||||
|   eval "$server_command" & | ||||
| } | ||||
|  | ||||
| main() { | ||||
|  | ||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then | ||||
|     launch_trt_server | ||||
|   fi | ||||
|  | ||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then | ||||
|     launch_tgi_server | ||||
|   fi | ||||
|  | ||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then | ||||
|     launch_lmdeploy_server | ||||
|   fi | ||||
|  | ||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then | ||||
|     launch_sglang_server | ||||
|   fi | ||||
|  | ||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then | ||||
|     launch_vllm_server | ||||
|   fi | ||||
| } | ||||
|  | ||||
| main | ||||
| @ -1,78 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
| set -o pipefail | ||||
|  | ||||
|  | ||||
| main() { | ||||
|  | ||||
|     (which wget && which curl) || (apt-get update && apt-get install -y wget curl) | ||||
|     (which jq) || (apt-get update && apt-get -y install jq) | ||||
|     (which zip) || (apt-get install -y zip) | ||||
|  | ||||
|     if [ ! -f /workspace/buildkite-agent ]; then | ||||
|         echo "buildkite-agent binary not found. Skip plotting the results." | ||||
|         exit 0 | ||||
|     fi | ||||
|  | ||||
|     # initial annotation | ||||
|     #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" | ||||
|  | ||||
|     # download results | ||||
|     cd "$VLLM_SOURCE_CODE_LOC/benchmarks" | ||||
|     mkdir -p results/ | ||||
|     /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ | ||||
|     ls | ||||
|     ls results/ | ||||
|  | ||||
|     # upload benchmark results | ||||
|     zip -r results.zip results/ | ||||
|     /workspace/buildkite-agent artifact upload "results.zip" | ||||
|  | ||||
|     # upload benchmarking scripts | ||||
|     cd "$VLLM_SOURCE_CODE_LOC/" | ||||
|     zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ | ||||
|     /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" | ||||
|  | ||||
|     cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" | ||||
|     # upload benchmarking pipeline | ||||
|     /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" | ||||
|  | ||||
|     cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" | ||||
|     /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md | ||||
|      | ||||
|  | ||||
|  | ||||
|     # The figures should be generated by a separate process outside the CI/CD pipeline | ||||
|  | ||||
|     # # generate figures | ||||
|     # python3 -m pip install tabulate pandas matplotlib | ||||
|  | ||||
|     # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \ | ||||
|     #     --description $description \ | ||||
|     #     --results-folder results/  | ||||
|  | ||||
|  | ||||
|     # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ | ||||
|     #     --description $description \ | ||||
|     #     --results-folder results/ \ | ||||
|     #     --dataset sharegpt | ||||
|  | ||||
|     # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ | ||||
|     #     --description $description \ | ||||
|     #     --results-folder results/ \ | ||||
|     #     --dataset sonnet_2048_128 | ||||
|  | ||||
|     # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ | ||||
|     #     --description $description \ | ||||
|     #     --results-folder results/ \ | ||||
|     #     --dataset sonnet_128_2048 | ||||
|      | ||||
|     # # upload results and figures | ||||
|     # /workspace/buildkite-agent artifact upload "nightly_results*.png" | ||||
|     # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml | ||||
|     # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json | ||||
|     # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md | ||||
| } | ||||
|  | ||||
| main "$@" | ||||
| @ -1,462 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -o pipefail | ||||
| set -x | ||||
|  | ||||
| check_gpus() { | ||||
|   # check the number of GPUs and GPU type. | ||||
|   declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) | ||||
|   if [[ $gpu_count -gt 0 ]]; then | ||||
|     echo "GPU found." | ||||
|   else | ||||
|     echo "Need at least 1 GPU to run benchmarking." | ||||
|     exit 1 | ||||
|   fi | ||||
|   declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" | ||||
|   echo "GPU type is $gpu_type" | ||||
| } | ||||
|  | ||||
| check_hf_token() { | ||||
|   # check if HF_TOKEN is available and valid | ||||
|   if [[ -z "$HF_TOKEN" ]]; then | ||||
|     echo "Error: HF_TOKEN is not set." | ||||
|     exit 1 | ||||
|   elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then | ||||
|     echo "Error: HF_TOKEN does not start with 'hf_'." | ||||
|     exit 1 | ||||
|   else | ||||
|     echo "HF_TOKEN is set and valid." | ||||
|   fi | ||||
| } | ||||
|  | ||||
|  | ||||
| upload_to_buildkite() { | ||||
|   # upload the benchmarking results to buildkite | ||||
|  | ||||
|   # if the agent binary is not found, skip uploading the results, exit 0 | ||||
|   if [ ! -f /workspace/buildkite-agent ]; then | ||||
|     echo "buildkite-agent binary not found. Skip uploading the results." | ||||
|     return 0 | ||||
|   fi | ||||
|   # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md | ||||
|   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" | ||||
| } | ||||
|  | ||||
|  | ||||
| get_current_llm_serving_engine() { | ||||
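|   # Detect which serving-engine container this script is running in by probing | ||||
|   # for engine-specific binaries and paths, and export CURRENT_LLM_SERVING_ENGINE. | ||||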
|  | ||||
|   if which lmdeploy >/dev/null; then | ||||
|     echo "Container: lmdeploy" | ||||
|     export CURRENT_LLM_SERVING_ENGINE=lmdeploy | ||||
|     return | ||||
|   fi | ||||
|  | ||||
|   if [ -e /tgi-entrypoint.sh ]; then | ||||
|     echo "Container: tgi" | ||||
|     export CURRENT_LLM_SERVING_ENGINE=tgi | ||||
|     return | ||||
|   fi | ||||
|  | ||||
|   if which trtllm-build >/dev/null; then | ||||
|     echo "Container: tensorrt-llm" | ||||
|     export CURRENT_LLM_SERVING_ENGINE=trt | ||||
|     return | ||||
|   fi | ||||
|  | ||||
|   if [ -e /sgl-workspace ]; then | ||||
|     echo "Container: sglang" | ||||
|     export CURRENT_LLM_SERVING_ENGINE=sglang | ||||
|     return | ||||
|   fi | ||||
|  | ||||
|   if [ -e /vllm-workspace ]; then | ||||
|     echo "Container: vllm" | ||||
|     # move to a completely unrelated directory, to avoid importing vllm from the current folder | ||||
|     export CURRENT_LLM_SERVING_ENGINE=vllm | ||||
|      | ||||
|     return | ||||
|   fi | ||||
| } | ||||
|  | ||||
| json2args() { | ||||
|   # transforms the JSON string into command-line args, replacing '_' with '-' | ||||
|   # example: | ||||
|   # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } | ||||
|   # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 | ||||
|   local json_string=$1 | ||||
|   local args=$( | ||||
|     echo "$json_string" | jq -r ' | ||||
|       to_entries | | ||||
|       map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | | ||||
|       join(" ") | ||||
|     ' | ||||
|   ) | ||||
|   echo "$args" | ||||
| } | ||||
|  | ||||
| kill_gpu_processes() { | ||||
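|   # Best-effort cleanup: kill every known serving-engine process and wait | ||||
|   # until the reported GPU memory usage drops below 1 GB. | ||||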
|   pkill -f python | ||||
|   pkill -f python3 | ||||
|   pkill -f tritonserver | ||||
|   pkill -f pt_main_thread | ||||
|   pkill -f text-generation | ||||
|   pkill -f lmdeploy | ||||
|  | ||||
|   while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do | ||||
|     sleep 1 | ||||
|   done | ||||
| } | ||||
|  | ||||
| wait_for_server() { | ||||
|   # wait for the serving engine's OpenAI-compatible endpoint to come up on port 8000 | ||||
|   # return 1 if the server does not start within the timeout | ||||
|   timeout 1200 bash -c ' | ||||
|     until curl -s localhost:8000/v1/completions > /dev/null; do | ||||
|       sleep 1 | ||||
|     done' && return 0 || return 1 | ||||
| } | ||||
|  | ||||
| ensure_installed() { | ||||
|   # Ensure that the given command is installed by apt-get | ||||
|   local cmd=$1 | ||||
|   if ! which "$cmd" >/dev/null; then | ||||
|     apt-get update && apt-get install -y "$cmd" | ||||
|   fi | ||||
| } | ||||
|  | ||||
| run_serving_tests() { | ||||
|   # run serving tests using `benchmark_serving.py` | ||||
|   # $1: a json file specifying serving test cases | ||||
|  | ||||
|   local serving_test_file | ||||
|   serving_test_file=$1 | ||||
|  | ||||
|   # Iterate over serving tests | ||||
|   jq -c '.[]' "$serving_test_file" | while read -r params; do | ||||
|     # get the test name of this test case | ||||
|     test_name=$(echo "$params" | jq -r '.test_name') | ||||
|  | ||||
|     # if TEST_SELECTOR is set, only run the test cases that match the selector | ||||
|     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then | ||||
|       echo "Skip test case $test_name." | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     # prepend the current serving engine to the test name | ||||
|     test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} | ||||
|  | ||||
|     # get common parameters | ||||
|     common_params=$(echo "$params" | jq -r '.common_parameters') | ||||
|     model=$(echo "$common_params" | jq -r '.model') | ||||
|     tp=$(echo "$common_params" | jq -r '.tp') | ||||
|     dataset_name=$(echo "$common_params" | jq -r '.dataset_name') | ||||
|     dataset_path=$(echo "$common_params" | jq -r '.dataset_path') | ||||
|     port=$(echo "$common_params" | jq -r '.port') | ||||
|     num_prompts=$(echo "$common_params" | jq -r '.num_prompts') | ||||
|     reuse_server=$(echo "$common_params" | jq -r '.reuse_server') | ||||
|  | ||||
|     # get client and server arguments | ||||
|     server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") | ||||
|     client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters") | ||||
|     client_args=$(json2args "$client_params") | ||||
|     qps_list=$(echo "$params" | jq -r '.qps_list') | ||||
|     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') | ||||
|     echo "Running over qps list $qps_list" | ||||
|  | ||||
|     # check if there is enough GPU to run the test | ||||
|     if [[ $gpu_count -lt $tp ]]; then | ||||
|       echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     if [[ $reuse_server == "true" ]]; then | ||||
|       echo "Reuse previous server for test case $test_name" | ||||
|     else | ||||
|       kill_gpu_processes | ||||
|       bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ | ||||
|         "$server_params" "$common_params" | ||||
|     fi | ||||
|  | ||||
|     if wait_for_server; then | ||||
|       echo "" | ||||
|       echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." | ||||
|     else | ||||
|       echo "" | ||||
|       echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." | ||||
|       break | ||||
|     fi | ||||
|  | ||||
|     # prepare tokenizer | ||||
|     # this is required for lmdeploy. | ||||
|     cd "$VLLM_SOURCE_CODE_LOC/benchmarks" | ||||
|     rm -rf /tokenizer_cache | ||||
|     mkdir /tokenizer_cache | ||||
|     python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ | ||||
|       --model "$model" \ | ||||
|       --cachedir /tokenizer_cache | ||||
|     cd "$VLLM_SOURCE_CODE_LOC/benchmarks" | ||||
|  | ||||
|  | ||||
|     # change the model name for lmdeploy (it does not follow the standard HF name) | ||||
|     if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then | ||||
|       model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) | ||||
|     fi | ||||
|  | ||||
|     # iterate over different QPS | ||||
|     for qps in $qps_list; do | ||||
|       # remove the surrounding single quote from qps | ||||
|       if [[ "$qps" == *"inf"* ]]; then | ||||
|         echo "qps was $qps" | ||||
|         qps="inf" | ||||
|         echo "now qps is $qps" | ||||
|       fi | ||||
|  | ||||
|       new_test_name=$test_name"_qps_"$qps | ||||
|  | ||||
|       backend=$CURRENT_LLM_SERVING_ENGINE | ||||
|  | ||||
|       if [[ $backend = "trt" ]]; then | ||||
|         backend="tensorrt-llm" | ||||
|       fi | ||||
|  | ||||
|       if [[ "$backend" == *"vllm"* ]]; then | ||||
|         backend="vllm" | ||||
|       fi | ||||
|  | ||||
|       if [[ "$dataset_name" = "sharegpt" ]]; then | ||||
|  | ||||
|         client_command="python3 benchmark_serving.py \ | ||||
|           --backend $backend \ | ||||
|           --tokenizer /tokenizer_cache \ | ||||
|           --model $model \ | ||||
|           --dataset-name $dataset_name \ | ||||
|           --dataset-path $dataset_path \ | ||||
|           --num-prompts $num_prompts \ | ||||
|           --port $port \ | ||||
|           --save-result \ | ||||
|           --result-dir $RESULTS_FOLDER \ | ||||
|           --result-filename ${new_test_name}.json \ | ||||
|           --request-rate $qps \ | ||||
|           --ignore-eos \ | ||||
|           $client_args" | ||||
|  | ||||
|       elif [[ "$dataset_name" = "sonnet" ]]; then | ||||
|  | ||||
|         sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len') | ||||
|         sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') | ||||
|         sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') | ||||
|  | ||||
|         client_command="python3 benchmark_serving.py \ | ||||
|           --backend $backend \ | ||||
|           --tokenizer /tokenizer_cache \ | ||||
|           --model $model \ | ||||
|           --dataset-name $dataset_name \ | ||||
|           --dataset-path $dataset_path \ | ||||
|           --num-prompts $num_prompts \ | ||||
|           --sonnet-input-len $sonnet_input_len \ | ||||
|           --sonnet-output-len $sonnet_output_len \ | ||||
|           --sonnet-prefix-len $sonnet_prefix_len \ | ||||
|           --port $port \ | ||||
|           --save-result \ | ||||
|           --result-dir $RESULTS_FOLDER \ | ||||
|           --result-filename ${new_test_name}.json \ | ||||
|           --request-rate $qps \ | ||||
|           --ignore-eos \ | ||||
|           $client_args" | ||||
|  | ||||
|       else | ||||
|    | ||||
|         echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name." | ||||
|         exit 1 | ||||
|  | ||||
|       fi | ||||
|  | ||||
|          | ||||
|  | ||||
|       echo "Running test case $test_name with qps $qps" | ||||
|       echo "Client command: $client_command" | ||||
|  | ||||
|       eval "$client_command" | ||||
|  | ||||
|       server_command="None" | ||||
|  | ||||
|       # record the benchmarking commands | ||||
|       jq_output=$(jq -n \ | ||||
|         --arg server "$server_command" \ | ||||
|         --arg client "$client_command" \ | ||||
|         --arg gpu "$gpu_type" \ | ||||
|         --arg engine "$CURRENT_LLM_SERVING_ENGINE" \ | ||||
|         '{ | ||||
|           server_command: $server, | ||||
|           client_command: $client, | ||||
|           gpu_type: $gpu, | ||||
|           engine: $engine | ||||
|         }') | ||||
|       echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" | ||||
|  | ||||
|     done | ||||
|  | ||||
|   done | ||||
|  | ||||
|   kill_gpu_processes | ||||
| } | ||||
|  | ||||
| run_genai_perf_tests() { | ||||
|   # run genai-perf tests  | ||||
|  | ||||
|   # $1: a json file specifying genai-perf test cases | ||||
|   local genai_perf_test_file | ||||
|   genai_perf_test_file=$1 | ||||
|  | ||||
|   # Iterate over genai-perf tests | ||||
|   jq -c '.[]' "$genai_perf_test_file" | while read -r params; do | ||||
|     # get the test name of this test case | ||||
|     test_name=$(echo "$params" | jq -r '.test_name')     | ||||
|      | ||||
|     # if TEST_SELECTOR is set, only run the test cases that match the selector | ||||
|     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then | ||||
|       echo "Skip test case $test_name." | ||||
|       continue | ||||
|     fi | ||||
|      | ||||
|     # prepend the current serving engine to the test name | ||||
|     test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} | ||||
|  | ||||
|     # get common parameters | ||||
|     common_params=$(echo "$params" | jq -r '.common_parameters') | ||||
|     model=$(echo "$common_params" | jq -r '.model') | ||||
|     tp=$(echo "$common_params" | jq -r '.tp') | ||||
|     dataset_name=$(echo "$common_params" | jq -r '.dataset_name') | ||||
|     dataset_path=$(echo "$common_params" | jq -r '.dataset_path') | ||||
|     port=$(echo "$common_params" | jq -r '.port') | ||||
|     num_prompts=$(echo "$common_params" | jq -r '.num_prompts') | ||||
|     reuse_server=$(echo "$common_params" | jq -r '.reuse_server') | ||||
|  | ||||
|     # get client and server arguments | ||||
|     server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") | ||||
|     qps_list=$(echo "$params" | jq -r '.qps_list') | ||||
|     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') | ||||
|     echo "Running over qps list $qps_list" | ||||
|  | ||||
|     # check if there is enough GPU to run the test | ||||
|     if [[ $gpu_count -lt $tp ]]; then | ||||
|       echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     if [[ $reuse_server == "true" ]]; then | ||||
|       echo "Reuse previous server for test case $test_name" | ||||
|     else | ||||
|       kill_gpu_processes | ||||
|       bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ | ||||
|         "$server_params" "$common_params" | ||||
|     fi | ||||
|  | ||||
|     if wait_for_server; then | ||||
|       echo "" | ||||
|       echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." | ||||
|     else | ||||
|       echo "" | ||||
|       echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." | ||||
|       break | ||||
|     fi | ||||
|  | ||||
|     # iterate over different QPS | ||||
|     for qps in $qps_list; do | ||||
|       # genai-perf does not accept an infinite request rate; approximate it by setting qps to the number of prompts | ||||
|       if [[ "$qps" == *"inf"* ]]; then | ||||
|         echo "qps was $qps" | ||||
|         qps=$num_prompts | ||||
|         echo "now qps is $qps" | ||||
|       fi | ||||
|      | ||||
|       new_test_name=$test_name"_qps_"$qps | ||||
|       backend=$CURRENT_LLM_SERVING_ENGINE | ||||
|        | ||||
|       if [[ "$backend" == *"vllm"* ]]; then | ||||
|         backend="vllm" | ||||
|       fi | ||||
|       #TODO: add output dir. | ||||
|       client_command="genai-perf profile \ | ||||
|         -m $model \ | ||||
|         --service-kind openai \ | ||||
|         --backend vllm \ | ||||
|         --endpoint-type chat \ | ||||
|         --streaming \ | ||||
|         --url localhost:$port \ | ||||
|         --request-rate $qps \ | ||||
|         --num-prompts $num_prompts \ | ||||
|       " | ||||
|  | ||||
|     echo "Client command: $client_command" | ||||
|  | ||||
|     eval "$client_command" | ||||
|  | ||||
|     #TODO: process/record outputs | ||||
|     done | ||||
|   done | ||||
|  | ||||
|   kill_gpu_processes | ||||
|  | ||||
| } | ||||
|  | ||||
| prepare_dataset() { | ||||
|  | ||||
|   # download sharegpt dataset | ||||
|   cd "$VLLM_SOURCE_CODE_LOC/benchmarks" | ||||
|   wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json | ||||
|  | ||||
|   # duplicate sonnet by 4x, to allow benchmarking with input length 2048 | ||||
|   cd "$VLLM_SOURCE_CODE_LOC/benchmarks" | ||||
|   echo "" > sonnet_4x.txt | ||||
|   for _ in {1..4} | ||||
|   do | ||||
|     cat sonnet.txt >> sonnet_4x.txt | ||||
|   done | ||||
|    | ||||
| } | ||||
|  | ||||
| main() { | ||||
|  | ||||
|   # check if the environment variable is successfully injected from yaml | ||||
|  | ||||
|   check_gpus | ||||
|   check_hf_token | ||||
|   get_current_llm_serving_engine | ||||
|  | ||||
|   pip install -U transformers | ||||
|  | ||||
|   pip install -r requirements/dev.txt | ||||
|   which genai-perf | ||||
|  | ||||
|   # check storage | ||||
|   df -h | ||||
|  | ||||
|   ensure_installed wget | ||||
|   ensure_installed curl | ||||
|   ensure_installed jq | ||||
|   # genai-perf dependency | ||||
|   ensure_installed libb64-0d | ||||
|  | ||||
|   prepare_dataset | ||||
|  | ||||
|   cd "$VLLM_SOURCE_CODE_LOC/benchmarks" | ||||
|   declare -g RESULTS_FOLDER=results/ | ||||
|   mkdir -p $RESULTS_FOLDER | ||||
|   BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" | ||||
|  | ||||
|   # run the test | ||||
|   run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" | ||||
|  | ||||
|   # run genai-perf tests | ||||
|   run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" | ||||
|   mv artifacts/ $RESULTS_FOLDER/ | ||||
|  | ||||
|   # upload benchmark results to buildkite | ||||
|   python3 -m pip install tabulate pandas | ||||
|   python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" | ||||
|   upload_to_buildkite | ||||
|  | ||||
| } | ||||
|  | ||||
| main "$@" | ||||
| @ -1,474 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script should be run inside the CI process | ||||
| # This script assumes that we are already inside the vllm/ directory | ||||
| # Benchmarking results will be available inside vllm/benchmarks/results/ | ||||
|  | ||||
| # Do not set -e, as the mixtral 8x22B model tends to crash occasionally | ||||
| # and we still want to see other benchmarking results even when mixtral crashes. | ||||
| set -x | ||||
| set -o pipefail | ||||
|  | ||||
| check_gpus() { | ||||
|   if command -v nvidia-smi; then | ||||
|     # check the number of GPUs and GPU type. | ||||
|     declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) | ||||
|   elif command -v amd-smi; then | ||||
|     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) | ||||
|   fi | ||||
|  | ||||
|   if [[ $gpu_count -gt 0 ]]; then | ||||
|     echo "GPU found." | ||||
|   else | ||||
|     echo "Need at least 1 GPU to run benchmarking." | ||||
|     exit 1 | ||||
|   fi | ||||
|   if command -v nvidia-smi; then | ||||
|     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') | ||||
|   elif command -v amd-smi; then | ||||
|     declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}') | ||||
|   fi | ||||
|   echo "GPU type is $gpu_type" | ||||
| } | ||||
|  | ||||
| check_cpus() { | ||||
|   # check the number of NUMA nodes and set the GPU type for CPU-only runs. | ||||
|   declare -g numa_count=$(python3 -c  "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)") | ||||
|   if [[ $numa_count -gt 0 ]]; then | ||||
|     echo "NUMA found." | ||||
|     echo $numa_count | ||||
|   else | ||||
|     echo "Need at least 1 NUMA to run benchmarking." | ||||
|     exit 1 | ||||
|   fi | ||||
|   declare -g gpu_type="cpu" | ||||
|   echo "GPU type is $gpu_type" | ||||
| } | ||||
|  | ||||
| check_hf_token() { | ||||
|   # check if HF_TOKEN is available and valid | ||||
|   if [[ -z "$HF_TOKEN" ]]; then | ||||
|     echo "Error: HF_TOKEN is not set." | ||||
|     exit 1 | ||||
|   elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then | ||||
|     echo "Error: HF_TOKEN does not start with 'hf_'." | ||||
|     exit 1 | ||||
|   else | ||||
|     echo "HF_TOKEN is set and valid." | ||||
|   fi | ||||
| } | ||||
|  | ||||
| ensure_sharegpt_downloaded() { | ||||
|   local FILE=ShareGPT_V3_unfiltered_cleaned_split.json | ||||
|   if [ ! -f "$FILE" ]; then | ||||
|     wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE | ||||
|   else | ||||
|     echo "$FILE already exists." | ||||
|   fi | ||||
| } | ||||
|  | ||||
| json2args() { | ||||
|   # transforms the JSON string into command line args; '_' in keys is replaced with '-' | ||||
|   # example: | ||||
|   # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } | ||||
|   # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 | ||||
|   local json_string=$1 | ||||
|   local args=$( | ||||
|     echo "$json_string" | jq -r ' | ||||
|       to_entries | | ||||
|       map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | | ||||
|       join(" ") | ||||
|     ' | ||||
|   ) | ||||
|   echo "$args" | ||||
| } | ||||
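|  | ||||
| # Illustrative usage of json2args (hypothetical values; assumes jq is on PATH): | ||||
| #   params='{ "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }' | ||||
| #   args=$(json2args "$params") | ||||
| #   echo "$args"   # --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 | ||||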
|  | ||||
| json2envs() { | ||||
|   # transforms the JSON string to environment variables. | ||||
|   # example: | ||||
|   # input: { "VLLM_CPU_KVCACHE_SPACE": 5 } | ||||
|   # output: VLLM_CPU_KVCACHE_SPACE=5 | ||||
|   local json_string=$1 | ||||
|   local args=$( | ||||
|     echo "$json_string" | jq -r ' | ||||
|       to_entries | | ||||
|       map((.key ) + "=" + (.value | tostring)) | | ||||
|       join(" ") | ||||
|     ' | ||||
|   ) | ||||
|   echo "$args" | ||||
| } | ||||
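|  | ||||
| # Illustrative usage (hypothetical values): the generated assignments are meant | ||||
| # to be prefixed to a benchmark command and expanded with eval, e.g. | ||||
| #   envs=$(json2envs '{ "VLLM_CPU_KVCACHE_SPACE": 5 }') | ||||
| #   eval "$envs python3 benchmark_throughput.py ..." | ||||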
|  | ||||
| wait_for_server() { | ||||
|   # wait for vllm server to start | ||||
|   # return 1 if vllm server crashes | ||||
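|   # (polls the OpenAI-compatible /v1/completions endpoint once per second, | ||||
|   #  giving the server up to 20 minutes to come up) | ||||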
|   timeout 1200 bash -c ' | ||||
|     until curl -X POST localhost:8000/v1/completions; do | ||||
|       sleep 1 | ||||
|     done' && return 0 || return 1 | ||||
| } | ||||
|  | ||||
| kill_processes_launched_by_current_bash() { | ||||
|   # Kill all python processes launched from the current bash script | ||||
|   current_shell_pid=$$ | ||||
|   processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}') | ||||
|   if [ -n "$processes" ]; then | ||||
|     echo "Killing the following processes matching '$1':" | ||||
|     echo "$processes" | ||||
|     echo "$processes" | xargs kill -9 | ||||
|   else | ||||
|     echo "No processes found matching '$1'." | ||||
|   fi | ||||
| } | ||||
|  | ||||
| kill_gpu_processes() { | ||||
|  | ||||
|   ps -aux | ||||
|   lsof -t -i:8000 | xargs -r kill -9 | ||||
|   pgrep python3 | xargs -r kill -9 | ||||
|  | ||||
|  | ||||
|   # wait until GPU memory usage is smaller than 1 GB | ||||
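|   # (only the first GPU listed by nvidia-smi, or GPU 0 for amd-smi, is checked) | ||||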
|   if command -v nvidia-smi; then | ||||
|     while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do | ||||
|       sleep 1 | ||||
|     done | ||||
|   elif command -v amd-smi; then | ||||
|     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do | ||||
|       sleep 1 | ||||
|     done | ||||
|   fi | ||||
|  | ||||
|   # remove vllm config file | ||||
|   rm -rf ~/.config/vllm | ||||
|  | ||||
| } | ||||
|  | ||||
| upload_to_buildkite() { | ||||
|   # upload the benchmarking results to buildkite | ||||
|  | ||||
|   # if the agent binary is not found, skip uploading the results and return 0 | ||||
|   # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent | ||||
|   if command -v buildkite-agent >/dev/null 2>&1; then | ||||
|     BUILDKITE_AGENT_COMMAND="buildkite-agent" | ||||
|   elif [ -f /workspace/buildkite-agent ]; then | ||||
|     BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent" | ||||
|   else | ||||
|     echo "buildkite-agent binary not found. Skip uploading the results." | ||||
|     return 0 | ||||
|   fi | ||||
|  | ||||
|   # Use the determined command to annotate and upload artifacts | ||||
|   $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" | ||||
|   $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" | ||||
| } | ||||
|  | ||||
| run_latency_tests() { | ||||
|   # run latency tests using `benchmark_latency.py` | ||||
|   # $1: a json file specifying latency test cases | ||||
|  | ||||
|   local latency_test_file | ||||
|   latency_test_file=$1 | ||||
|  | ||||
|   # Iterate over latency tests | ||||
|   jq -c '.[]' "$latency_test_file" | while read -r params; do | ||||
|     # get the test name and make sure it has the expected prefix. | ||||
|     test_name=$(echo "$params" | jq -r '.test_name') | ||||
|     if [[ ! "$test_name" =~ ^latency_ ]]; then | ||||
|       echo "In latency-test.json, test_name must start with \"latency_\"." | ||||
|       exit 1 | ||||
|     fi | ||||
|  | ||||
|     # if TEST_SELECTOR is set, only run the test cases that match the selector | ||||
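|     # e.g. TEST_SELECTOR="llama8B" (an illustrative value) would keep only the | ||||
|     # test cases whose names match that extended regular expression | ||||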
|     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then | ||||
|       echo "Skip test case $test_name." | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     # get arguments | ||||
|     latency_params=$(echo "$params" | jq -r '.parameters') | ||||
|     latency_args=$(json2args "$latency_params") | ||||
|     latency_environment_variables=$(echo "$params" | jq -r '.environment_variables') | ||||
|     latency_envs=$(json2envs "$latency_environment_variables") | ||||
|  | ||||
|     # check if there are enough GPUs (or NUMA nodes) to run the test | ||||
|     tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') | ||||
|     if [ "$ON_CPU" == "1" ];then | ||||
|       if [[ $numa_count -lt $tp ]]; then | ||||
|         echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." | ||||
|         continue | ||||
|       fi | ||||
|     else | ||||
|       if [[ $gpu_count -lt $tp ]]; then | ||||
|         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." | ||||
|         continue | ||||
|       fi | ||||
|     fi | ||||
|  | ||||
|     latency_command=" $latency_envs python3 benchmark_latency.py \ | ||||
|       --output-json $RESULTS_FOLDER/${test_name}.json \ | ||||
|       $latency_args" | ||||
|  | ||||
|     echo "Running test case $test_name" | ||||
|     echo "Latency command: $latency_command" | ||||
|  | ||||
|     # record the benchmarking command and the GPU type | ||||
|     jq_output=$(jq -n \ | ||||
|       --arg latency "$latency_command" \ | ||||
|       --arg gpu "$gpu_type" \ | ||||
|       '{ | ||||
|         latency_command: $latency, | ||||
|         gpu_type: $gpu | ||||
|       }') | ||||
|     echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" | ||||
|  | ||||
|     # run the benchmark | ||||
|     eval "$latency_command" | ||||
|  | ||||
|     kill_gpu_processes | ||||
|  | ||||
|   done | ||||
| } | ||||
|  | ||||
| run_throughput_tests() { | ||||
|   # run throughput tests using `benchmark_throughput.py` | ||||
|   # $1: a json file specifying throughput test cases | ||||
|  | ||||
|   local throughput_test_file | ||||
|   throughput_test_file=$1 | ||||
|  | ||||
|   # Iterate over throughput tests | ||||
|   jq -c '.[]' "$throughput_test_file" | while read -r params; do | ||||
|     # get the test name and make sure it has the expected prefix. | ||||
|     test_name=$(echo "$params" | jq -r '.test_name') | ||||
|     if [[ ! "$test_name" =~ ^throughput_ ]]; then | ||||
|       echo "In throughput-test.json, test_name must start with \"throughput_\"." | ||||
|       exit 1 | ||||
|     fi | ||||
|  | ||||
|     # if TEST_SELECTOR is set, only run the test cases that match the selector | ||||
|     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then | ||||
|       echo "Skip test case $test_name." | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     # get arguments | ||||
|     throughput_params=$(echo "$params" | jq -r '.parameters') | ||||
|     throughput_args=$(json2args "$throughput_params") | ||||
|     throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables') | ||||
|     throughput_envs=$(json2envs "$throughput_environment_variables") | ||||
|  | ||||
|     # check if there are enough GPUs (or NUMA nodes) to run the test | ||||
|     tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') | ||||
|     if [ "$ON_CPU" == "1" ];then | ||||
|       if [[ $numa_count -lt $tp ]]; then | ||||
|         echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." | ||||
|         continue | ||||
|       fi | ||||
|     else | ||||
|       if [[ $gpu_count -lt $tp ]]; then | ||||
|         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." | ||||
|         continue | ||||
|       fi | ||||
|     fi | ||||
|  | ||||
|     throughput_command=" $throughput_envs python3 benchmark_throughput.py \ | ||||
|       --output-json $RESULTS_FOLDER/${test_name}.json \ | ||||
|       $throughput_args" | ||||
|  | ||||
|     echo "Running test case $test_name" | ||||
|     echo "Throughput command: $throughput_command" | ||||
|     # record the benchmarking command and the GPU type | ||||
|     jq_output=$(jq -n \ | ||||
|       --arg command "$throughput_command" \ | ||||
|       --arg gpu "$gpu_type" \ | ||||
|       '{ | ||||
|         throughput_command: $command, | ||||
|         gpu_type: $gpu | ||||
|       }') | ||||
|     echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" | ||||
|  | ||||
|     # run the benchmark | ||||
|     eval "$throughput_command" | ||||
|  | ||||
|     kill_gpu_processes | ||||
|  | ||||
|   done | ||||
| } | ||||
|  | ||||
| run_serving_tests() { | ||||
|   # run serving tests using `benchmark_serving.py` | ||||
|   # $1: a json file specifying serving test cases | ||||
|  | ||||
|   local serving_test_file | ||||
|   serving_test_file=$1 | ||||
|  | ||||
|   # Iterate over serving tests | ||||
|   jq -c '.[]' "$serving_test_file" | while read -r params; do | ||||
|     # get the test name and make sure it has the expected prefix. | ||||
|     test_name=$(echo "$params" | jq -r '.test_name') | ||||
|     if [[ ! "$test_name" =~ ^serving_ ]]; then | ||||
|       echo "In serving-test.json, test_name must start with \"serving_\"." | ||||
|       exit 1 | ||||
|     fi | ||||
|  | ||||
|     # if TEST_SELECTOR is set, only run the test cases that match the selector | ||||
|     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then | ||||
|       echo "Skip test case $test_name." | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     # get client and server arguments | ||||
|     server_params=$(echo "$params" | jq -r '.server_parameters') | ||||
|     server_envs=$(echo "$params" | jq -r '.server_environment_variables') | ||||
|     client_params=$(echo "$params" | jq -r '.client_parameters') | ||||
|     server_args=$(json2args "$server_params") | ||||
|     server_envs=$(json2envs "$server_envs") | ||||
|     client_args=$(json2args "$client_params") | ||||
|     qps_list=$(echo "$params" | jq -r '.qps_list') | ||||
|     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') | ||||
|     echo "Running over qps list $qps_list" | ||||
|  | ||||
|     # check if there is enough resources to run the test | ||||
|     tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') | ||||
|     if [ "$ON_CPU" == "1" ];then | ||||
|       if [[ $numa_count -lt $tp ]]; then | ||||
|         echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name." | ||||
|         continue | ||||
|       fi | ||||
|     else | ||||
|       if [[ $gpu_count -lt $tp ]]; then | ||||
|         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." | ||||
|         continue | ||||
|       fi | ||||
|     fi | ||||
|  | ||||
|     # check if the server model and the client model are aligned | ||||
|     server_model=$(echo "$server_params" | jq -r '.model') | ||||
|     client_model=$(echo "$client_params" | jq -r '.model') | ||||
|     if [[ $server_model != "$client_model" ]]; then | ||||
|       echo "Server model and client model must be the same. Skip testcase $test_name." | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     server_command="$server_envs python3 \ | ||||
|       -m vllm.entrypoints.openai.api_server \ | ||||
|       $server_args" | ||||
|  | ||||
|     # run the server | ||||
|     echo "Running test case $test_name" | ||||
|     echo "Server command: $server_command" | ||||
|     # support remote vllm server | ||||
|     client_remote_args="" | ||||
|     if [[ -z "${REMOTE_HOST}" ]]; then | ||||
|       bash -c "$server_command" & | ||||
|       server_pid=$! | ||||
|       # wait until the server is alive | ||||
|       if wait_for_server; then | ||||
|         echo "" | ||||
|         echo "vLLM server is up and running." | ||||
|       else | ||||
|         echo "" | ||||
|         echo "vLLM failed to start within the timeout period." | ||||
|       fi | ||||
|     else | ||||
|       server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" | ||||
|       if [[ ${REMOTE_PORT} ]]; then | ||||
|         client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT " | ||||
|       else | ||||
|         client_remote_args=" --host=$REMOTE_HOST " | ||||
|       fi | ||||
|     fi | ||||
|  | ||||
|     # iterate over different QPS | ||||
|     for qps in $qps_list; do | ||||
|       # the jq @sh quoting wraps "inf" in single quotes; normalize it back to a plain inf | ||||
|       if [[ "$qps" == *"inf"* ]]; then | ||||
|         echo "qps was $qps" | ||||
|         qps="inf" | ||||
|         echo "now qps is $qps" | ||||
|       fi | ||||
|  | ||||
|       new_test_name=$test_name"_qps_"$qps | ||||
|  | ||||
|       # pass the tensor parallel size to the client so that it can be displayed | ||||
|       # on the benchmark dashboard | ||||
|       client_command="python3 benchmark_serving.py \ | ||||
|         --save-result \ | ||||
|         --result-dir $RESULTS_FOLDER \ | ||||
|         --result-filename ${new_test_name}.json \ | ||||
|         --request-rate $qps \ | ||||
|         --metadata "tensor_parallel_size=$tp" \ | ||||
|         $client_args $client_remote_args " | ||||
|  | ||||
|       echo "Running test case $test_name with qps $qps" | ||||
|       echo "Client command: $client_command" | ||||
|  | ||||
|       bash -c "$client_command" | ||||
|  | ||||
|       # record the benchmarking commands | ||||
|       jq_output=$(jq -n \ | ||||
|         --arg server "$server_command" \ | ||||
|         --arg client "$client_command" \ | ||||
|         --arg gpu "$gpu_type" \ | ||||
|         '{ | ||||
|           server_command: $server, | ||||
|           client_command: $client, | ||||
|           gpu_type: $gpu | ||||
|         }') | ||||
|       echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" | ||||
|  | ||||
|     done | ||||
|  | ||||
|     # clean up | ||||
|     kill -9 $server_pid | ||||
|     kill_gpu_processes | ||||
|   done | ||||
| } | ||||
|  | ||||
| main() { | ||||
|   local ARCH | ||||
|   ARCH='' | ||||
|   if [ "$ON_CPU" == "1" ];then | ||||
|      check_cpus | ||||
|      ARCH='-cpu' | ||||
|   else | ||||
|      check_gpus | ||||
|   fi | ||||
|   check_hf_token | ||||
|  | ||||
|   # Set ENGINE_VERSION=v1 to run the benchmark with the v1 engine | ||||
|   if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then | ||||
|     export VLLM_USE_V1=1 | ||||
|   fi | ||||
|  | ||||
|   # dependencies | ||||
|   (which wget && which curl) || (apt-get update && apt-get install -y wget curl) | ||||
|   (which jq) || (apt-get update && apt-get -y install jq) | ||||
|   (which lsof) || (apt-get update && apt-get install -y lsof) | ||||
|  | ||||
|   # get the current IP address, required by benchmark_serving.py | ||||
|   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') | ||||
|   # turn off per-request status logging to keep the terminal output clean | ||||
|   export VLLM_LOGGING_LEVEL="WARNING" | ||||
|  | ||||
|   # prepare for benchmarking | ||||
|   cd benchmarks || exit 1 | ||||
|   ensure_sharegpt_downloaded | ||||
|   declare -g RESULTS_FOLDER=results/ | ||||
|   mkdir -p $RESULTS_FOLDER | ||||
|   QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ | ||||
|  | ||||
|   # benchmarking | ||||
|   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" | ||||
|   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" | ||||
|   run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" | ||||
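|   # (SERVING_JSON, LATENCY_JSON and THROUGHPUT_JSON may be set to point at | ||||
|   #  alternative test files; the paths above are the defaults) | ||||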
|  | ||||
|   # postprocess benchmarking results | ||||
|   pip install tabulate pandas | ||||
|   python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py | ||||
|  | ||||
|   upload_to_buildkite | ||||
| } | ||||
|  | ||||
| main "$@" | ||||
| @ -1,82 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import datetime | ||||
| import json | ||||
| import os | ||||
| from pathlib import Path | ||||
|  | ||||
| import pandas as pd | ||||
| from tabulate import tabulate | ||||
|  | ||||
| results_folder = Path("results/") | ||||
|  | ||||
| # serving results and the keys that will be printed into markdown | ||||
| serving_results = [] | ||||
| serving_column_mapping = { | ||||
|     "test_name": "Test name", | ||||
|     "gpu_type": "GPU", | ||||
|     "completed": "Successful req.", | ||||
|     "request_throughput": "Tput (req/s)", | ||||
|     "mean_ttft_ms": "Mean TTFT (ms)", | ||||
|     "std_ttft_ms": "Std TTFT (ms)", | ||||
|     "median_ttft_ms": "Median TTFT (ms)", | ||||
|     "mean_itl_ms": "Mean ITL (ms)", | ||||
|     "std_itl_ms": "Std ITL (ms)", | ||||
|     "median_itl_ms": "Median ITL (ms)", | ||||
|     "mean_tpot_ms": "Mean TPOT (ms)", | ||||
|     "std_tpot_ms": "Std TPOT (ms)", | ||||
|     "median_tpot_ms": "Median TPOT (ms)", | ||||
|     "total_token_throughput": "Total Token Tput (tok/s)", | ||||
|     "output_throughput": "Output Tput (tok/s)", | ||||
|     "total_input_tokens": "Total input tokens", | ||||
|     "total_output_tokens": "Total output tokens", | ||||
|     "engine": "Engine", | ||||
| } | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     # collect results | ||||
|     for test_file in results_folder.glob("*.json"): | ||||
|         with open(test_file) as f: | ||||
|             raw_result = json.loads(f.read()) | ||||
|  | ||||
|         # attach the benchmarking command to raw_result | ||||
|         with open(test_file.with_suffix(".commands")) as f: | ||||
|             command = json.loads(f.read()) | ||||
|         raw_result.update(command) | ||||
|  | ||||
|         # update the test name of this result | ||||
|         raw_result.update({"test_name": test_file.stem}) | ||||
|  | ||||
|         # add the result to the list of serving results | ||||
|         serving_results.append(raw_result) | ||||
|  | ||||
|     serving_results = pd.DataFrame.from_dict(serving_results) | ||||
|  | ||||
|     if not serving_results.empty: | ||||
|         serving_results = serving_results[list(serving_column_mapping.keys())].rename( | ||||
|             columns=serving_column_mapping | ||||
|         ) | ||||
|  | ||||
|     serving_md_table_with_headers = tabulate( | ||||
|         serving_results, headers="keys", tablefmt="pipe", showindex=False | ||||
|     ) | ||||
|     # drop the header and separator rows of the table | ||||
|     serving_md_table_lines = serving_md_table_with_headers.split("\n") | ||||
|     serving_md_table_without_header = "\n".join(serving_md_table_lines[2:]) | ||||
|  | ||||
|     prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") | ||||
|     prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") | ||||
|  | ||||
|     # document benchmarking results in markdown | ||||
|     with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: | ||||
|         # document results with the table header, | ||||
|         # for those who want to reproduce our benchmark. | ||||
|         f.write(serving_md_table_with_headers) | ||||
|         f.write("\n") | ||||
|  | ||||
|     # document benchmarking results in json | ||||
|     with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: | ||||
|         results = serving_results.to_dict(orient="records") | ||||
|         f.write(json.dumps(results)) | ||||
| @ -1,23 +0,0 @@ | ||||
| #!/bin/bash | ||||
| # (bash is required: the script uses [[ ... ]] conditionals) | ||||
| TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) | ||||
| if [[ "$BUILDKITE_BRANCH" == "main" ]]; then | ||||
|     URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" | ||||
| else | ||||
|     URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" | ||||
| fi | ||||
|  | ||||
| TIMEOUT_SECONDS=10 | ||||
|  | ||||
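| # Poll the registry manifest endpoint until the image tagged with | ||||
| # $BUILDKITE_COMMIT is available (HTTP 200); give up after 1000 attempts, | ||||
| # sleeping 5 seconds between attempts. | ||||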
| retries=0 | ||||
| while [ $retries -lt 1000 ]; do | ||||
|     if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then | ||||
|         exit 0 | ||||
|     fi | ||||
|  | ||||
|     echo "Waiting for image to be available..." | ||||
|  | ||||
|     retries=$((retries + 1)) | ||||
|     sleep 5 | ||||
| done | ||||
|  | ||||
| exit 1 | ||||
| @ -1,23 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "llama8B_tp1_genai_perf", | ||||
|         "qps_list": [4,8,16,32], | ||||
|         "common_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-8B-Instruct", | ||||
|             "tp": 1, | ||||
|             "port": 8000, | ||||
|             "num_prompts": 500, | ||||
|             "reuse_server": false | ||||
|         }, | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "genai_perf_input_parameters": { | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,30 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "latency_llama8B_tp1", | ||||
|         "environment_variables": { | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
|             "load_format": "dummy", | ||||
|             "num_iters_warmup": 5, | ||||
|             "num_iters": 15 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "latency_llama8B_tp4", | ||||
|         "environment_variables": { | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
|             "load_format": "dummy", | ||||
|             "num_iters_warmup": 5, | ||||
|             "num_iters": 15 | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,32 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "latency_llama8B_tp1", | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
|             "load_format": "dummy", | ||||
|             "num_iters_warmup": 5, | ||||
|             "num_iters": 15 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "latency_llama70B_tp4", | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
|             "load_format": "dummy", | ||||
|             "num-iters-warmup": 5, | ||||
|             "num-iters": 15 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "latency_mixtral8x7B_tp2", | ||||
|         "parameters": { | ||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", | ||||
|             "tensor_parallel_size": 2, | ||||
|             "load_format": "dummy", | ||||
|             "num-iters-warmup": 5, | ||||
|             "num-iters": 15 | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,323 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "llama8B_tp1_sharegpt", | ||||
|         "qps_list": [4,8,16,32,"inf"], | ||||
|         "common_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-8B-Instruct", | ||||
|             "tp": 1, | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 500, | ||||
|             "port": 8000, | ||||
|             "reuse_server": false | ||||
|         }, | ||||
|         "lmdeploy_server_parameters": { | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "lmdeploy_client_parameters": { | ||||
|         }, | ||||
|         "tgi_server_parameters": { | ||||
|         }, | ||||
|         "tgi_client_parameters": { | ||||
|             "endpoint": "/generate_stream" | ||||
|         }, | ||||
|         "trt_server_parameters": { | ||||
|             "model_type": "llama", | ||||
|             "model_dtype": "bfloat16", | ||||
|             "max_batch_size": 2048, | ||||
|             "max_input_len": 4096, | ||||
|             "max_seq_len": 6144, | ||||
|             "max_num_tokens": 16384, | ||||
|             "trt_llm_version": "v0.11.0" | ||||
|         }, | ||||
|         "trt_client_parameters": { | ||||
|             "endpoint": "/v2/models/ensemble/generate_stream" | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "vllm_client_parameters": { | ||||
|         }, | ||||
|         "sglang_server_parameters": { | ||||
|             "disable_radix_cache": "", | ||||
|             "enable_torch_compile": "", | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "sglang_client_parameters": { | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "llama8B_tp1_sonnet_512_16", | ||||
|         "qps_list": [4,8,16,32,"inf"], | ||||
|         "common_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-8B-Instruct", | ||||
|             "tp": 1, | ||||
|             "dataset_name": "sonnet", | ||||
|             "dataset_path": "./sonnet_4x.txt", | ||||
|             "num_prompts": 500, | ||||
|             "port": 8000, | ||||
|             "sonnet_input_len": 512, | ||||
|             "sonnet_output_len": 16, | ||||
|             "sonnet_prefix_len": 50, | ||||
|             "reuse_server": true | ||||
|         }, | ||||
|         "lmdeploy_server_parameters": { | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "lmdeploy_client_parameters": { | ||||
|         }, | ||||
|         "tgi_server_parameters": { | ||||
|         }, | ||||
|         "tgi_client_parameters": { | ||||
|             "endpoint": "/generate_stream" | ||||
|         }, | ||||
|         "trt_server_parameters": { | ||||
|             "model_type": "llama", | ||||
|             "model_dtype": "bfloat16", | ||||
|             "max_batch_size": 2048, | ||||
|             "max_input_len": 4096, | ||||
|             "max_seq_len": 6144, | ||||
|             "max_num_tokens": 16384, | ||||
|             "trt_llm_version": "v0.11.0" | ||||
|         }, | ||||
|         "trt_client_parameters": { | ||||
|             "endpoint": "/v2/models/ensemble/generate_stream" | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "vllm_client_parameters": { | ||||
|         }, | ||||
|         "sglang_server_parameters": { | ||||
|             "disable_radix_cache": "", | ||||
|             "enable_torch_compile": "", | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "sglang_client_parameters": { | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "llama8B_tp1_sonnet_512_256", | ||||
|         "qps_list": [4,8,16,32,"inf"], | ||||
|         "common_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-8B-Instruct", | ||||
|             "tp": 1, | ||||
|             "dataset_name": "sonnet", | ||||
|             "dataset_path": "./sonnet_4x.txt", | ||||
|             "num_prompts": 500, | ||||
|             "port": 8000, | ||||
|             "sonnet_input_len": 512, | ||||
|             "sonnet_output_len": 256, | ||||
|             "sonnet_prefix_len": 50, | ||||
|             "reuse_server": true | ||||
|         }, | ||||
|         "lmdeploy_server_parameters": { | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "lmdeploy_client_parameters": { | ||||
|         }, | ||||
|         "tgi_server_parameters": { | ||||
|         }, | ||||
|         "tgi_client_parameters": { | ||||
|             "endpoint": "/generate_stream" | ||||
|         }, | ||||
|         "trt_server_parameters": { | ||||
|             "model_type": "llama", | ||||
|             "model_dtype": "bfloat16", | ||||
|             "max_batch_size": 2048, | ||||
|             "max_input_len": 4096, | ||||
|             "max_seq_len": 6144, | ||||
|             "max_num_tokens": 16384, | ||||
|             "trt_llm_version": "v0.11.0" | ||||
|         }, | ||||
|         "trt_client_parameters": { | ||||
|             "endpoint": "/v2/models/ensemble/generate_stream" | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "vllm_client_parameters": { | ||||
|         }, | ||||
|         "sglang_server_parameters": { | ||||
|             "disable_radix_cache": "", | ||||
|             "enable_torch_compile": "", | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "sglang_client_parameters": { | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "llama70B_tp4_sharegpt", | ||||
|         "qps_list": [4,8,16,32,"inf"], | ||||
|         "common_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-70B-Instruct", | ||||
|             "tp": 4, | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 500, | ||||
|             "port": 8000, | ||||
|             "reuse_server": false | ||||
|         }, | ||||
|         "lmdeploy_server_parameters": { | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "lmdeploy_client_parameters": { | ||||
|         }, | ||||
|         "tgi_server_parameters": { | ||||
|         }, | ||||
|         "tgi_client_parameters": { | ||||
|             "endpoint": "/generate_stream" | ||||
|         }, | ||||
|         "trt_server_parameters": { | ||||
|             "model_type": "llama", | ||||
|             "model_dtype": "bfloat16", | ||||
|             "max_batch_size": 2048, | ||||
|             "max_input_len": 4096, | ||||
|             "max_seq_len": 6144, | ||||
|             "max_num_tokens": 16384, | ||||
|             "trt_llm_version": "v0.11.0" | ||||
|         }, | ||||
|         "trt_client_parameters": { | ||||
|             "endpoint": "/v2/models/ensemble/generate_stream" | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "vllm_client_parameters": { | ||||
|         }, | ||||
|         "sglang_server_parameters": { | ||||
|             "disable_radix_cache": "", | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "sglang_client_parameters": { | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "llama70B_tp4_sonnet_512_16", | ||||
|         "qps_list": [4,8,16,32,"inf"], | ||||
|         "common_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-70B-Instruct", | ||||
|             "tp": 4, | ||||
|             "dataset_name": "sonnet", | ||||
|             "dataset_path": "./sonnet_4x.txt", | ||||
|             "num_prompts": 500, | ||||
|             "port": 8000, | ||||
|             "sonnet_input_len": 512, | ||||
|             "sonnet_output_len": 16, | ||||
|             "sonnet_prefix_len": 50, | ||||
|             "reuse_server": true | ||||
|         }, | ||||
|         "lmdeploy_server_parameters": { | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "lmdeploy_client_parameters": { | ||||
|         }, | ||||
|         "tgi_server_parameters": { | ||||
|         }, | ||||
|         "tgi_client_parameters": { | ||||
|             "endpoint": "/generate_stream" | ||||
|         }, | ||||
|         "trt_server_parameters": { | ||||
|             "model_type": "llama", | ||||
|             "model_dtype": "bfloat16", | ||||
|             "max_batch_size": 2048, | ||||
|             "max_input_len": 4096, | ||||
|             "max_seq_len": 6144, | ||||
|             "max_num_tokens": 16384, | ||||
|             "trt_llm_version": "v0.11.0" | ||||
|         }, | ||||
|         "trt_client_parameters": { | ||||
|             "endpoint": "/v2/models/ensemble/generate_stream" | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "vllm_client_parameters": { | ||||
|         }, | ||||
|         "sglang_server_parameters": { | ||||
|             "disable_radix_cache": "", | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "sglang_client_parameters": { | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "llama70B_tp4_sonnet_512_256", | ||||
|         "qps_list": [4,8,16,32,"inf"], | ||||
|         "common_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-70B-Instruct", | ||||
|             "tp": 4, | ||||
|             "dataset_name": "sonnet", | ||||
|             "dataset_path": "./sonnet_4x.txt", | ||||
|             "num_prompts": 500, | ||||
|             "port": 8000, | ||||
|             "sonnet_input_len": 512, | ||||
|             "sonnet_output_len": 256, | ||||
|             "sonnet_prefix_len": 50, | ||||
|             "reuse_server": true | ||||
|         }, | ||||
|         "lmdeploy_server_parameters": { | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "lmdeploy_client_parameters": { | ||||
|         }, | ||||
|         "tgi_server_parameters": { | ||||
|         }, | ||||
|         "tgi_client_parameters": { | ||||
|             "endpoint": "/generate_stream" | ||||
|         }, | ||||
|         "trt_server_parameters": { | ||||
|             "model_type": "llama", | ||||
|             "model_dtype": "bfloat16", | ||||
|             "max_batch_size": 2048, | ||||
|             "max_input_len": 4096, | ||||
|             "max_seq_len": 6144, | ||||
|             "max_num_tokens": 16384, | ||||
|             "trt_llm_version": "v0.11.0" | ||||
|         }, | ||||
|         "trt_client_parameters": { | ||||
|             "endpoint": "/v2/models/ensemble/generate_stream" | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "vllm_client_parameters": { | ||||
|         }, | ||||
|         "sglang_server_parameters": { | ||||
|             "disable_radix_cache": "", | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "sglang_client_parameters": { | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,158 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp1_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
| 	    "enforce_eager": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
| 	    "max_concurrency": 60, | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp2_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
| 	    "enforce_eager": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
| 	    "max_concurrency": 60, | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp4_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
| 	    "enforce_eager": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
| 	    "max_concurrency": 60, | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp4_random_1024_128", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
| 	    "enforce_eager": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 1024, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
| 	    "max_concurrency": 100, | ||||
|             "num_prompts": 100 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_pp6_random_1024_128", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "pipeline_parallel_size": 6, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
| 	    "enforce_eager": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 1024, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
| 	    "max_concurrency": 100, | ||||
|             "num_prompts": 100 | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,81 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp1_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
|             "swap_space": 16, | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama70B_tp4_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
|             "swap_space": 16, | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_mixtral8x7B_tp2_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "server_parameters": { | ||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", | ||||
|             "tensor_parallel_size": 2, | ||||
|             "swap_space": 16, | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama70B_tp4_sharegpt_specdecode", | ||||
|         "qps_list": [2], | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", | ||||
|             "disable_log_requests": "",  | ||||
|             "tensor_parallel_size": 4, | ||||
|             "swap_space": 16, | ||||
|             "speculative_config": { | ||||
|                 "model": "turboderp/Qwama-0.5B-Instruct", | ||||
|                 "num_speculative_tokens": 4, | ||||
|                 "draft_tensor_parallel_size": 1 | ||||
|             } | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200  | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,32 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "throughput_llama8B_tp1", | ||||
|         "environment_variables": { | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
|             "load_format": "dummy", | ||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200, | ||||
|             "backend": "vllm" | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "throughput_llama8B_tp4", | ||||
|         "environment_variables": { | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
|             "load_format": "dummy", | ||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200, | ||||
|             "backend": "vllm" | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,35 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "throughput_llama8B_tp1", | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
|             "load_format": "dummy", | ||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200, | ||||
|             "backend": "vllm" | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "throughput_llama70B_tp4", | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
|             "load_format": "dummy", | ||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200, | ||||
|             "backend": "vllm" | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "throughput_mixtral8x7B_tp2", | ||||
|         "parameters": { | ||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", | ||||
|             "tensor_parallel_size": 2, | ||||
|             "load_format": "dummy", | ||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200, | ||||
|             "backend": "vllm" | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,46 +0,0 @@ | ||||
| # This local pyproject file is part of the migration from yapf to ruff format. | ||||
| # It uses the same core rules as the main pyproject.toml file, but with the | ||||
| # following differences: | ||||
| # - ruff line length is overridden to 88 | ||||
| # - deprecated typing ignores (UP006, UP035) have been removed | ||||
|  | ||||
| [tool.ruff] | ||||
| line-length = 88 | ||||
|  | ||||
| [tool.ruff.lint.per-file-ignores] | ||||
| "vllm/third_party/**" = ["ALL"] | ||||
| "vllm/version.py" = ["F401"] | ||||
| "vllm/_version.py" = ["ALL"] | ||||
|  | ||||
| [tool.ruff.lint] | ||||
| select = [ | ||||
|     # pycodestyle | ||||
|     "E", | ||||
|     # Pyflakes | ||||
|     "F", | ||||
|     # pyupgrade | ||||
|     "UP", | ||||
|     # flake8-bugbear | ||||
|     "B", | ||||
|     # flake8-simplify | ||||
|     "SIM", | ||||
|     # isort | ||||
|     "I", | ||||
|     # flake8-logging-format | ||||
|     "G", | ||||
| ] | ||||
| ignore = [ | ||||
|     # star imports | ||||
|     "F405", "F403", | ||||
|     # lambda expression assignment | ||||
|     "E731", | ||||
|     # Loop control variable not used within loop body | ||||
|     "B007", | ||||
|     # f-string format | ||||
|     "UP032", | ||||
|     # Can remove once 3.10+ is the minimum Python version | ||||
|     "UP007", | ||||
| ] | ||||
|  | ||||
| [tool.ruff.format] | ||||
| docstring-code-format = true | ||||
| @ -1,124 +0,0 @@ | ||||
| steps: | ||||
|   - label: "Build wheel - CUDA 12.8" | ||||
|     id: build-wheel-cuda-12-8 | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   - label: "Build wheel - CUDA 12.6" | ||||
|     id: build-wheel-cuda-12-6 | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. | ||||
|   # However, this block can be uncommented to save some compute hours. | ||||
|   # - block: "Build CUDA 11.8 wheel" | ||||
|   #   key: block-build-cu118-wheel | ||||
|  | ||||
|   - label: "Build wheel - CUDA 11.8" | ||||
|     # depends_on: block-build-cu118-wheel | ||||
|     id: build-wheel-cuda-11-8 | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   - block: "Build release image" | ||||
|     depends_on: ~ | ||||
|     key: block-release-image-build | ||||
|  | ||||
|   - label: "Build release image" | ||||
|     depends_on: block-release-image-build | ||||
|     id: build-release-image | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" | ||||
|  | ||||
|   - label: "Annotate release workflow" | ||||
|     depends_on: | ||||
|       - build-release-image | ||||
|       - build-wheel-cuda-12-8 | ||||
|       - build-wheel-cuda-12-6 | ||||
|       - build-wheel-cuda-11-8 | ||||
|     id: annotate-release-workflow | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "bash .buildkite/scripts/annotate-release.sh" | ||||
|  | ||||
|   - label: "Build and publish TPU release image" | ||||
|     depends_on: ~ | ||||
|     if: build.env("NIGHTLY") == "1" | ||||
|     agents: | ||||
|       queue: tpu_queue_postmerge | ||||
|     commands: | ||||
|       - "yes | docker system prune -a" | ||||
|       - "git fetch --all" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." | ||||
|       - "docker push vllm/vllm-tpu:nightly" | ||||
|       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" | ||||
|     plugins: | ||||
|       - docker-login#v3.0.0: | ||||
|           username: vllmbot | ||||
|           password-env: DOCKERHUB_TOKEN | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   - input: "Provide Release version here" | ||||
|     id: input-release-version | ||||
|     fields: | ||||
|       - text: "What is the release version?" | ||||
|         key: release-version | ||||
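|     # Later steps read this value via "$(buildkite-agent meta-data get release-version)". | ||||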
|  | ||||
|   - block: "Build CPU release image" | ||||
|     key: block-cpu-release-image-build | ||||
|     depends_on: ~ | ||||
|  | ||||
|   - label: "Build and publish CPU release image" | ||||
|     depends_on: block-cpu-release-image-build | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   - block: "Build Neuron release image" | ||||
|     key: block-neuron-release-image-build | ||||
|     depends_on: ~ | ||||
|  | ||||
|   - label: "Build and publish Neuron release image" | ||||
|     depends_on: block-neuron-release-image-build | ||||
|     agents: | ||||
|       queue: neuron-postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest" | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
							
								
								
									
.buildkite/run-amd-test.sh (new file, 73 lines)
							| @ -0,0 +1,73 @@ | ||||
| # This script runs tests inside the corresponding ROCm docker container. | ||||
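| # Usage sketch (hypothetical invocation; the CI pipeline passes the test command as the argument): | ||||
| #   bash .buildkite/run-amd-test.sh "cd tests; pytest -v -s basic_correctness" | ||||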
| set -ex | ||||
|  | ||||
| # Print ROCm version | ||||
| echo "--- ROCm info" | ||||
| rocminfo | ||||
|  | ||||
| # cleanup older docker images | ||||
| cleanup_docker() { | ||||
|   # Get Docker's root directory | ||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||
|   if [ -z "$docker_root" ]; then | ||||
|     echo "Failed to determine Docker root directory." | ||||
|     exit 1 | ||||
|   fi | ||||
|   echo "Docker root directory: $docker_root" | ||||
|   # Check disk usage of the filesystem where Docker's root directory is located | ||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||
|   # Define the threshold | ||||
|   threshold=70 | ||||
|   if [ "$disk_usage" -gt "$threshold" ]; then | ||||
|     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||
|     # Remove dangling images (those that are not tagged and not used by any container) | ||||
|     docker image prune -f | ||||
|     # Remove unused volumes | ||||
|     docker volume prune -f | ||||
|     echo "Docker images and volumes cleanup completed." | ||||
|   else | ||||
|     echo "Disk usage is below $threshold%. No cleanup needed." | ||||
|   fi | ||||
| } | ||||
|  | ||||
| # Call the cleanup docker function | ||||
| cleanup_docker | ||||
|  | ||||
| echo "--- Resetting GPUs" | ||||
|  | ||||
| echo "reset" > /opt/amdgpu/etc/gpu_state | ||||
|  | ||||
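| # Poll the gpu_state file until it reports "clean"; the file is presumably updated by a host-side reset handler. | ||||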
| while true; do | ||||
|         sleep 3 | ||||
|         if grep -q clean /opt/amdgpu/etc/gpu_state; then | ||||
|                 echo "GPUs state is \"clean\"" | ||||
|                 break | ||||
|         fi | ||||
| done | ||||
|  | ||||
| echo "--- Building container" | ||||
| sha=$(git rev-parse --short HEAD) | ||||
| image_name=rocm_${sha} | ||||
| container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo) | ||||
| docker build \ | ||||
|         -t ${image_name} \ | ||||
|         -f Dockerfile.rocm \ | ||||
|         --progress plain \ | ||||
|         . | ||||
|  | ||||
| remove_docker_container() { | ||||
|    docker rm -f ${container_name} || docker image rm -f ${image_name} || true | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| echo "--- Running container" | ||||
|  | ||||
| docker run \ | ||||
|         --device /dev/kfd --device /dev/dri \ | ||||
|         --network host \ | ||||
|         --rm \ | ||||
|         -e HF_TOKEN \ | ||||
|         --name ${container_name} \ | ||||
|         ${image_name} \ | ||||
|         /bin/bash -c "${@}" | ||||
|  | ||||
| @ -1,12 +1,10 @@ | ||||
| #!/bin/bash | ||||
| 
 | ||||
| # This script is run by buildkite to run the benchmarks and upload the results to buildkite | ||||
| 
 | ||||
| set -ex | ||||
| set -o pipefail | ||||
| 
 | ||||
| # cd 2 levels into the working directory | ||||
| cd "$(dirname "${BASH_SOURCE[0]}")/../.." | ||||
| # cd into parent directory of this file | ||||
| cd "$(dirname "${BASH_SOURCE[0]}")/.." | ||||
| 
 | ||||
| (which wget && which curl) || (apt-get update && apt-get install -y wget curl) | ||||
| 
 | ||||
| @ -52,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md | ||||
| sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line | ||||
| echo "" >> benchmark_results.md | ||||
| echo '```' >> benchmark_results.md | ||||
| tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines | ||||
| tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines | ||||
| echo '```' >> benchmark_results.md | ||||
| 
 | ||||
| # if the agent binary is not found, skip uploading the results, exit 0 | ||||
| if [ ! -f /usr/bin/buildkite-agent ]; then | ||||
| if [ ! -f /workspace/buildkite-agent ]; then | ||||
|     exit 0 | ||||
| fi | ||||
| 
 | ||||
| # upload the results to buildkite | ||||
| buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md | ||||
| /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md | ||||
| 
 | ||||
| # exit with the exit code of the benchmarks | ||||
| if [ $bench_latency_exit_code -ne 0 ]; then | ||||
| @ -77,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then | ||||
| fi | ||||
| 
 | ||||
| rm ShareGPT_V3_unfiltered_cleaned_split.json | ||||
| buildkite-agent artifact upload "*.json" | ||||
| /workspace/buildkite-agent artifact upload "*.json" | ||||
							
								
								
									
.buildkite/run-cpu-test.sh (new file, 24 lines)
							| @ -0,0 +1,24 @@ | ||||
| # This script builds the CPU docker image and runs offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Try building the docker image | ||||
| docker build -t cpu-test -f Dockerfile.cpu . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f cpu-test || true; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image | ||||
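| # VLLM_CPU_KVCACHE_SPACE controls the CPU KV-cache size (the value is in GiB). | ||||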
| docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test | ||||
|  | ||||
| # offline inference | ||||
| docker exec cpu-test bash -c "python3 examples/offline_inference.py" | ||||
|  | ||||
| # Run basic model test | ||||
| docker exec cpu-test bash -c "cd tests; | ||||
|   pip install pytest Pillow protobuf | ||||
|   bash ../.buildkite/download-images.sh | ||||
|   cd ../ | ||||
|   pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" | ||||
							
								
								
									
.buildkite/run-neuron-test.sh (new file, 51 lines)
							| @ -0,0 +1,51 @@ | ||||
| # This script builds the Neuron docker image and runs the API server inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -e | ||||
|  | ||||
| # Try building the docker image | ||||
| aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com | ||||
|  | ||||
| # Prune old images and containers to save disk space, but only once a day, | ||||
| # using a timestamp file in /tmp. | ||||
| if [ -f /tmp/neuron-docker-build-timestamp ]; then | ||||
|     last_build=$(cat /tmp/neuron-docker-build-timestamp) | ||||
|     current_time=$(date +%s) | ||||
|     if [ $((current_time - last_build)) -gt 86400 ]; then | ||||
|         docker system prune -f | ||||
|         echo $current_time > /tmp/neuron-docker-build-timestamp | ||||
|     fi | ||||
| else | ||||
|     echo $(date +%s) > /tmp/neuron-docker-build-timestamp | ||||
| fi | ||||
|  | ||||
| docker build -t neuron -f Dockerfile.neuron . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f neuron || true; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image | ||||
| docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ | ||||
|        --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & | ||||
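| # The server is launched in the background (note the trailing &); the health check below polls it until it responds. | ||||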
|  | ||||
| # Wait for the server to start | ||||
| wait_for_server_to_start() { | ||||
|     timeout=300 | ||||
|     counter=0 | ||||
|  | ||||
|     while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do | ||||
|         sleep 1 | ||||
|         counter=$((counter + 1)) | ||||
|         if [ $counter -ge $timeout ]; then | ||||
|             echo "Timeout after $timeout seconds" | ||||
|             break | ||||
|         fi | ||||
|     done | ||||
| } | ||||
| wait_for_server_to_start | ||||
|  | ||||
| # Test a simple prompt | ||||
| curl -X POST -H "Content-Type: application/json" \ | ||||
|     localhost:8000/generate \ | ||||
|     -d '{"prompt": "San Francisco is a"}' | ||||
| @ -1,31 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| # Get release version and strip leading 'v' if present | ||||
| RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//') | ||||
|  | ||||
| if [ -z "$RELEASE_VERSION" ]; then | ||||
|   echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid." | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF | ||||
| To download the wheel: | ||||
| \`\`\` | ||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . | ||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . | ||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .  | ||||
| \`\`\` | ||||
|  | ||||
| To download and upload the image: | ||||
|  | ||||
| \`\`\` | ||||
| docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} | ||||
| docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai | ||||
| docker tag vllm/vllm-openai vllm/vllm-openai:latest | ||||
| docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION} | ||||
| docker push vllm/vllm-openai:latest | ||||
| docker push vllm/vllm-openai:v${RELEASE_VERSION} | ||||
| \`\`\` | ||||
| EOF  | ||||
| @ -1,17 +0,0 @@ | ||||
| #!/bin/bash | ||||
| # Usage: ./ci_clean_log.sh ci.log | ||||
| # This script strips timestamps and color codes from CI log files. | ||||
|  | ||||
| # Check if argument is given | ||||
| if [ $# -lt 1 ]; then | ||||
|     echo "Usage: $0 ci.log" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| INPUT_FILE="$1" | ||||
|  | ||||
| # Strip timestamps | ||||
| sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE" | ||||
|  | ||||
| # Strip colorization | ||||
| sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE" | ||||
| @ -1,244 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script runs tests inside the corresponding ROCm docker container. | ||||
| set -o pipefail | ||||
|  | ||||
| # Export Python path | ||||
| export PYTHONPATH=".." | ||||
|  | ||||
| # Print ROCm version | ||||
| echo "--- Confirming Clean Initial State" | ||||
| while true; do | ||||
|         sleep 3 | ||||
|         if grep -q clean /opt/amdgpu/etc/gpu_state; then | ||||
|                 echo "GPUs state is \"clean\"" | ||||
|                 break | ||||
|         fi | ||||
| done | ||||
|  | ||||
| echo "--- ROCm info" | ||||
| rocminfo | ||||
|  | ||||
| # cleanup older docker images | ||||
| cleanup_docker() { | ||||
|   # Get Docker's root directory | ||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||
|   if [ -z "$docker_root" ]; then | ||||
|     echo "Failed to determine Docker root directory." | ||||
|     exit 1 | ||||
|   fi | ||||
|   echo "Docker root directory: $docker_root" | ||||
|   # Check disk usage of the filesystem where Docker's root directory is located | ||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||
|   # Define the threshold | ||||
|   threshold=70 | ||||
|   if [ "$disk_usage" -gt "$threshold" ]; then | ||||
|     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||
|     # Remove dangling images (those that are not tagged and not used by any container) | ||||
|     docker image prune -f | ||||
|     # Remove unused volumes / force the system prune for old images as well. | ||||
|     docker volume prune -f && docker system prune --force --filter "until=72h" --all | ||||
|     echo "Docker images and volumes cleanup completed." | ||||
|   else | ||||
|     echo "Disk usage is below $threshold%. No cleanup needed." | ||||
|   fi | ||||
| } | ||||
|  | ||||
| # Call the cleanup docker function | ||||
| cleanup_docker | ||||
|  | ||||
| echo "--- Resetting GPUs" | ||||
|  | ||||
| echo "reset" > /opt/amdgpu/etc/gpu_state | ||||
|  | ||||
| while true; do | ||||
|         sleep 3 | ||||
|         if grep -q clean /opt/amdgpu/etc/gpu_state; then | ||||
|                 echo "GPUs state is \"clean\"" | ||||
|                 break | ||||
|         fi | ||||
| done | ||||
|  | ||||
| echo "--- Pulling container"  | ||||
| image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" | ||||
| container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" | ||||
| docker pull "${image_name}" | ||||
|  | ||||
| remove_docker_container() { | ||||
|    docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| echo "--- Running container" | ||||
|  | ||||
| HF_CACHE="$(realpath ~)/huggingface" | ||||
| mkdir -p "${HF_CACHE}" | ||||
| HF_MOUNT="/root/.cache/huggingface" | ||||
|  | ||||
| commands=$@ | ||||
| echo "Commands:$commands" | ||||
|  | ||||
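| # The blocks below use bash pattern substitution (${commands//old/new}) to adapt the upstream test | ||||
| # commands for ROCm, e.g. prepending environment variables or appending --ignore flags. | ||||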
| if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then | ||||
|   commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"} | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then | ||||
|   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then | ||||
|   commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"} | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then | ||||
|   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *"pytest -v -s lora"* ]]; then | ||||
|   commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} | ||||
| fi | ||||
|  | ||||
| #ignore certain kernels tests | ||||
| if [[ $commands == *" kernels/core"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/core/test_fused_quant_layernorm.py \ | ||||
|   --ignore=kernels/core/test_permute_cols.py" | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *" kernels/attention"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/attention/test_attention_selector.py \ | ||||
|   --ignore=kernels/attention/test_blocksparse_attention.py \ | ||||
|   --ignore=kernels/attention/test_encoder_decoder_attn.py \ | ||||
|   --ignore=kernels/attention/test_flash_attn.py \ | ||||
|   --ignore=kernels/attention/test_flashinfer.py \ | ||||
|   --ignore=kernels/attention/test_prefix_prefill.py \ | ||||
|   --ignore=kernels/attention/test_cascade_flash_attn.py \ | ||||
|   --ignore=kernels/attention/test_mha_attn.py \ | ||||
|   --ignore=kernels/attention/test_lightning_attn.py \ | ||||
|   --ignore=kernels/attention/test_attention.py" | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *" kernels/quantization"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/quantization/test_int8_quant.py \ | ||||
|   --ignore=kernels/quantization/test_aqlm.py \ | ||||
|   --ignore=kernels/quantization/test_machete_mm.py \ | ||||
|   --ignore=kernels/quantization/test_block_fp8.py \ | ||||
|   --ignore=kernels/quantization/test_block_int8.py \ | ||||
|   --ignore=kernels/quantization/test_marlin_gemm.py \ | ||||
|   --ignore=kernels/quantization/test_cutlass_scaled_mm.py \ | ||||
|   --ignore=kernels/quantization/test_int8_kernel.py" | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *" kernels/mamba"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/mamba/test_mamba_mixer2.py \ | ||||
|   --ignore=kernels/mamba/test_causal_conv1d.py \ | ||||
|   --ignore=kernels/mamba/test_mamba_ssm_ssd.py" | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *" kernels/moe"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/moe/test_moe.py \ | ||||
|   --ignore=kernels/moe/test_cutlass_moe.py \ | ||||
|   --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py" | ||||
| fi | ||||
|  | ||||
| #ignore certain Entrypoints/openai tests | ||||
| if [[ $commands == *" entrypoints/openai "* ]]; then | ||||
|   commands=${commands//" entrypoints/openai "/" entrypoints/openai \ | ||||
|   --ignore=entrypoints/openai/test_audio.py \ | ||||
|   --ignore=entrypoints/openai/test_shutdown.py \ | ||||
|   --ignore=entrypoints/openai/test_completion.py \ | ||||
|   --ignore=entrypoints/openai/test_sleep.py \ | ||||
|   --ignore=entrypoints/openai/test_models.py \ | ||||
|   --ignore=entrypoints/openai/test_lora_adapters.py \ | ||||
|   --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ | ||||
|   --ignore=entrypoints/openai/test_root_path.py \ | ||||
|   --ignore=entrypoints/openai/test_tokenization.py \ | ||||
|   --ignore=entrypoints/openai/test_prompt_validation.py "} | ||||
| fi | ||||
|  | ||||
| #ignore certain Entrypoints/llm tests | ||||
| if [[ $commands == *" entrypoints/llm "* ]]; then | ||||
|   commands=${commands//" entrypoints/llm "/" entrypoints/llm \ | ||||
|   --ignore=entrypoints/llm/test_chat.py \ | ||||
|   --ignore=entrypoints/llm/test_accuracy.py \ | ||||
|   --ignore=entrypoints/llm/test_init.py \ | ||||
|   --ignore=entrypoints/llm/test_generate_multiple_loras.py \ | ||||
|   --ignore=entrypoints/llm/test_prompt_validation.py "} | ||||
| fi | ||||
|  | ||||
| #Obsolete currently | ||||
| ##ignore certain Entrypoints/llm tests | ||||
| #if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then | ||||
| #  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "} | ||||
| #fi | ||||
|  | ||||
| # --ignore=entrypoints/openai/test_encoder_decoder.py \ | ||||
| # --ignore=entrypoints/openai/test_embedding.py \ | ||||
| # --ignore=entrypoints/openai/test_oot_registration.py | ||||
| # --ignore=entrypoints/openai/test_accuracy.py \ | ||||
| # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13 | ||||
|  | ||||
|  | ||||
| PARALLEL_JOB_COUNT=8 | ||||
| MYPYTHONPATH=".." | ||||
|  | ||||
| # Check if the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs. | ||||
| if [[ $commands == *"--shard-id="* ]]; then | ||||
|   # assign job count as the number of shards used    | ||||
|   commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} | ||||
|   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do | ||||
|     # assign shard-id for each shard | ||||
|     commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} | ||||
|     echo "Shard ${GPU} commands:$commands_gpu" | ||||
|     echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" | ||||
|     docker run \ | ||||
|         --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ | ||||
|         --network=host \ | ||||
|         --shm-size=16gb \ | ||||
|         --rm \ | ||||
|         -e HIP_VISIBLE_DEVICES="${GPU}" \ | ||||
|         -e HF_TOKEN \ | ||||
|         -e AWS_ACCESS_KEY_ID \ | ||||
|         -e AWS_SECRET_ACCESS_KEY \ | ||||
|         -v "${HF_CACHE}:${HF_MOUNT}" \ | ||||
|         -e "HF_HOME=${HF_MOUNT}" \ | ||||
|         -e "PYTHONPATH=${MYPYTHONPATH}" \ | ||||
|         --name "${container_name}_${GPU}" \ | ||||
|         "${image_name}" \ | ||||
|         /bin/bash -c "${commands_gpu}" \ | ||||
|         |& while read -r line; do echo ">>Shard $GPU: $line"; done & | ||||
|     PIDS+=($!) | ||||
|   done | ||||
|   #wait for all processes to finish and collect exit codes | ||||
|   for pid in "${PIDS[@]}"; do | ||||
|     wait "${pid}" | ||||
|     STATUS+=($?) | ||||
|   done | ||||
|   for st in "${STATUS[@]}"; do | ||||
|     if [[ ${st} -ne 0 ]]; then | ||||
|       echo "One of the processes failed with $st" | ||||
|       exit "${st}" | ||||
|     fi | ||||
|   done | ||||
| else | ||||
|   echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" | ||||
|   docker run \ | ||||
|           --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ | ||||
|           --network=host \ | ||||
|           --shm-size=16gb \ | ||||
|           --rm \ | ||||
|           -e HIP_VISIBLE_DEVICES=0 \ | ||||
|           -e HF_TOKEN \ | ||||
|           -e AWS_ACCESS_KEY_ID \ | ||||
|           -e AWS_SECRET_ACCESS_KEY \ | ||||
|           -v "${HF_CACHE}:${HF_MOUNT}" \ | ||||
|           -e "HF_HOME=${HF_MOUNT}" \ | ||||
|           -e "PYTHONPATH=${MYPYTHONPATH}" \ | ||||
|           --name "${container_name}" \ | ||||
|           "${image_name}" \ | ||||
|           /bin/bash -c "${commands}" | ||||
| fi | ||||
| @ -1,49 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the CPU docker image and runs offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|   if [[ -n "$container_id" ]]; then | ||||
|       podman stop --all -t0 | ||||
|       podman rm -f "$container_id" || true | ||||
|   fi | ||||
|   podman system prune -f | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Try building the docker image | ||||
| podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le . | ||||
|  | ||||
| # Run the image | ||||
| container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc) | ||||
|  | ||||
| function cpu_tests() { | ||||
|  | ||||
|   # offline inference | ||||
|   podman exec -it "$container_id" bash -c " | ||||
|     set -e | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" | ||||
|  | ||||
|   # Run basic model test | ||||
|   podman exec -it "$container_id" bash -c " | ||||
|     set -e | ||||
|     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib | ||||
|     pip install sentence-transformers datamodel_code_generator | ||||
|     pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model | ||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] | ||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] | ||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] | ||||
|     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] | ||||
|     pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" | ||||
| } | ||||
|  | ||||
| # All CPU tests are expected to finish in less than 40 minutes. | ||||
|  | ||||
| export container_id | ||||
| export -f cpu_tests | ||||
| timeout 40m bash -c cpu_tests | ||||
|  | ||||
| @ -1,13 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the CPU docker image and runs offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Try building the docker image | ||||
| docker build -t cpu-test -f docker/Dockerfile.s390x . | ||||
| @ -1,102 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the CPU docker image and runs offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # allow to bind to different cores | ||||
| CORE_RANGE=${CORE_RANGE:-48-95} | ||||
| OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} | ||||
| NUMA_NODE=${NUMA_NODE:-1} | ||||
|  | ||||
| export CMAKE_BUILD_PARALLEL_LEVEL=32 | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() {  | ||||
|     set -e;  | ||||
|     docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;  | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Try building the docker image | ||||
| numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . | ||||
| numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . | ||||
|  | ||||
| # Run the image, setting --shm-size=4g for tensor parallel. | ||||
| docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" | ||||
| docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 | ||||
|  | ||||
| function cpu_tests() { | ||||
|   set -e | ||||
|   export NUMA_NODE=$2 | ||||
|  | ||||
|   # list packages | ||||
|   docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " | ||||
|     set -e | ||||
|     pip list" | ||||
|  | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pip list" | ||||
|  | ||||
|   # offline inference | ||||
|   docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " | ||||
|     set -e | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" | ||||
|  | ||||
|   # Run basic model test | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     # Note: disabled until V1 is supported | ||||
|     # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model | ||||
|     # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model | ||||
|  | ||||
|     # Note: disable Bart until it supports V1 | ||||
|     pytest -v -s tests/models/language/generation -m cpu_model \ | ||||
|                 --ignore=tests/models/language/generation/test_bart.py | ||||
|     VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \ | ||||
|                 --ignore=tests/models/language/generation/test_bart.py | ||||
|  | ||||
|     pytest -v -s tests/models/language/pooling -m cpu_model | ||||
|     pytest -v -s tests/models/multimodal/generation \ | ||||
|                 --ignore=tests/models/multimodal/generation/test_mllama.py \ | ||||
|                 --ignore=tests/models/multimodal/generation/test_pixtral.py \ | ||||
|                 -m cpu_model" | ||||
|  | ||||
|   # Run compressed-tensor test | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pytest -s -v \ | ||||
|     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"  | ||||
|  | ||||
|   # Note: disabled until V1 is supported | ||||
|   # Run AWQ test | ||||
|   # docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|   #   set -e | ||||
|   #   VLLM_USE_V1=0 pytest -s -v \ | ||||
|   #   tests/quantization/test_ipex_quant.py" | ||||
|  | ||||
|   # online serving | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &  | ||||
|     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 | ||||
|     VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \ | ||||
|       --backend vllm \ | ||||
|       --dataset-name random \ | ||||
|       --model facebook/opt-125m \ | ||||
|       --num-prompts 20 \ | ||||
|       --endpoint /v1/completions \ | ||||
|       --tokenizer facebook/opt-125m" | ||||
|  | ||||
|   # Run multi-lora tests | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pytest -s -v \ | ||||
|     tests/lora/test_qwen2vl.py" | ||||
| } | ||||
|  | ||||
| # All CPU tests are expected to finish in less than 40 minutes. | ||||
| export -f cpu_tests | ||||
| timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" | ||||
| @ -1,30 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the GH200 docker image and runs offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile | ||||
| python3 use_existing_torch.py | ||||
|  | ||||
| # Try building the docker image | ||||
| DOCKER_BUILDKIT=1 docker build . \ | ||||
|   --file docker/Dockerfile \ | ||||
|   --target vllm-openai \ | ||||
|   --platform "linux/arm64" \ | ||||
|   -t gh200-test \ | ||||
|   --build-arg max_jobs=66 \ | ||||
|   --build-arg nvcc_threads=2 \ | ||||
|   --build-arg RUN_WHEEL_CHECK=false \ | ||||
|   --build-arg torch_cuda_arch_list="9.0+PTX" \ | ||||
|   --build-arg vllm_fa_cmake_gpu_arches="90-real" | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f gh200-test || true; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image and test offline inference | ||||
| docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' | ||||
|     python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B | ||||
| ' | ||||
| @ -1,58 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the HPU plugin docker image and runs the plugin tests inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -exuo pipefail | ||||
|  | ||||
| # Try building the docker image | ||||
| cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - . | ||||
| FROM 1.22-413-pt2.7.1:latest | ||||
|  | ||||
| COPY ./ /workspace/vllm | ||||
|  | ||||
| WORKDIR /workspace/vllm | ||||
|  | ||||
| RUN pip install -v -r requirements/hpu.txt | ||||
| RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git | ||||
|  | ||||
| ENV no_proxy=localhost,127.0.0.1 | ||||
| ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true | ||||
|  | ||||
| RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install | ||||
|  | ||||
| # install development dependencies (for testing) | ||||
| RUN python3 -m pip install -e tests/vllm_test_utils | ||||
|  | ||||
| WORKDIR /workspace/ | ||||
|  | ||||
| RUN git clone https://github.com/vllm-project/vllm-gaudi.git | ||||
|  | ||||
| RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks | ||||
|  | ||||
| EOF | ||||
|  | ||||
| # Setup cleanup | ||||
| # Certain versions of the HPU software stack have a bug that can | ||||
| # override the exit code of the script, so we need to use | ||||
| # separate remove_docker_containers and remove_docker_containers_and_exit | ||||
| # functions, while other platforms only need a single | ||||
| # remove_docker_container function. | ||||
| EXITCODE=1 | ||||
| remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; } | ||||
| trap 'remove_docker_containers; exit $EXITCODE;' EXIT | ||||
| remove_docker_containers | ||||
|  | ||||
| echo "Running HPU plugin v1 test" | ||||
| docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \ | ||||
|   -e HABANA_VISIBLE_DEVICES=all \ | ||||
|   hpu-plugin-v1-test-env \ | ||||
|   /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh" | ||||
|  | ||||
| EXITCODE=$? | ||||
| if [ $EXITCODE -eq 0 ]; then | ||||
|   echo "Test with basic model passed" | ||||
| else | ||||
|   echo "Test with basic model FAILED with exit code: $EXITCODE" >&2 | ||||
| fi | ||||
|  | ||||
| # The trap will handle the container removal and final exit. | ||||
| @ -1,64 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the Neuron docker image and runs the API server inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -e | ||||
| set -v | ||||
|  | ||||
| image_name="neuron/vllm-ci" | ||||
| container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" | ||||
|  | ||||
| HF_CACHE="$(realpath ~)/huggingface" | ||||
| mkdir -p "${HF_CACHE}" | ||||
| HF_MOUNT="/root/.cache/huggingface" | ||||
| HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN) | ||||
|  | ||||
| NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache" | ||||
| mkdir -p "${NEURON_COMPILE_CACHE_URL}" | ||||
| NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache" | ||||
|  | ||||
| # Try building the docker image | ||||
| aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws | ||||
|  | ||||
| # Prune old images and containers to save disk space, but only once a day, | ||||
| # using a timestamp file in /tmp. | ||||
| if [ -f /tmp/neuron-docker-build-timestamp ]; then | ||||
|     last_build=$(cat /tmp/neuron-docker-build-timestamp) | ||||
|     current_time=$(date +%s) | ||||
|     if [ $((current_time - last_build)) -gt 86400 ]; then | ||||
|         # Remove dangling images (those that are not tagged and not used by any container) | ||||
|         docker image prune -f | ||||
|         # Remove unused volumes / force the system prune for old images as well. | ||||
|         docker volume prune -f && docker system prune -f | ||||
|         echo "$current_time" > /tmp/neuron-docker-build-timestamp | ||||
|     fi | ||||
| else | ||||
|     date "+%s" > /tmp/neuron-docker-build-timestamp | ||||
| fi | ||||
|  | ||||
| docker build -t "${image_name}" -f docker/Dockerfile.neuron . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|     docker image rm -f "${image_name}" || true; | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Run the image | ||||
| docker run --rm -it --device=/dev/neuron0 --network bridge \ | ||||
|        -v "${HF_CACHE}:${HF_MOUNT}" \ | ||||
|        -e "HF_HOME=${HF_MOUNT}" \ | ||||
|        -e "HF_TOKEN=${HF_TOKEN}" \ | ||||
|        -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \ | ||||
|        -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ | ||||
|        --name "${container_name}" \ | ||||
|        ${image_name} \ | ||||
|        /bin/bash -c " | ||||
|             set -e; # Exit on first error | ||||
|             python3 /workspace/vllm/examples/offline_inference/neuron.py; | ||||
|             python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys; | ||||
|             for f in /workspace/vllm/tests/neuron/2_core/*.py; do | ||||
|                 echo \"Running test file: \$f\"; | ||||
|                 python3 -m pytest \$f -v --capture=tee-sys; | ||||
|             done | ||||
|        " | ||||
| @ -1,187 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -xu | ||||
|  | ||||
|  | ||||
| remove_docker_container() {  | ||||
|     docker rm -f tpu-test || true;  | ||||
|     docker rm -f vllm-tpu || true; | ||||
| } | ||||
|  | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Remove containers that might not have been cleaned up in a previous run. | ||||
| remove_docker_container | ||||
|  | ||||
| # Build the docker image. | ||||
| docker build -f docker/Dockerfile.tpu -t vllm-tpu . | ||||
|  | ||||
| # Set up cleanup. | ||||
| cleanup_docker() { | ||||
|   # Get Docker's root directory | ||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||
|   if [ -z "$docker_root" ]; then | ||||
|     echo "Failed to determine Docker root directory." | ||||
|     exit 1 | ||||
|   fi | ||||
|   echo "Docker root directory: $docker_root" | ||||
|   # Check disk usage of the filesystem where Docker's root directory is located | ||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||
|   # Define the threshold | ||||
|   threshold=70 | ||||
|   if [ "$disk_usage" -gt "$threshold" ]; then | ||||
|     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||
|     # Remove dangling images (those that are not tagged and not used by any container) | ||||
|     docker image prune -f | ||||
|     # Remove unused volumes / force the system prune for old images as well. | ||||
|     docker volume prune -f && docker system prune --force --filter "until=72h" --all | ||||
|     echo "Docker images and volumes cleanup completed." | ||||
|   else | ||||
|     echo "Disk usage is below $threshold%. No cleanup needed." | ||||
|   fi | ||||
| } | ||||
| cleanup_docker | ||||
|  | ||||
| # For HF_TOKEN. | ||||
| source /etc/environment | ||||
|  | ||||
| docker run --privileged --net host --shm-size=16G -it \ | ||||
|     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ | ||||
|     vllm-tpu /bin/bash -c ' | ||||
| set -e # Exit immediately if a command exits with a non-zero status. | ||||
| set -u # Treat unset variables as an error. | ||||
|  | ||||
| echo "--- Starting script inside Docker container ---" | ||||
|  | ||||
| # Create results directory | ||||
| RESULTS_DIR=$(mktemp -d) | ||||
| # If mktemp fails, set -e will cause the script to exit. | ||||
| echo "Results will be stored in: $RESULTS_DIR" | ||||
|  | ||||
| # Install dependencies | ||||
| echo "--- Installing Python dependencies ---" | ||||
| python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ | ||||
|     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ | ||||
|     && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 | ||||
| echo "--- Python dependencies installed ---" | ||||
| export VLLM_USE_V1=1 | ||||
| export VLLM_XLA_CHECK_RECOMPILATION=1 | ||||
| export VLLM_XLA_CACHE_PATH= | ||||
| echo "Using VLLM V1" | ||||
|  | ||||
| echo "--- Hardware Information ---" | ||||
| tpu-info | ||||
| echo "--- Starting Tests ---" | ||||
| set +e | ||||
| overall_script_exit_code=0 | ||||
|  | ||||
| # --- Test Definitions --- | ||||
| # If a test fails, this function will print logs and will not cause the main script to exit. | ||||
| run_test() { | ||||
|     local test_num=$1 | ||||
|     local test_name=$2 | ||||
|     local test_command=$3 | ||||
|     local log_file="$RESULTS_DIR/test_${test_num}.log" | ||||
|     local actual_exit_code | ||||
|  | ||||
|     echo "--- TEST_$test_num: Running $test_name ---" | ||||
|      | ||||
|     # Execute the test command. | ||||
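|     # stdout and stderr are duplicated into the per-test log via process substitution so the log can be replayed on failure. | ||||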
|     eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) | ||||
|     actual_exit_code=$? | ||||
|  | ||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log | ||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log | ||||
|  | ||||
|     if [ "$actual_exit_code" -ne 0 ]; then | ||||
|         echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 | ||||
|         echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 | ||||
|         if [ -f "$log_file" ]; then | ||||
|             cat "$log_file" >&2 | ||||
|         else | ||||
|             echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 | ||||
|         fi | ||||
|         echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 | ||||
|         return "$actual_exit_code" # Return the failure code | ||||
|     else | ||||
|         echo "TEST_$test_num ($test_name) PASSED." | ||||
|         return 0 # Return success | ||||
|     fi | ||||
| } | ||||
|  | ||||
| # Helper function to call run_test and update the overall script exit code | ||||
| run_and_track_test() { | ||||
|     local test_num_arg="$1" | ||||
|     local test_name_arg="$2" | ||||
|     local test_command_arg="$3" | ||||
|  | ||||
|     # Run the test | ||||
|     run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" | ||||
|     local test_specific_exit_code=$? | ||||
|  | ||||
|     # If the test failed, set the overall script exit code to 1 | ||||
|     if [ "$test_specific_exit_code" -ne 0 ]; then | ||||
|         # No need for extra echo here, run_test already logged the failure. | ||||
|         overall_script_exit_code=1 | ||||
|     fi | ||||
| } | ||||
|  | ||||
| # --- Actual Test Execution --- | ||||
| run_and_track_test 0 "test_perf.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py" | ||||
| run_and_track_test 1 "test_compilation.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py" | ||||
| run_and_track_test 2 "test_basic.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" | ||||
| run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" | ||||
| run_and_track_test 4 "test_quantization_accuracy.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" | ||||
| run_and_track_test 5 "examples/offline_inference/tpu.py" \ | ||||
|     "python3 /workspace/vllm/examples/offline_inference/tpu.py" | ||||
| run_and_track_test 6 "test_tpu_model_runner.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" | ||||
| run_and_track_test 7 "test_sampler.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" | ||||
| run_and_track_test 8 "test_topk_topp_sampler.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py" | ||||
| run_and_track_test 9 "test_multimodal.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" | ||||
| run_and_track_test 10 "test_pallas.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" | ||||
| run_and_track_test 11 "test_struct_output_generate.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" | ||||
| run_and_track_test 12 "test_moe_pallas.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" | ||||
| run_and_track_test 13 "test_lora.py" \ | ||||
|     "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" | ||||
| run_and_track_test 14 "test_tpu_qkv_linear.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" | ||||
| run_and_track_test 15 "test_spmd_model_weight_loading.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" | ||||
| run_and_track_test 16 "test_kv_cache_update_kernel.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" | ||||
|  | ||||
| # After all tests have been attempted, exit with the overall status. | ||||
| if [ "$overall_script_exit_code" -ne 0 ]; then | ||||
|     echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" | ||||
| else | ||||
|     echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" | ||||
| fi | ||||
| exit "$overall_script_exit_code" | ||||
| ' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. | ||||
|  | ||||
| # Capture the exit code of the docker run command | ||||
| DOCKER_RUN_EXIT_CODE=$? | ||||
|  | ||||
| # The trap will run for cleanup. | ||||
| # Exit the main script with the Docker run command's exit code. | ||||
| if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then | ||||
|     echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." | ||||
|     exit "$DOCKER_RUN_EXIT_CODE" | ||||
| else | ||||
|     echo "Docker run command completed successfully." | ||||
|     exit 0 | ||||
| fi | ||||
| # TODO: This test fails because it uses RANDOM_SEED sampling | ||||
| # pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ | ||||
| @ -1,34 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the XPU docker image and runs offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" | ||||
| container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" | ||||
|  | ||||
| # Try building the docker image | ||||
| docker build -t ${image_name} -f docker/Dockerfile.xpu . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|   docker rm -f "${container_name}" || true; | ||||
|   docker image rm -f "${image_name}" || true; | ||||
|   docker system prune -f || true; | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Run the image and test offline inference/tensor parallel | ||||
| docker run \ | ||||
|     --device /dev/dri \ | ||||
|     -v /dev/dri/by-path:/dev/dri/by-path \ | ||||
|     --entrypoint="" \ | ||||
|     --name "${container_name}" \ | ||||
|     "${image_name}" \ | ||||
|     sh -c ' | ||||
|     VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager | ||||
|     VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray | ||||
|     VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp | ||||
|     cd tests | ||||
|     pytest -v -s v1/core | ||||
| ' | ||||
| @ -1,18 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # Usage: ./rerun_test.sh path/to/test.py::test_name | ||||
|  | ||||
| # Check if argument is given | ||||
| if [ $# -lt 1 ]; then | ||||
|     echo "Usage: $0 path/to/test.py::test_name" | ||||
|     echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| TEST=$1 | ||||
| COUNT=1 | ||||
|  | ||||
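| # Re-run the test until it fails; useful for reproducing flaky failures. | ||||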
| while pytest -sv "$TEST"; do | ||||
|     COUNT=$((COUNT + 1)) | ||||
|     echo "RUN NUMBER ${COUNT}" | ||||
| done | ||||
| @ -1,108 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -euox pipefail | ||||
|  | ||||
| if [[ $# -lt 4 ]]; then | ||||
|     echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| WORKING_DIR=$1 | ||||
| NUM_NODES=$2 | ||||
| NUM_GPUS=$3 | ||||
| DOCKER_IMAGE=$4 | ||||
|  | ||||
| shift 4 | ||||
| COMMANDS=("$@") | ||||
| if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then | ||||
|     echo "The number of commands must be equal to the number of nodes." | ||||
|     echo "Number of nodes: $NUM_NODES" | ||||
|     echo "Number of commands: ${#COMMANDS[@]}" | ||||
|     exit 1 | ||||
| fi | ||||
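| # Hypothetical invocation (working dir, image name, and commands are illustrative only): | ||||
| #   .buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 4 my-vllm-image "pytest -v -s distributed" "pytest -v -s distributed" | ||||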
|  | ||||
| echo "List of commands" | ||||
| for command in "${COMMANDS[@]}"; do | ||||
|     echo "$command" | ||||
| done | ||||
|  | ||||
| start_network() { | ||||
|     docker network create --subnet=192.168.10.0/24 docker-net | ||||
| } | ||||
|  | ||||
| start_nodes() { | ||||
|     for node in $(seq 0 $(($NUM_NODES-1))); do | ||||
|         GPU_DEVICES='"device=' | ||||
|         for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do | ||||
|             DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) | ||||
|             GPU_DEVICES+=$(($DEVICE_NUM)) | ||||
|             if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then | ||||
|                 GPU_DEVICES+=',' | ||||
|             fi | ||||
|         done | ||||
|         GPU_DEVICES+='"' | ||||
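|         # e.g. with NUM_GPUS=4, node 1 ends up with GPU_DEVICES='"device=4,5,6,7"' | ||||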
|  | ||||
|         # start the container in detached mode | ||||
|         # things to note: | ||||
|         # 1. --shm-size=10.24gb is required. don't use --ipc=host | ||||
|         # 2. pass HF_TOKEN to the container | ||||
|         # 3. map the huggingface cache directory to the container | ||||
|         # 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: | ||||
|         #    starting from 192.168.10.11) | ||||
|         docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ | ||||
|             -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ | ||||
|             --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ | ||||
|             /bin/bash -c "tail -f /dev/null" | ||||
|  | ||||
|         # organize containers into a ray cluster | ||||
|         if [ "$node" -eq 0 ]; then | ||||
|             # start the ray head node | ||||
|             docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block" | ||||
|             # wait for the head node to be ready | ||||
|             sleep 10 | ||||
|         else | ||||
|             # start the ray worker nodes, and connect them to the head node | ||||
|             docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block" | ||||
|         fi | ||||
|     done | ||||
|  | ||||
|     # wait for the cluster to be ready | ||||
|     sleep 10 | ||||
|  | ||||
|     # print the cluster status | ||||
|     docker exec node0 /bin/bash -c "ray status" | ||||
| } | ||||
|  | ||||
| run_nodes() { | ||||
|     # important: iterate in reverse order to start the head node last | ||||
|     # we start the worker nodes first, in detached mode, and then start the head node | ||||
|     # in the foreground, so that the output of the head node is visible in the buildkite logs | ||||
|     for node in $(seq $(($NUM_NODES - 1)) -1 0); do | ||||
|         GPU_DEVICES='"device=' | ||||
|         for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do | ||||
|             DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) | ||||
|             GPU_DEVICES+=$(($DEVICE_NUM)) | ||||
|             if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then | ||||
|                 GPU_DEVICES+=',' | ||||
|             fi | ||||
|         done | ||||
|         GPU_DEVICES+='"' | ||||
|         echo "Running node$node with GPU devices: $GPU_DEVICES" | ||||
|         if [ "$node" -ne 0 ]; then | ||||
|             docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" | ||||
|         else | ||||
|             docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" | ||||
|         fi | ||||
|     done | ||||
| } | ||||
| cleanup() { | ||||
|     for node in $(seq 0 $(($NUM_NODES-1))); do | ||||
|         docker stop "node$node" | ||||
|     done | ||||
|     docker network rm docker-net | ||||
| } | ||||
| trap cleanup EXIT | ||||
| start_network | ||||
| start_nodes | ||||
| run_nodes | ||||
|  | ||||
| @ -1,24 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -euo pipefail | ||||
|  | ||||
| docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||
| if [ -z "$docker_root" ]; then | ||||
|   echo "Failed to determine Docker root directory." | ||||
|   exit 1 | ||||
| fi | ||||
| echo "Docker root directory: $docker_root" | ||||
| # Check disk usage of the filesystem where Docker's root directory is located | ||||
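| # (df prints the use percentage, e.g. "73%", in column 5; sed strips the "%" to leave a bare integer) | ||||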
| disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||
| # Define the threshold | ||||
| threshold=70 | ||||
| if [ "$disk_usage" -gt "$threshold" ]; then | ||||
|   echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||
|   # Remove dangling images (those that are not tagged and not used by any container) | ||||
|   docker image prune -f | ||||
|   # Remove unused volumes / force the system prune for old images as well. | ||||
|   docker volume prune -f && docker system prune --force --filter "until=72h" --all | ||||
|   echo "Docker images and volumes cleanup completed." | ||||
| else | ||||
|   echo "Disk usage is below $threshold%. No cleanup needed." | ||||
| fi | ||||
| @ -1,14 +0,0 @@ | ||||
| # Environment config | ||||
| TEST_NAME=llama8b | ||||
| CONTAINER_NAME=vllm-tpu | ||||
|  | ||||
| # vllm config | ||||
| MODEL=meta-llama/Llama-3.1-8B-Instruct | ||||
| MAX_NUM_SEQS=256 | ||||
| MAX_NUM_BATCHED_TOKENS=1024 | ||||
| TENSOR_PARALLEL_SIZE=1 | ||||
| MAX_MODEL_LEN=2048 | ||||
| DOWNLOAD_DIR=/mnt/disks/persist | ||||
| EXPECTED_THROUGHPUT=8.0 | ||||
| INPUT_LEN=1800 | ||||
| OUTPUT_LEN=128 | ||||
| @ -1,92 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| if [ ! -f "$1" ]; then | ||||
|   echo "Error: The env file '$1' does not exist." | ||||
|   exit 1  # Exit the script with a non-zero status to indicate an error | ||||
| fi | ||||
|  | ||||
| ENV_FILE=$1 | ||||
|  | ||||
| # For testing on local vm, use `set -a` to export all variables | ||||
| source /etc/environment | ||||
| source $ENV_FILE | ||||
|  | ||||
| remove_docker_container() {  | ||||
|     docker rm -f tpu-test || true;  | ||||
|     docker rm -f vllm-tpu || true; | ||||
|     docker rm -f $CONTAINER_NAME || true; | ||||
| } | ||||
|  | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Remove the container that might not be cleaned up in the previous run. | ||||
| remove_docker_container | ||||
|  | ||||
| LOG_ROOT=$(mktemp -d) | ||||
| # Note: this script does not use set -e, so a mktemp failure is not fatal here. | ||||
| echo "Results will be stored in: $LOG_ROOT" | ||||
|  | ||||
| if [ -z "$HF_TOKEN" ]; then | ||||
|   echo "Error: HF_TOKEN is not set or is empty."   | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| # Make sure mounted disk or dir exists | ||||
| if [ ! -d "$DOWNLOAD_DIR" ]; then | ||||
|     echo "Error: Folder $DOWNLOAD_DIR does not exist. This is usually a mounted drive. If there is no mounted drive, just create the folder." | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| echo "Run model $MODEL" | ||||
| echo | ||||
|  | ||||
| echo "starting docker...$CONTAINER_NAME" | ||||
| echo     | ||||
| docker run \ | ||||
|  -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \ | ||||
|  --env-file $ENV_FILE \ | ||||
|  -e HF_TOKEN="$HF_TOKEN" \ | ||||
|  -e TARGET_COMMIT=$BUILDKITE_COMMIT \ | ||||
|  -e MODEL=$MODEL \ | ||||
|  -e WORKSPACE=/workspace \ | ||||
|  --name $CONTAINER_NAME \ | ||||
|  -d \ | ||||
|  --privileged \ | ||||
|  --network host \ | ||||
|  -v /dev/shm:/dev/shm \ | ||||
|  vllm/vllm-tpu-bm tail -f /dev/null | ||||
|  | ||||
| echo "run script..." | ||||
| echo | ||||
| docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh" | ||||
|  | ||||
| echo "copy result back..." | ||||
| VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt | ||||
| BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt | ||||
| docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"  | ||||
| docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG" | ||||
|  | ||||
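| # extract the numeric value from the "Request throughput (req/s): <value>" line of the benchmark log | ||||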
| throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') | ||||
| echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput" | ||||
|  | ||||
| if [ "$BUILDKITE" = "true" ]; then | ||||
|   echo "Running inside Buildkite" | ||||
|   buildkite-agent artifact upload "$VLLM_LOG"  | ||||
|   buildkite-agent artifact upload "$BM_LOG" | ||||
| else | ||||
|   echo "Not running inside Buildkite" | ||||
| fi | ||||
|  | ||||
| # | ||||
| # compare the measured throughput with EXPECTED_THROUGHPUT | ||||
| # and fail if it does not meet the expectation | ||||
| #  | ||||
| if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then | ||||
|   echo "Failed to get the throughput" | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
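| # bash arithmetic is integer-only, so use bc for the floating-point comparison | ||||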
| if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then | ||||
|   echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)" | ||||
|   exit 1 | ||||
| fi | ||||
| @ -1,14 +0,0 @@ | ||||
| # Environment config | ||||
| TEST_NAME=llama8bw8a8 | ||||
| CONTAINER_NAME=vllm-tpu | ||||
|  | ||||
| # vllm config | ||||
| MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | ||||
| MAX_NUM_SEQS=128 | ||||
| MAX_NUM_BATCHED_TOKENS=1024 | ||||
| TENSOR_PARALLEL_SIZE=1 | ||||
| MAX_MODEL_LEN=2048 | ||||
| DOWNLOAD_DIR=/mnt/disks/persist | ||||
| EXPECTED_THROUGHPUT=10.0 | ||||
| INPUT_LEN=1800 | ||||
| OUTPUT_LEN=128 | ||||
| @ -1,94 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -euo pipefail | ||||
|  | ||||
| VLLM_LOG="$WORKSPACE/vllm_log.txt" | ||||
| BM_LOG="$WORKSPACE/bm_log.txt" | ||||
|  | ||||
| if [ -n "$TARGET_COMMIT" ]; then | ||||
|   head_hash=$(git rev-parse HEAD) | ||||
|   if [ "$TARGET_COMMIT" != "$head_hash" ]; then | ||||
|     echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash" | ||||
|     exit 1 | ||||
|   fi | ||||
| fi | ||||
|  | ||||
| echo "model: $MODEL" | ||||
| echo | ||||
|  | ||||
| # | ||||
| # create a log folder | ||||
| # | ||||
| mkdir "$WORKSPACE/log" | ||||
|  | ||||
| # TODO: Move to image building. | ||||
| pip install pandas | ||||
| pip install datasets | ||||
|  | ||||
| # | ||||
| # create sonnet_4x | ||||
| # | ||||
| echo "Create sonnet_4x.txt" | ||||
| echo "" > benchmarks/sonnet_4x.txt | ||||
| for _ in {1..4} | ||||
|  do | ||||
|   cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt | ||||
| done | ||||
|  | ||||
| # | ||||
| # start vllm service in the background | ||||
| # | ||||
| echo "launching vllm..." | ||||
| echo "logging to $VLLM_LOG" | ||||
| echo | ||||
|  | ||||
| VLLM_USE_V1=1 vllm serve $MODEL \ | ||||
|  --seed 42 \ | ||||
|  --disable-log-requests \ | ||||
|  --max-num-seqs $MAX_NUM_SEQS \ | ||||
|  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ | ||||
|  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ | ||||
|  --no-enable-prefix-caching \ | ||||
|  --download_dir $DOWNLOAD_DIR \ | ||||
|  --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 & | ||||
|  | ||||
|  | ||||
| echo "waiting up to 20 minutes for the server to start..." | ||||
| echo | ||||
| # sleep 1200 | ||||
| # poll the log every 10 seconds, up to 120 times (20 minutes total) | ||||
| for i in {1..120}; do | ||||
|     # TODO: detect other types of errors. | ||||
|     if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then | ||||
|         echo "Detected RuntimeError, exiting." | ||||
|         exit 1 | ||||
|     elif grep -Fq "Application startup complete" "$VLLM_LOG"; then | ||||
|         echo "Application started" | ||||
|         break | ||||
|     else | ||||
|         echo "wait for 10 seconds..." | ||||
|         sleep 10 | ||||
|     fi | ||||
| done | ||||
|  | ||||
| # | ||||
| # run test | ||||
| # | ||||
| echo "run benchmark test..." | ||||
| echo "logging to $BM_LOG" | ||||
| echo | ||||
| python benchmarks/benchmark_serving.py \ | ||||
|     --backend vllm \ | ||||
|     --model $MODEL  \ | ||||
|     --dataset-name sonnet \ | ||||
|     --dataset-path benchmarks/sonnet_4x.txt \ | ||||
|     --sonnet-input-len $INPUT_LEN \ | ||||
|     --sonnet-output-len $OUTPUT_LEN \ | ||||
|     --ignore-eos > "$BM_LOG" | ||||
|  | ||||
| echo "completed..." | ||||
| echo | ||||
|  | ||||
| throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') | ||||
| echo "throughput: $throughput" | ||||
| echo | ||||
| @ -1,78 +0,0 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| # Assume wheels are in artifacts/dist/*.whl | ||||
| wheel_files=(artifacts/dist/*.whl) | ||||
|  | ||||
| # Check that exactly one wheel is found | ||||
| if [[ ${#wheel_files[@]} -ne 1 ]]; then | ||||
|   echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| # Get the single wheel file | ||||
| wheel="${wheel_files[0]}" | ||||
|  | ||||
| # Rename 'linux' to 'manylinux1' in the wheel filename | ||||
| new_wheel="${wheel/linux/manylinux1}" | ||||
| mv -- "$wheel" "$new_wheel" | ||||
| wheel="$new_wheel" | ||||
|  | ||||
| # Extract the version from the wheel | ||||
| version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) | ||||
| echo "Version: $version" | ||||
|  | ||||
| normal_wheel="$wheel" # Save the original wheel filename | ||||
|  | ||||
| # If the version contains "dev", rename it to 1.0.0.dev for consistency | ||||
| if [[ $version == *dev* ]]; then | ||||
|     suffix="${version##*.}" | ||||
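|     # if the last version component is a CUDA tag (e.g. "cu126"), keep it as a local version suffix | ||||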
|     if [[ $suffix == cu* ]]; then | ||||
|         new_version="1.0.0.dev+${suffix}" | ||||
|     else | ||||
|         new_version="1.0.0.dev" | ||||
|     fi | ||||
|     new_wheel="${wheel/$version/$new_version}" | ||||
|     # use cp to keep both files in the artifacts directory | ||||
|     cp -- "$wheel" "$new_wheel" | ||||
|     wheel="$new_wheel" | ||||
|     version="$new_version" | ||||
| fi | ||||
|  | ||||
| # generate index for this commit | ||||
| python3 .buildkite/generate_index.py --wheel "$normal_wheel" | ||||
|  | ||||
| # Upload the wheels to S3 | ||||
| aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" | ||||
| aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" | ||||
|  | ||||
| if [[ $normal_wheel == *"cu118"* ]]; then | ||||
|     # if $normal_wheel matches cu118, do not upload the index.html | ||||
|     echo "Skipping index files for cu118 wheels" | ||||
| elif [[ $normal_wheel == *"cu126"* ]]; then | ||||
|     # if $normal_wheel matches cu126, do not upload the index.html | ||||
|     echo "Skipping index files for cu126 wheels" | ||||
| else | ||||
|     # only upload index.html for cu128 wheels (default wheels) | ||||
|     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" | ||||
|     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" | ||||
| fi | ||||
|  | ||||
| # upload the wheels (and index) for nightly | ||||
| aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" | ||||
| aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" | ||||
|  | ||||
| if [[ $normal_wheel == *"cu118"* ]]; then | ||||
|     # if $normal_wheel matches cu118, do not upload the index.html | ||||
|     echo "Skipping index files for cu118 wheels" | ||||
| elif [[ $normal_wheel == *"cu126"* ]]; then | ||||
|     # if $normal_wheel matches cu126, do not upload the index.html | ||||
|     echo "Skipping index files for cu126 wheels" | ||||
| else | ||||
|     # only upload index.html for cu128 wheels (default wheels) | ||||
|     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" | ||||
| fi | ||||
|  | ||||
| aws s3 cp "$wheel" "s3://vllm-wheels/$version/" | ||||
| aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" | ||||
| @ -1,834 +1,166 @@ | ||||
| # In this file, you can add more tests to run either by adding a new step or | ||||
| # adding a new command to an existing step. See different options here for examples. | ||||
|  | ||||
| # This script will be fed into the Jinja template in `test-template-aws.j2` at | ||||
| # https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 | ||||
| # to generate the final pipeline yaml file. | ||||
|  | ||||
| # Documentation | ||||
| # label(str): the name of the test. emoji allowed. | ||||
| # fast_check(bool): whether to run this on each commit on fastcheck pipeline. | ||||
| # torch_nightly(bool): whether to run this test in the vllm-against-torch-nightly pipeline. | ||||
| # fast_check_only(bool): run this test on fastcheck pipeline only | ||||
| # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run. | ||||
| # command(str): the single command to run for the test. incompatible with commands. | ||||
| # commands(list): the list of commands to run for the test. incompatible with command. | ||||
| # mirror_hardwares(list): the list of additional hardware to run the test on. currently only supports [amd] | ||||
| # gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100 | ||||
| # num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently supports 2,4. | ||||
| # num_nodes(int): whether to simulate a multi-node setup by launching multiple containers on one host; | ||||
| #     in this case, commands must be specified. the first command runs on the first host, the second | ||||
| #     command runs on the second host. | ||||
| # working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests | ||||
| # source_file_dependencies(list): the list of path prefixes that opt the test in; if empty, the test will always run. | ||||
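| # | ||||
| # For illustration, a hypothetical step using the fields above might look like the | ||||
| # following sketch (the label, paths, and command are made up, not a real test in this file): | ||||
| # | ||||
| # - label: My Feature Test # 5min | ||||
| #   working_dir: "/vllm-workspace/tests" | ||||
| #   source_file_dependencies: | ||||
| #   - vllm/my_feature/ | ||||
| #   - tests/my_feature | ||||
| #   commands: | ||||
| #   - pytest -v -s my_feature | ||||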
|  | ||||
| # When adding a test | ||||
| # - If the test belongs to an existing group, add it there | ||||
| # - If the test is short, add it to any existing step | ||||
| # - If the test takes more than 10min, then it is okay to create a new step. | ||||
| #   Note that all steps execute in parallel. | ||||
| # This script will be fed into the Jinja template in `test-template.j2` to generate | ||||
| # the final pipeline yaml file. | ||||
|  | ||||
| steps: | ||||
| ##### fast check tests  ##### | ||||
|  | ||||
| - label: Documentation Build # 2min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   working_dir: "/vllm-workspace/test_docs" | ||||
|   fast_check: true | ||||
|   no_gpu: True | ||||
|   commands: | ||||
|   - pip install -r ../requirements/docs.txt | ||||
|   # TODO: add `--strict` once warnings in docstrings are fixed | ||||
|   - mkdocs build | ||||
|  | ||||
| - label: Pytorch Nightly Dependency Override Check # 2min | ||||
|   # if this test fails, it means the nightly torch version is not compatible with some | ||||
| # of the dependencies. Please check the error message and add the package to the whitelist | ||||
|   # in /vllm/tools/generate_nightly_torch_test.py | ||||
|   soft_fail: true | ||||
|   source_file_dependencies: | ||||
|   - requirements/nightly_torch_test.txt | ||||
|   commands: | ||||
|   - bash standalone_tests/pytorch_nightly_dependency.sh | ||||
|  | ||||
| - label: Async Engine, Inputs, Utils, Worker Test # 24min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/mq_llm_engine | ||||
|   - tests/async_engine | ||||
|   - tests/test_inputs | ||||
|   - tests/multimodal | ||||
|   - tests/test_utils | ||||
|   - tests/worker | ||||
|   - tests/standalone_tests/lazy_imports.py | ||||
|   commands: | ||||
|   - python3 standalone_tests/lazy_imports.py | ||||
|   - pytest -v -s mq_llm_engine # MQLLMEngine | ||||
|   - pytest -v -s async_engine # AsyncLLMEngine | ||||
|   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py | ||||
|   - pytest -v -s test_inputs.py | ||||
|   - pytest -v -s test_outputs.py | ||||
|   - pytest -v -s multimodal | ||||
|   - pytest -v -s test_utils.py # Utils | ||||
|   - pytest -v -s worker # Worker | ||||
|  | ||||
| - label: Python-only Installation Test | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - tests/standalone_tests/python_only_compile.sh | ||||
|   - setup.py | ||||
|   commands: | ||||
|   - bash standalone_tests/python_only_compile.sh | ||||
|  | ||||
| - label: Basic Correctness Test # 30min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   fast_check: true | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/basic_correctness/test_basic_correctness | ||||
|   - tests/basic_correctness/test_cpu_offload | ||||
|   - tests/basic_correctness/test_preemption | ||||
|   - tests/basic_correctness/test_cumem.py | ||||
|   commands: | ||||
|   - export VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
|   - pytest -v -s basic_correctness/test_cumem.py | ||||
|   - pytest -v -s basic_correctness/test_basic_correctness.py | ||||
|   - pytest -v -s basic_correctness/test_cpu_offload.py | ||||
|   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py | ||||
|  | ||||
| - label: Chunked Prefill Test | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/basic_correctness/test_chunked_prefill | ||||
|   commands: | ||||
|   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py | ||||
|   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py | ||||
|  | ||||
| - label: Core Test # 10min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   fast_check: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/core | ||||
|   - vllm/distributed | ||||
|   - tests/core | ||||
|   commands: | ||||
|   - pytest -v -s core | ||||
|  | ||||
| - label: Entrypoints Test # 40min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   fast_check: true | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/entrypoints/llm | ||||
|   - tests/entrypoints/openai | ||||
|   - tests/entrypoints/test_chat_utils | ||||
|   - tests/entrypoints/offline_mode | ||||
|   commands: | ||||
|   - export VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
|   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py | ||||
|   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process | ||||
|   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process | ||||
|   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process | ||||
|   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process | ||||
|   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ | ||||
|   - pytest -v -s entrypoints/test_chat_utils.py | ||||
|   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests | ||||
|  | ||||
| - label: Distributed Tests (4 GPUs) # 10min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 4 | ||||
|   source_file_dependencies: | ||||
|   - vllm/distributed/ | ||||
|   - vllm/core/ | ||||
|   - tests/distributed/test_utils | ||||
|   - tests/distributed/test_pynccl | ||||
|   - tests/distributed/test_events | ||||
|   - tests/spec_decode/e2e/test_integration_dist_tp4 | ||||
|   - tests/compile/test_basic_correctness | ||||
|   - examples/offline_inference/rlhf.py | ||||
|   - examples/offline_inference/rlhf_colocate.py | ||||
|   - tests/examples/offline_inference/data_parallel.py | ||||
|   - tests/v1/test_async_llm_dp.py | ||||
|   - tests/v1/test_external_lb_dp.py | ||||
|   - tests/v1/engine/test_engine_core_client.py | ||||
|   commands: | ||||
|   # test with tp=2 and external_dp=2 | ||||
|   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py | ||||
|   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py | ||||
|   # test with tp=2 and pp=2 | ||||
|   - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py | ||||
|   # test with internal dp | ||||
|   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager | ||||
|   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py | ||||
|   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py | ||||
|   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp | ||||
|   - pytest -v -s distributed/test_utils.py | ||||
|   - pytest -v -s compile/test_basic_correctness.py | ||||
|   - pytest -v -s distributed/test_pynccl.py | ||||
|   - pytest -v -s distributed/test_events.py | ||||
|   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py | ||||
|   # TODO: create a dedicated test section for multi-GPU example tests | ||||
|   # when we have multiple distributed example tests | ||||
|   - pushd ../examples/offline_inference | ||||
|   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py | ||||
|   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py | ||||
|   - popd | ||||
|  | ||||
| - label: EPLB Algorithm Test | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   source_file_dependencies: | ||||
|   - vllm/distributed/eplb | ||||
|   - tests/distributed/test_eplb_algo.py | ||||
|   commands: | ||||
|   - pytest -v -s distributed/test_eplb_algo.py | ||||
|  | ||||
| - label: EPLB Execution Test # 5min | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 4 | ||||
|   source_file_dependencies: | ||||
|   - vllm/distributed/eplb | ||||
|   - tests/distributed/test_eplb_execute.py | ||||
|   commands: | ||||
|   - pytest -v -s distributed/test_eplb_execute.py | ||||
|  | ||||
| - label: Metrics, Tracing Test # 10min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   num_gpus: 2 | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/metrics | ||||
|   - tests/tracing | ||||
|   commands: | ||||
|   - pytest -v -s metrics | ||||
|   - "pip install \ | ||||
|       'opentelemetry-sdk>=1.26.0' \ | ||||
|       'opentelemetry-api>=1.26.0' \ | ||||
|       'opentelemetry-exporter-otlp>=1.26.0' \ | ||||
|       'opentelemetry-semantic-conventions-ai>=0.4.1'" | ||||
|   - pytest -v -s tracing | ||||
|  | ||||
| ##### fast check tests  ##### | ||||
| #####  1 GPU test  ##### | ||||
|  | ||||
| - label: Regression Test # 5min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/test_regression | ||||
|   commands: | ||||
|   - pip install modelscope | ||||
|   - pytest -v -s test_regression.py | ||||
| - label: Regression Test | ||||
|   mirror_hardwares: [amd] | ||||
|   command: pytest -v -s test_regression.py | ||||
|   working_dir: "/vllm-workspace/tests" # optional | ||||
|  | ||||
| - label: Engine Test # 10min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/engine | ||||
|   - tests/tokenization | ||||
|   - tests/test_sequence | ||||
|   - tests/test_config | ||||
|   - tests/test_logger | ||||
|   - tests/test_vllm_port | ||||
|   commands: | ||||
|   - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py | ||||
|   # OOM in the CI unless we run this separately | ||||
|   - pytest -v -s tokenization | ||||
| - label: AsyncEngine Test | ||||
|   #mirror_hardwares: [amd] | ||||
|   command: pytest -v -s async_engine | ||||
|  | ||||
| - label: V1 Test | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|     - vllm/ | ||||
|     - tests/v1 | ||||
| - label: Basic Correctness Test | ||||
|   mirror_hardwares: [amd] | ||||
|   commands: | ||||
|     # split the test to avoid interference | ||||
|     - pytest -v -s v1/core | ||||
|     - pytest -v -s v1/engine | ||||
|     - pytest -v -s v1/entrypoints | ||||
|     - pytest -v -s v1/sample | ||||
|     - pytest -v -s v1/worker | ||||
|     - pytest -v -s v1/structured_output | ||||
|     - pytest -v -s v1/spec_decode | ||||
|     - pytest -v -s v1/kv_connector/unit | ||||
|     - pytest -v -s v1/test_serial_utils.py | ||||
|     - pytest -v -s v1/test_utils.py | ||||
|     - pytest -v -s v1/test_oracle.py | ||||
|     - pytest -v -s v1/test_metrics_reader.py | ||||
|     # TODO: accuracy does not match on H100, whether or not | ||||
|     # VLLM_USE_FLASHINFER_SAMPLER is set. | ||||
|     - pytest -v -s v1/e2e | ||||
|     # Integration test for streaming correctness (requires special branch). | ||||
|     - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api | ||||
|     - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine | ||||
|   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py | ||||
|   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py | ||||
|   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py | ||||
|   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py | ||||
|   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py | ||||
|  | ||||
| - label: Examples Test # 25min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
| - label: Core Test | ||||
|   mirror_hardwares: [amd] | ||||
|   command: pytest -v -s core | ||||
|  | ||||
| - label: Distributed Comm Ops Test | ||||
|   #mirror_hardwares: [amd] | ||||
|   command: pytest -v -s distributed/test_comm_ops.py | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 2 | ||||
|  | ||||
| - label: Distributed Tests | ||||
|   mirror_hardwares: [amd] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 2 | ||||
|   commands: | ||||
|   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py | ||||
|   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py | ||||
|   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py | ||||
|   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py | ||||
|   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py | ||||
|   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py | ||||
|   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py | ||||
|   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py | ||||
|   - pytest -v -s spec_decode/e2e/test_integration_dist.py  | ||||
|  | ||||
| - label: Distributed Tests (Multiple Groups) | ||||
|   #mirror_hardwares: [amd] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 4 | ||||
|   commands: | ||||
|   - pytest -v -s distributed/test_pynccl.py | ||||
|  | ||||
| - label: Engine Test | ||||
|   mirror_hardwares: [amd] | ||||
|   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py | ||||
|  | ||||
| - label: Entrypoints Test | ||||
|   mirror_hardwares: [amd] | ||||
|  | ||||
|   commands: | ||||
|   - pytest -v -s test_inputs.py | ||||
|   - pytest -v -s entrypoints -m llm | ||||
|   - pytest -v -s entrypoints -m openai | ||||
|  | ||||
| - label: Examples Test | ||||
|   working_dir: "/vllm-workspace/examples" | ||||
|   source_file_dependencies: | ||||
|   - vllm/entrypoints | ||||
|   - examples/ | ||||
|   mirror_hardwares: [amd] | ||||
|   commands: | ||||
|     - pip install tensorizer # for tensorizer test | ||||
|     - python3 offline_inference/basic/generate.py --model facebook/opt-125m | ||||
|     - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 | ||||
|     - python3 offline_inference/basic/chat.py | ||||
|     - python3 offline_inference/prefix_caching.py | ||||
|     - python3 offline_inference/llm_engine_example.py | ||||
|     - python3 offline_inference/audio_language.py --seed 0 | ||||
|     - python3 offline_inference/vision_language.py --seed 0 | ||||
|     - python3 offline_inference/vision_language_pooling.py --seed 0 | ||||
|     - python3 offline_inference/vision_language_multi_image.py --seed 0 | ||||
|     - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors | ||||
|     - python3 offline_inference/encoder_decoder.py | ||||
|     - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 | ||||
|     - python3 offline_inference/basic/classify.py | ||||
|     - python3 offline_inference/basic/embed.py | ||||
|     - python3 offline_inference/basic/score.py | ||||
|     - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 | ||||
|     # install aws cli for llava_example.py | ||||
|     # install tensorizer for tensorize_vllm_model.py | ||||
|     - pip install awscli tensorizer | ||||
|     - python3 offline_inference.py | ||||
|     - python3 offline_inference_with_prefix.py | ||||
|     - python3 llm_engine_example.py | ||||
|     - python3 llava_example.py | ||||
|     - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors | ||||
|  | ||||
| - label: Prefix Caching Test # 9min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/prefix_caching | ||||
| - label: Kernels Test %N | ||||
|   #mirror_hardwares: [amd] | ||||
|   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT | ||||
|   parallelism: 4 | ||||
|  | ||||
| - label: Models Test | ||||
|   #mirror_hardwares: [amd] | ||||
|   commands: | ||||
|     - bash ../.buildkite/download-images.sh | ||||
|     - pytest -v -s models --ignore=models/test_llava.py | ||||
|  | ||||
| - label: Llava Test | ||||
|   mirror_hardwares: [amd] | ||||
|   commands: | ||||
|     - bash ../.buildkite/download-images.sh | ||||
|     - pytest -v -s models/test_llava.py | ||||
|  | ||||
| - label: Prefix Caching Test | ||||
|   mirror_hardwares: [amd] | ||||
|   commands: | ||||
|     - pytest -v -s prefix_caching | ||||
|  | ||||
| - label: Samplers Test | ||||
|   #mirror_hardwares: [amd] | ||||
|   command: pytest -v -s samplers | ||||
|  | ||||
| - label: Platform Tests (CUDA) | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/cuda | ||||
|   commands: | ||||
|     - pytest -v -s cuda/test_cuda_context.py | ||||
| - label: LogitsProcessor Test | ||||
|   mirror_hardwares: [amd] | ||||
|   command: pytest -v -s test_logits_processor.py | ||||
|  | ||||
| - label: Samplers Test # 36min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/model_executor/layers | ||||
|   - vllm/sampling_metadata.py | ||||
|   - tests/samplers | ||||
|   - tests/conftest.py | ||||
|   commands: | ||||
|     - pytest -v -s samplers | ||||
|     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers | ||||
| - label: Utils Test | ||||
|   command: pytest -v -s test_utils.py | ||||
|  | ||||
| - label: Speculative decoding tests # 40min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/spec_decode | ||||
|   - tests/spec_decode | ||||
|   - vllm/model_executor/models/eagle.py | ||||
|   commands: | ||||
|     - pytest -v -s spec_decode/e2e/test_multistep_correctness.py | ||||
|     - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py | ||||
|     - pytest -v -s spec_decode/e2e/test_eagle_correctness.py | ||||
| - label: Worker Test | ||||
|   mirror_hardwares: [amd] | ||||
|   command: pytest -v -s worker | ||||
|  | ||||
| - label: LoRA Test %N # 15min each | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   source_file_dependencies: | ||||
|   - vllm/lora | ||||
|   - tests/lora | ||||
|   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py | ||||
| - label: Speculative decoding tests | ||||
|   #mirror_hardwares: [amd] | ||||
|   command: pytest -v -s spec_decode | ||||
|  | ||||
| - label: LoRA Test %N | ||||
|   #mirror_hardwares: [amd] | ||||
|   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py | ||||
|   parallelism: 4 | ||||
|  | ||||
| - label: PyTorch Compilation Unit Tests | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|     - vllm/ | ||||
|     - tests/compile | ||||
| - label: LoRA Long Context (Distributed) | ||||
|   #mirror_hardwares: [amd] | ||||
|   num_gpus: 4 | ||||
|   # This test runs llama 13B, so it is required to run on 4 GPUs. | ||||
|   commands: | ||||
|     - pytest -v -s compile/test_pass_manager.py | ||||
|     - pytest -v -s compile/test_fusion.py | ||||
|     - pytest -v -s compile/test_fusion_attn.py | ||||
|     - pytest -v -s compile/test_silu_mul_quant_fusion.py | ||||
|     - pytest -v -s compile/test_sequence_parallelism.py | ||||
|     - pytest -v -s compile/test_async_tp.py | ||||
|     # Temporarily run this way because we cannot clean up GPU mem usage | ||||
|     # for multi GPU tests. | ||||
|     # TODO(sang): Fix it. | ||||
|     - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced | ||||
|     - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel | ||||
|     - pytest -v -s lora/test_long_context.py::test_self_consistency | ||||
|     - pytest -v -s lora/test_long_context.py::test_quality | ||||
|     - pytest -v -s lora/test_long_context.py::test_max_len | ||||
|  | ||||
| - label: PyTorch Fullgraph Smoke Test # 9min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/compile | ||||
|   commands: | ||||
|   - pytest -v -s compile/test_basic_correctness.py | ||||
|   # these tests need to be separated, cannot combine | ||||
|   - pytest -v -s compile/piecewise/test_simple.py | ||||
|   - pytest -v -s compile/piecewise/test_toy_llama.py | ||||
|   - pytest -v -s compile/piecewise/test_full_cudagraph.py | ||||
| - label: Tensorizer Test | ||||
|   #mirror_hardwares: [amd] | ||||
|   command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader | ||||
|  | ||||
| - label: PyTorch Fullgraph Test # 18min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/compile | ||||
|   commands: | ||||
|   - pytest -v -s compile/test_full_graph.py | ||||
|  | ||||
| - label: Kernels Core Operation Test | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   source_file_dependencies: | ||||
|   - csrc/ | ||||
|   - tests/kernels/core | ||||
|   commands: | ||||
|     - pytest -v -s kernels/core | ||||
|  | ||||
| - label: Kernels Attention Test %N | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   source_file_dependencies: | ||||
|   - csrc/attention/ | ||||
|   - vllm/attention | ||||
|   - vllm/v1/attention | ||||
|   - tests/kernels/attention | ||||
|   commands: | ||||
|     - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT | ||||
|   parallelism: 2 | ||||
|  | ||||
| - label: Kernels Quantization Test %N | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   source_file_dependencies: | ||||
|   - csrc/quantization/ | ||||
|   - vllm/model_executor/layers/quantization | ||||
|   - tests/kernels/quantization | ||||
|   commands: | ||||
|     - pytest -v -s kernels/quantization  --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT | ||||
|   parallelism: 2 | ||||
|  | ||||
| - label: Kernels MoE Test | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - csrc/moe/ | ||||
|   - tests/kernels/moe | ||||
|   - vllm/model_executor/layers/fused_moe/ | ||||
|   commands: | ||||
|     - pytest -v -s kernels/moe | ||||
|  | ||||
| - label: Kernels Mamba Test | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - csrc/mamba/ | ||||
|   - tests/kernels/mamba | ||||
|   commands: | ||||
|     - pytest -v -s kernels/mamba | ||||
|  | ||||
| - label: Tensorizer Test # 11min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   soft_fail: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/model_executor/model_loader | ||||
|   - tests/tensorizer_loader | ||||
|   - tests/entrypoints/openai/test_tensorizer_entrypoint.py | ||||
|   commands: | ||||
|     - apt-get update && apt-get install -y curl libsodium23 | ||||
|     - export VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
|     - pytest -v -s tensorizer_loader | ||||
|     - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py | ||||
|  | ||||
| - label: Model Executor Test | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   soft_fail: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/model_executor | ||||
|   - tests/model_executor | ||||
|   commands: | ||||
|     - apt-get update && apt-get install -y curl libsodium23 | ||||
|     - export VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
|     - pytest -v -s model_executor | ||||
|  | ||||
| - label: Benchmarks # 9min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   working_dir: "/vllm-workspace/.buildkite" | ||||
|   source_file_dependencies: | ||||
|   - benchmarks/ | ||||
|   commands: | ||||
|   - bash scripts/run-benchmarks.sh | ||||
|  | ||||
| - label: Benchmarks CLI Test # 10min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/benchmarks/ | ||||
|   commands: | ||||
|   - pytest -v -s benchmarks/ | ||||
| - label: Metrics Test | ||||
|   mirror_hardwares: [amd] | ||||
|   command: pytest -v -s metrics | ||||
|  | ||||
| - label: Quantization Test | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - csrc/ | ||||
|   - vllm/model_executor/layers/quantization | ||||
|   - tests/quantization | ||||
|   #mirror_hardwares: [amd] | ||||
|   command: pytest -v -s quantization | ||||
|  | ||||
| - label: Benchmarks | ||||
|   working_dir: "/vllm-workspace/.buildkite" | ||||
|   mirror_hardwares: [amd] | ||||
|   commands: | ||||
|   # temporary install here since we need nightly, will move to requirements/test.in | ||||
|   # after torchao 0.12 release | ||||
|   - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126 | ||||
|   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization | ||||
|   - pip install aiohttp | ||||
|   - bash run-benchmarks.sh | ||||
|  | ||||
| - label: LM Eval Small Models # 53min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" | ||||
|   source_file_dependencies: | ||||
|   - csrc/ | ||||
|   - vllm/model_executor/layers/quantization | ||||
| - label: Documentation Build | ||||
|   working_dir: "/vllm-workspace/test_docs/docs" | ||||
|   no_gpu: True | ||||
|   commands: | ||||
|   - export VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
|   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 | ||||
|  | ||||
| - label: OpenAI API correctness | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - csrc/ | ||||
|   - vllm/entrypoints/openai/ | ||||
|   - vllm/model_executor/models/whisper.py | ||||
|   commands: # LMEval+Transcription WER check | ||||
|   - pytest -s entrypoints/openai/correctness/ | ||||
|  | ||||
| - label: Encoder Decoder tests # 5min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/encoder_decoder | ||||
|   commands: | ||||
|     - pytest -v -s encoder_decoder | ||||
|  | ||||
| - label: OpenAI-Compatible Tool Use # 20 min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   fast_check: false | ||||
|   source_file_dependencies: | ||||
|     - vllm/ | ||||
|     - tests/tool_use | ||||
|     - tests/mistral_tool_use | ||||
|   commands: | ||||
|     - pytest -v -s tool_use | ||||
|     - pytest -v -s mistral_tool_use | ||||
|  | ||||
| #####  models test  ##### | ||||
|  | ||||
| - label: Basic Models Test # 24min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models | ||||
|   commands: | ||||
|     - pytest -v -s models/test_transformers.py | ||||
|     - pytest -v -s models/test_registry.py | ||||
|     - pytest -v -s models/test_utils.py | ||||
|     - pytest -v -s models/test_vision.py | ||||
|     - pytest -v -s models/test_initialization.py | ||||
|  | ||||
| - label: Language Models Test (Standard) | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models/language | ||||
|   commands: | ||||
|     # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. | ||||
|     - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' | ||||
|     - pip freeze | grep -E 'torch' | ||||
|     - pytest -v -s models/language -m core_model | ||||
|  | ||||
| - label: Language Models Test (Hybrid) # 35 min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models/language/generation | ||||
|   commands: | ||||
|     # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. | ||||
|     - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' | ||||
|     - pytest -v -s models/language/generation -m hybrid_model | ||||
|  | ||||
| - label: Language Models Test (Extended Generation) # 1hr20min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   optional: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models/language/generation | ||||
|   commands: | ||||
|     # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile. | ||||
|     - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8' | ||||
|     - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' | ||||
|  | ||||
| - label: Language Models Test (Extended Pooling)  # 36min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   optional: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models/language/pooling | ||||
|   commands: | ||||
|     - pytest -v -s models/language/pooling -m 'not core_model' | ||||
|  | ||||
| - label: Multi-Modal Models Test (Standard) | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models/multimodal | ||||
|   commands: | ||||
|     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git | ||||
|     - pip freeze | grep -E 'torch' | ||||
|     - pytest -v -s models/multimodal/processing | ||||
|     - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model | ||||
|     - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work | ||||
|  | ||||
| - label: Multi-Modal Models Test (Extended) 1 | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   optional: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models/multimodal | ||||
|   commands: | ||||
|     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git | ||||
|     - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model' | ||||
|  | ||||
| - label: Multi-Modal Models Test (Extended) 2 | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   optional: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models/multimodal | ||||
|   commands: | ||||
|     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git | ||||
|     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' | ||||
|  | ||||
| - label: Multi-Modal Models Test (Extended) 3 | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   optional: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models/multimodal | ||||
|   commands: | ||||
|     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git | ||||
|     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' | ||||
|  | ||||
| - label: Quantized Models Test | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   source_file_dependencies: | ||||
|   - vllm/model_executor/layers/quantization | ||||
|   - tests/models/quantization | ||||
|   commands: | ||||
|     - pytest -v -s models/quantization | ||||
|  | ||||
| # This test is used only in the PR development phase to test individual models and should never run on main | ||||
| - label: Custom Models Test | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   optional: true | ||||
|   commands: | ||||
|     - echo 'Testing custom models...' | ||||
|     # PR authors can temporarily add commands below to test individual models | ||||
|     # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py | ||||
|     # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* | ||||
|  | ||||
| #####  1 GPU test  ##### | ||||
| #####  multi gpus test  ##### | ||||
|  | ||||
| - label: Distributed Comm Ops Test # 7min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 2 | ||||
|   source_file_dependencies: | ||||
|   - vllm/distributed | ||||
|   - tests/distributed | ||||
|   commands: | ||||
|   - pytest -v -s distributed/test_comm_ops.py | ||||
|   - pytest -v -s distributed/test_shm_broadcast.py | ||||
|  | ||||
| - label: 2 Node Tests (4 GPUs in total) # 16min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 2 | ||||
|   num_nodes: 2 | ||||
|   source_file_dependencies: | ||||
|   - vllm/distributed/ | ||||
|   - vllm/engine/ | ||||
|   - vllm/executor/ | ||||
|   - vllm/model_executor/models/ | ||||
|   - tests/distributed/ | ||||
|   - tests/examples/offline_inference/data_parallel.py | ||||
|   commands: | ||||
|   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) | ||||
|     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' | ||||
|     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' | ||||
|     - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code | ||||
|     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py | ||||
|     - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py | ||||
|   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) | ||||
|     - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' | ||||
|     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' | ||||
|     - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code | ||||
|  | ||||
| - label: Distributed Tests (2 GPUs) # 40min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 2 | ||||
|   source_file_dependencies: | ||||
|   - vllm/distributed/ | ||||
|   - vllm/engine/ | ||||
|   - vllm/executor/ | ||||
|   - vllm/model_executor/models/ | ||||
|   - tests/distributed/ | ||||
|   - vllm/compilation | ||||
|   - vllm/worker/worker_base.py | ||||
|   - vllm/worker/worker.py | ||||
|   - vllm/worker/model_runner.py | ||||
|   - entrypoints/llm/test_collective_rpc.py | ||||
|   - tests/v1/test_async_llm_dp.py | ||||
|   - tests/v1/test_external_lb_dp.py | ||||
|   - tests/v1/entrypoints/openai/test_multi_api_servers.py | ||||
|   - vllm/v1/engine/ | ||||
|   commands: | ||||
|   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py | ||||
|   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py | ||||
|   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py | ||||
|   - pytest -v -s entrypoints/llm/test_collective_rpc.py | ||||
|   - pytest -v -s ./compile/test_basic_correctness.py | ||||
|   - pytest -v -s ./compile/test_wrapper.py | ||||
|   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' | ||||
|   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' | ||||
|   # Avoid importing model tests that cause CUDA reinitialization error | ||||
|   - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' | ||||
|   - pytest models/language -v -s -m 'distributed(num_gpus=2)' | ||||
|   - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' | ||||
|   # test sequence parallel | ||||
|   - pytest -v -s distributed/test_sequence_parallel.py | ||||
|   # this test fails consistently. | ||||
|   # TODO: investigate and fix | ||||
|   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py | ||||
|   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py | ||||
|   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py | ||||
|   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown | ||||
|  | ||||
| - label: Plugin Tests (2 GPUs) # 40min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 2 | ||||
|   source_file_dependencies: | ||||
|   - vllm/plugins/ | ||||
|   - tests/plugins/ | ||||
|   commands: | ||||
|   # begin platform plugin and general plugin tests; all the code in between runs on the dummy platform | ||||
|   - pip install -e ./plugins/vllm_add_dummy_platform | ||||
|   - pytest -v -s plugins_tests/test_platform_plugins.py | ||||
|   - pip uninstall vllm_add_dummy_platform -y | ||||
|   # end platform plugin tests | ||||
|   # other tests continue here: | ||||
|   - pytest -v -s plugins_tests/test_scheduler_plugins.py | ||||
|   - pip install -e ./plugins/vllm_add_dummy_model | ||||
|   - pytest -v -s distributed/test_distributed_oot.py | ||||
|   - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process | ||||
|   - pytest -v -s models/test_oot_registration.py # it needs a clean process | ||||
|   - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins | ||||
|  | ||||
| - label: Multi-step Tests (4 GPUs) # 36min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 4 | ||||
|   source_file_dependencies: | ||||
|   - vllm/model_executor/layers/sampler.py | ||||
|   - vllm/sequence.py | ||||
|   - vllm/worker/worker_base.py | ||||
|   - vllm/worker/worker.py | ||||
|   - vllm/worker/multi_step_worker.py | ||||
|   - vllm/worker/model_runner_base.py | ||||
|   - vllm/worker/model_runner.py | ||||
|   - vllm/worker/multi_step_model_runner.py | ||||
|   - vllm/engine | ||||
|   - tests/multi_step | ||||
|   commands: | ||||
|   # this test is quite flaky | ||||
|   # TODO: investigate and fix. | ||||
|   # - pytest -v -s multi_step/test_correctness_async_llm.py | ||||
|   - pytest -v -s multi_step/test_correctness_llm.py | ||||
|  | ||||
| - label: Pipeline Parallelism Test # 45min | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 4 | ||||
|   source_file_dependencies: | ||||
|   - vllm/distributed/ | ||||
|   - vllm/engine/ | ||||
|   - vllm/executor/ | ||||
|   - vllm/model_executor/models/ | ||||
|   - tests/distributed/ | ||||
|   commands: | ||||
|   - pytest -v -s distributed/test_pp_cudagraph.py | ||||
|   - pytest -v -s distributed/test_pipeline_parallel.py | ||||
|  | ||||
| - label: LoRA TP Test (Distributed) | ||||
|   mirror_hardwares: [amdexperimental, amdproduction] | ||||
|   num_gpus: 4 | ||||
|   source_file_dependencies: | ||||
|   - vllm/lora | ||||
|   - tests/lora | ||||
|   commands: | ||||
|     # FIXIT: find out which code initializes cuda before running the test | ||||
|     # until this is fixed, we need to use spawn to test it | ||||
|     - export VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
|     # There is some Tensor Parallelism related processing logic in LoRA that | ||||
|     # requires multi-GPU testing for validation. | ||||
|     - pytest -v -s -x lora/test_chatglm3_tp.py | ||||
|     - pytest -v -s -x lora/test_llama_tp.py | ||||
|  | ||||
|  | ||||
| - label: Weight Loading Multiple GPU Test  # 33min | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 2 | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/weight_loading | ||||
|   commands: | ||||
|     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt | ||||
|  | ||||
| - label: Weight Loading Multiple GPU Test - Large Models # optional | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 2 | ||||
|   gpu: a100 | ||||
|   optional: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/weight_loading | ||||
|   commands: | ||||
|     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt | ||||
|  | ||||
|  | ||||
| ##### multi gpus test ##### | ||||
| ##### A100 test ##### | ||||
|  | ||||
| - label: Distributed Tests (A100) # optional | ||||
|   gpu: a100 | ||||
|   optional: true | ||||
|   num_gpus: 4 | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   commands: | ||||
|   # NOTE: don't test the llama model here; the HF implementation appears to be buggy | ||||
|   # see https://github.com/vllm-project/vllm/pull/5689 for details | ||||
|   - pytest -v -s distributed/test_custom_all_reduce.py | ||||
|   - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py | ||||
|   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' | ||||
|   - pytest -v -s -x lora/test_mixtral.py | ||||
|  | ||||
| - label: LM Eval Large Models # optional | ||||
|   gpu: a100 | ||||
|   optional: true | ||||
|   num_gpus: 4 | ||||
|   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" | ||||
|   source_file_dependencies: | ||||
|   - csrc/ | ||||
|   - vllm/model_executor/layers/quantization | ||||
|   commands: | ||||
|   - export VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
|   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 | ||||
|   - pip install -r requirements-docs.txt | ||||
|   - SPHINXOPTS=\"-W\" make html | ||||
|  | ||||
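A note on the `export VLLM_WORKER_MULTIPROC_METHOD=spawn` lines in the LoRA TP and LM Eval steps above: once a process has initialized CUDA, forked children inherit a CUDA context they cannot safely reuse, so worker processes have to be created with the spawn start method instead. Below is a minimal, hypothetical sketch of the failure mode and the workaround; it is not part of this diff and assumes PyTorch with at least one visible GPU.

```python
import multiprocessing as mp
import os

import torch


def worker(rank: int) -> None:
    # With the default "fork" start method the child inherits the parent's
    # already-initialized CUDA context, and PyTorch fails with
    # "Cannot re-initialize CUDA in forked subprocess".
    torch.cuda.set_device(rank % torch.cuda.device_count())
    print(f"worker {rank} is using GPU {torch.cuda.current_device()}")


if __name__ == "__main__":
    torch.cuda.init()  # something touches CUDA in the parent, e.g. during test collection
    # Mirror what the pipeline does via VLLM_WORKER_MULTIPROC_METHOD:
    method = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
    ctx = mp.get_context(method)  # "spawn" starts clean child interpreters
    procs = [ctx.Process(target=worker, args=(i,)) for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
```

Run as written this uses spawn and completes; forcing `method = "fork"` after CUDA has been initialized typically reproduces the error the FIXIT comment refers to.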
							
								
								
									
.buildkite/test-template-aws.j2 (new file, 59 lines)
							| @ -0,0 +1,59 @@ | ||||
| {% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} | ||||
| {% set default_working_dir = "/vllm-workspace/tests" %} | ||||
|  | ||||
| steps: | ||||
|   - label: ":docker: build image" | ||||
|     agents: | ||||
|       queue: cpu_queue | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." | ||||
|       - "docker push {{ docker_image }}" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|     retry: | ||||
|       automatic: | ||||
|         - exit_status: -1  # Agent was lost | ||||
|           limit: 5 | ||||
|         - exit_status: -10  # Agent was lost | ||||
|           limit: 5 | ||||
|   - wait | ||||
|  | ||||
|   {% for step in steps %} | ||||
|   - label: "{{ step.label }}" | ||||
|     agents: | ||||
|       {% if step.no_gpu %} | ||||
|       queue: cpu_queue | ||||
|       {% elif step.num_gpus == 2 or step.num_gpus == 4 %} | ||||
|       queue: gpu_4_queue | ||||
|       {% else %} | ||||
|       queue: gpu_1_queue | ||||
|       {% endif %} | ||||
|     soft_fail: true | ||||
|     {% if step.parallelism %} | ||||
|     parallelism: {{ step.parallelism }} | ||||
|     {% endif %} | ||||
|     retry: | ||||
|       automatic: | ||||
|         - exit_status: -1  # Agent was lost | ||||
|           limit: 5 | ||||
|         - exit_status: -10  # Agent was lost | ||||
|           limit: 5 | ||||
|     plugins: | ||||
|       - docker#v5.2.0: | ||||
|           image: {{ docker_image }} | ||||
|           always-pull: true | ||||
|           propagate-environment: true | ||||
|           {% if not step.no_gpu %} | ||||
|           gpus: all | ||||
|           {% endif %} | ||||
|           command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}"] | ||||
|           environment: | ||||
|             - VLLM_USAGE_SOURCE=ci-test | ||||
|             - HF_TOKEN | ||||
|             {% if step.label == "Speculative decoding tests" %} | ||||
|             - VLLM_ATTENTION_BACKEND=XFORMERS | ||||
|             {% endif %} | ||||
|           volumes: | ||||
|             - /dev/shm:/dev/shm | ||||
|   {% endfor %} | ||||
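The template above expects to be rendered with the `steps` list from a pipeline definition such as the test pipeline shown earlier; each step's `num_gpus`/`no_gpu` fields pick the agent queue, and its `commands` become a single `bash -c` invocation inside the test image. A minimal sketch of how that rendering could be driven, assuming `jinja2` and `PyYAML` are available and that the step list lives in `.buildkite/test-pipeline.yaml` (the actual generator script is not shown in this diff):

```python
import yaml
from jinja2 import Environment, FileSystemLoader

# Assumed locations; the real paths used by the CI generator may differ.
PIPELINE_FILE = ".buildkite/test-pipeline.yaml"
TEMPLATE_NAME = "test-template-aws.j2"

with open(PIPELINE_FILE) as f:
    data = yaml.safe_load(f)

# The pipeline may be a bare list of steps or a mapping with a "steps" key.
steps = data["steps"] if isinstance(data, dict) else data

env = Environment(loader=FileSystemLoader(".buildkite"))
rendered = env.get_template(TEMPLATE_NAME).render(steps=steps)

# $BUILDKITE_COMMIT is left for Buildkite itself to expand when the rendered
# pipeline is uploaded, e.g. with `buildkite-agent pipeline upload`.
print(rendered)
```

Steps with `num_gpus: 2` or `4` land on `gpu_4_queue`, `no_gpu` steps on `cpu_queue`, and everything else on `gpu_1_queue`, matching the branching at the top of the template.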
							
								
								
									
.buildkite/test-template.j2 (new file, 95 lines)
							| @ -0,0 +1,95 @@ | ||||
| {% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %} | ||||
| {% set default_num_gpu = 1 %} | ||||
| {% set default_working_dir = "/vllm-workspace/tests" %} | ||||
|  | ||||
| steps: | ||||
|   - label: ":docker: build image" | ||||
|     commands:  | ||||
|       - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." | ||||
|       - "docker push {{ docker_image }}" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|     retry: | ||||
|       automatic: | ||||
|         - exit_status: -1  # Agent was lost | ||||
|           limit: 5 | ||||
|         - exit_status: -10  # Agent was lost | ||||
|           limit: 5 | ||||
|   - wait | ||||
|  | ||||
|   - group: "AMD Tests" | ||||
|     depends_on: ~ | ||||
|     steps: | ||||
|     {% for step in steps %} | ||||
|     {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} | ||||
|       - label: "AMD: {{ step.label }}" | ||||
|         agents: | ||||
|           queue: amd | ||||
|         command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}" | ||||
|         env: | ||||
|           DOCKER_BUILDKIT: "1" | ||||
|     {% endif %} | ||||
|     {% endfor %} | ||||
|  | ||||
|   - label: "Neuron Test" | ||||
|     depends_on: ~ | ||||
|     agents: | ||||
|       queue: neuron | ||||
|     command: bash .buildkite/run-neuron-test.sh | ||||
|     soft_fail: true | ||||
|  | ||||
|   - label: "Intel Test" | ||||
|     depends_on: ~ | ||||
|     agents: | ||||
|       queue: intel | ||||
|     command: bash .buildkite/run-cpu-test.sh | ||||
|  | ||||
|   {% for step in steps %} | ||||
|   - label: "{{ step.label }}" | ||||
|     agents: | ||||
|       queue: kubernetes | ||||
|     soft_fail: {{ step.soft_fail or false }} | ||||
|     {% if step.parallelism %} | ||||
|     parallelism: {{ step.parallelism }} | ||||
|     {% endif %} | ||||
|     retry: | ||||
|       automatic: | ||||
|         - exit_status: -1  # Agent was lost | ||||
|           limit: 5 | ||||
|         - exit_status: -10  # Agent was lost | ||||
|           limit: 5 | ||||
|     plugins: | ||||
|       - kubernetes: | ||||
|           podSpec: | ||||
|             {% if step.num_gpus %} | ||||
|             priorityClassName: gpu-priority-cls-{{ step.num_gpus }} | ||||
|             {% endif %} | ||||
|             volumes: | ||||
|               - name: dshm | ||||
|                 emptyDir: | ||||
|                   medium: Memory | ||||
|             containers: | ||||
|               - image: "{{ docker_image }}" | ||||
|                 command: ["bash"] | ||||
|                 args: | ||||
|                 - '-c' | ||||
|                 - "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'" | ||||
|                 {% if not step.no_gpu %} | ||||
|                 resources: | ||||
|                   requests: | ||||
|                     nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" | ||||
|                   limits: | ||||
|                     nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" | ||||
|                 {% endif %} | ||||
|                 env: | ||||
|                   - name: VLLM_USAGE_SOURCE | ||||
|                     value: ci-test | ||||
|                   - name: HF_TOKEN | ||||
|                     valueFrom: | ||||
|                       secretKeyRef: | ||||
|                         name: hf-token-secret | ||||
|                         key: token | ||||
|                 volumeMounts: | ||||
|                   - mountPath: /dev/shm | ||||
|                     name: dshm | ||||
|   {% endfor %} | ||||
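Both templates collapse a step's `commands` list into one shell command line with essentially the same Jinja expression, `step.command or (step.commands | join(' && '))` (the AMD branch joins with `' ; '` instead), prefixed by a `cd` into the step's working directory. A small, self-contained illustration of what that expression produces for a hypothetical step, with the templates' extra whitespace normalized:

```python
from jinja2 import Template

# Essentially the command-building expression used by the templates above.
command_tmpl = Template(
    "cd {{ (step.working_dir or default_working_dir) | safe }}"
    " && {{ step.command or (step.commands | join(' && ')) | safe }}"
)

step = {  # hypothetical step, shaped like the pipeline entries above
    "working_dir": None,
    "command": None,
    "commands": [
        "pip install -e ./plugins/vllm_add_dummy_model",
        "pytest -v -s distributed/test_distributed_oot.py",
    ],
}

print(command_tmpl.render(step=step, default_working_dir="/vllm-workspace/tests"))
# cd /vllm-workspace/tests && pip install -e ./plugins/vllm_add_dummy_model && pytest -v -s distributed/test_distributed_oot.py
```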
| @ -1,33 +1 @@ | ||||
| /.venv | ||||
| /build | ||||
| dist | ||||
| vllm/*.so | ||||
|  | ||||
| # Byte-compiled / optimized / DLL files | ||||
| __pycache__/ | ||||
| *.py[cod] | ||||
| *$py.class | ||||
|  | ||||
| .mypy_cache | ||||
|  | ||||
| # Distribution / packaging | ||||
| .Python | ||||
| /build/ | ||||
| cmake-build-*/ | ||||
| CMakeUserPresets.json | ||||
| develop-eggs/ | ||||
| /dist/ | ||||
| downloads/ | ||||
| eggs/ | ||||
| .eggs/ | ||||
| lib/ | ||||
| lib64/ | ||||
| parts/ | ||||
| sdist/ | ||||
| var/ | ||||
| wheels/ | ||||
| share/python-wheels/ | ||||
| *.egg-info/ | ||||
| .installed.cfg | ||||
| *.egg | ||||
| MANIFEST | ||||
|  | ||||
							
								
								
									
.github/CODEOWNERS (54 lines changed)
							| @ -1,54 +0,0 @@ | ||||
| # See https://help.github.com/articles/about-codeowners/ | ||||
| # for more info about CODEOWNERS file | ||||
|  | ||||
| # This lists cover the "core" components of vLLM that require careful review | ||||
| /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill | ||||
| /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill | ||||
| /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill | ||||
| /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill | ||||
| /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill | ||||
| /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill | ||||
| /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill | ||||
| /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth | ||||
| /vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm | ||||
| /vllm/multimodal @DarkLight1337 @ywang96 | ||||
| /vllm/vllm_flash_attn @LucasWilkinson | ||||
| /vllm/lora @jeejeelee | ||||
| /vllm/reasoning @aarnphm | ||||
| /vllm/entrypoints @aarnphm | ||||
| CMakeLists.txt @tlrmchlsmth @LucasWilkinson | ||||
|  | ||||
| # Any change to the VllmConfig changes can have a large user-facing impact, | ||||
| # so spam a lot of people | ||||
| /vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor | ||||
|  | ||||
| # vLLM V1 | ||||
| /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat | ||||
| /vllm/v1/structured_output @mgoin @russellb @aarnphm | ||||
|  | ||||
| # Test ownership | ||||
| /.buildkite/lm-eval-harness @mgoin @simon-mo | ||||
| /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo | ||||
| /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac | ||||
| /tests/distributed/test_multi_node_assignment.py @youkaichao | ||||
| /tests/distributed/test_pipeline_parallel.py @youkaichao | ||||
| /tests/distributed/test_same_node.py @youkaichao | ||||
| /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm | ||||
| /tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm | ||||
| /tests/kernels @tlrmchlsmth @WoosukKwon | ||||
| /tests/model_executor/test_guided_processors.py @mgoin @russellb | ||||
| /tests/models @DarkLight1337 @ywang96 | ||||
| /tests/multi_step @alexm-redhat @comaniac | ||||
| /tests/multimodal @DarkLight1337 @ywang96 | ||||
| /tests/prefix_caching @comaniac @KuntaiDu | ||||
| /tests/quantization @mgoin @robertgshaw2-redhat | ||||
| /tests/spec_decode @njhill @LiuXiaoxuanPKU | ||||
| /tests/test_inputs.py @DarkLight1337 @ywang96 | ||||
| /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm | ||||
| /tests/v1/structured_output @mgoin @russellb @aarnphm | ||||
| /tests/weight_loading @mgoin @youkaichao | ||||
| /tests/lora @jeejeelee | ||||
|  | ||||
| # Docs | ||||
| /docs @hmellor | ||||
| mkdocs.yaml @hmellor | ||||
							
								
								
									
.github/FUNDING.yml (2 lines changed)
							| @ -1,2 +0,0 @@ | ||||
| github: [vllm-project] | ||||
| open_collective: vllm | ||||
							
								
								
									
.github/ISSUE_TEMPLATE/100-documentation.yml (7 lines changed)
							| @ -20,10 +20,3 @@ body: | ||||
|   attributes: | ||||
|     value: > | ||||
|       Thanks for contributing 🎉! | ||||
| - type: checkboxes | ||||
|   id: askllm | ||||
|   attributes: | ||||
|     label: Before submitting a new issue... | ||||
|     options: | ||||
|       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. | ||||
|         required: true | ||||
|  | ||||
							
								
								
									
.github/ISSUE_TEMPLATE/200-installation.yml (9 lines changed)
							| @ -14,7 +14,7 @@ body: | ||||
|     description: | | ||||
|       Please run the following and paste the output below. | ||||
|       ```sh | ||||
|       wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py | ||||
|       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py | ||||
|       # For security purposes, please feel free to check the contents of collect_env.py before running it. | ||||
|       python collect_env.py | ||||
|       ``` | ||||
| @ -38,10 +38,3 @@ body: | ||||
|   attributes: | ||||
|     value: > | ||||
|       Thanks for contributing 🎉! | ||||
| - type: checkboxes | ||||
|   id: askllm | ||||
|   attributes: | ||||
|     label: Before submitting a new issue... | ||||
|     options: | ||||
|       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. | ||||
|         required: true | ||||
|  | ||||
							
								
								
									
.github/ISSUE_TEMPLATE/300-usage.yml (9 lines changed)
							| @ -14,7 +14,7 @@ body: | ||||
|     description: | | ||||
|       Please run the following and paste the output below. | ||||
|       ```sh | ||||
|       wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py | ||||
|       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py | ||||
|       # For security purposes, please feel free to check the contents of collect_env.py before running it. | ||||
|       python collect_env.py | ||||
|       ``` | ||||
| @ -36,10 +36,3 @@ body: | ||||
|   attributes: | ||||
|     value: > | ||||
|       Thanks for contributing 🎉! | ||||
| - type: checkboxes | ||||
|   id: askllm | ||||
|   attributes: | ||||
|     label: Before submitting a new issue... | ||||
|     options: | ||||
|       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. | ||||
|         required: true | ||||
|  | ||||
| @ -8,36 +8,21 @@ body: | ||||
|   attributes: | ||||
|     value: > | ||||
|       #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). | ||||
| - type: markdown | ||||
|   attributes: | ||||
|     value: | | ||||
|       ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as: | ||||
|       - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys) | ||||
|       - Passwords or authentication credentials | ||||
|       - Private URLs or endpoints | ||||
|       - Personal or confidential data | ||||
|        | ||||
|       Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples. | ||||
| - type: textarea | ||||
|   attributes: | ||||
|     label: Your current environment | ||||
|     description: | | ||||
|       Please run the following and paste the output below. | ||||
|       ```sh | ||||
|       wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py | ||||
|       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py | ||||
|       # For security purposes, please feel free to check the contents of collect_env.py before running it. | ||||
|       python collect_env.py | ||||
|       ``` | ||||
|       It is suggested that you download and run the latest script, since vLLM may frequently update the diagnostic information needed to respond to issues accurately and quickly. | ||||
|     value: | | ||||
|       <details> | ||||
|       <summary>The output of <code>python collect_env.py</code></summary> | ||||
|  | ||||
|       ```text | ||||
|       Your output of `python collect_env.py` here | ||||
|       The output of `python collect_env.py` | ||||
|       ``` | ||||
|  | ||||
|       </details> | ||||
|   validations: | ||||
|     required: true | ||||
| - type: textarea | ||||
| @ -85,24 +70,17 @@ body: | ||||
|       ``` | ||||
|  | ||||
|       ``` | ||||
|       The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present. | ||||
|       The error message you got, with the full traceback. | ||||
|       ``` | ||||
|   validations: | ||||
|     required: true | ||||
| - type: markdown | ||||
|   attributes: | ||||
|     value: | | ||||
|       ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output: | ||||
|     value: > | ||||
|       ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: | ||||
|  | ||||
|       - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). | ||||
|  | ||||
|       - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. | ||||
|  | ||||
|       Thanks for reporting 🙏! | ||||
| - type: checkboxes | ||||
|   id: askllm | ||||
|   attributes: | ||||
|     label: Before submitting a new issue... | ||||
|     options: | ||||
|       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. | ||||
|         required: true | ||||
|       Thanks for contributing 🎉! | ||||
							
								
								
									
.github/ISSUE_TEMPLATE/450-ci-failure.yml (69 lines changed)
							| @ -1,69 +0,0 @@ | ||||
| name: 🧪 CI failure report | ||||
| description: Report a failing test. | ||||
| title: "[CI Failure]: " | ||||
| labels: ["ci-failure"] | ||||
|  | ||||
| body: | ||||
| - type: markdown | ||||
|   attributes: | ||||
|     value: > | ||||
|       #### Include the name of the failing Buildkite step and test file in the title. | ||||
| - type: input | ||||
|   attributes: | ||||
|     label: Name of failing test | ||||
|     description: | | ||||
|       Paste in the fully-qualified name of the failing test from the logs. | ||||
|     placeholder: | | ||||
|       `path/to/test_file.py::test_name[params]` | ||||
|   validations: | ||||
|     required: true | ||||
| - type: checkboxes | ||||
|   attributes: | ||||
|     label: Basic information | ||||
|     description: Select all items that apply to the failing test. | ||||
|     options: | ||||
|       - label: Flaky test | ||||
|       - label: Can reproduce locally | ||||
|       - label: Caused by external libraries (e.g. bug in `transformers`) | ||||
| - type: textarea | ||||
|   attributes: | ||||
|     label: 🧪 Describe the failing test | ||||
|     description: | | ||||
|       Please provide a clear and concise description of the failing test. | ||||
|     placeholder: | | ||||
|       A clear and concise description of the failing test. | ||||
|    | ||||
|       ``` | ||||
|       The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present. | ||||
|       ``` | ||||
|   validations: | ||||
|     required: true | ||||
| - type: textarea | ||||
|   attributes: | ||||
|     label: 📝 History of failing test | ||||
|     description: | | ||||
|       Since when did the test start to fail? | ||||
|       You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main). | ||||
|  | ||||
|       If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods: | ||||
|  | ||||
|       - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally. | ||||
|  | ||||
|       - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally. | ||||
|  | ||||
|       - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only) | ||||
|     placeholder: | | ||||
|       Approximate timeline and/or problematic PRs | ||||
|  | ||||
|       A link to the Buildkite analytics of the failing test (if available) | ||||
|   validations: | ||||
|     required: true | ||||
| - type: textarea | ||||
|   attributes: | ||||
|     label: CC List. | ||||
|     description: > | ||||
|       The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test. | ||||
| - type: markdown | ||||
|   attributes: | ||||
|     value: > | ||||
|       Thanks for reporting 🙏! | ||||
| @ -29,10 +29,3 @@ body: | ||||
|   attributes: | ||||
|     value: > | ||||
|       Thanks for contributing 🎉! | ||||
| - type: checkboxes | ||||
|   id: askllm | ||||
|   attributes: | ||||
|     label: Before submitting a new issue... | ||||
|     options: | ||||
|       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. | ||||
|         required: true | ||||
| @ -9,7 +9,7 @@ body: | ||||
|     value: > | ||||
|       #### Before submitting an issue, please make sure the issue hasn't already been addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). | ||||
|  | ||||
|       #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model. | ||||
|       #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. | ||||
| - type: textarea | ||||
|   attributes: | ||||
|     label: The model to consider. | ||||
| @ -31,10 +31,3 @@ body: | ||||
|   attributes: | ||||
|     value: > | ||||
|       Thanks for contributing 🎉! | ||||
| - type: checkboxes | ||||
|   id: askllm | ||||
|   attributes: | ||||
|     label: Before submitting a new issue... | ||||
|     options: | ||||
|       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. | ||||
|         required: true | ||||
| @ -35,7 +35,7 @@ body: | ||||
|     description: | | ||||
|       Please run the following and paste the output below. | ||||
|       ```sh | ||||
|       wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py | ||||
|       wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py | ||||
|       # For security purposes, please feel free to check the contents of collect_env.py before running it. | ||||
|       python collect_env.py | ||||
|       ``` | ||||
| @ -50,10 +50,3 @@ body: | ||||
|   attributes: | ||||
|     value: > | ||||
|       Thanks for contributing 🎉! | ||||
| - type: checkboxes | ||||
|   id: askllm | ||||
|   attributes: | ||||
|     label: Before submitting a new issue... | ||||
|     options: | ||||
|       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. | ||||
|         required: true | ||||
							
								
								
									
.github/ISSUE_TEMPLATE/750-RFC.yml (7 lines changed)
							| @ -47,10 +47,3 @@ body: | ||||
|   attributes: | ||||
|     value: > | ||||
|       Thanks for contributing 🎉! | ||||
| - type: checkboxes | ||||
|   id: askllm | ||||
|   attributes: | ||||
|     label: Before submitting a new issue... | ||||
|     options: | ||||
|       - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions. | ||||
|         required: true | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff.