Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-31 22:44:37 +08:00)

Compare commits

2 Commits

main ... fix-hashin
| Author | SHA1 | Date |
|---|---|---|
| | 1936d7bab0 | |
| | 996cf2de5c | |
@@ -1,53 +1,36 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import os
-import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
-# Note that we have 800 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/6326 .
-# Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
+MAX_SIZE_MB = 200
 
 
 def print_top_10_largest_files(zip_file):
-    """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, "r") as z:
+    with zipfile.ZipFile(zip_file, 'r') as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
-            print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.")
+            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
 
 
 def check_wheel_size(directory):
-    """Check the size of .whl files in the given directory."""
     for root, _, files in os.walk(directory):
-        for file_name in files:
-            if file_name.endswith(".whl"):
-                wheel_path = os.path.join(root, file_name)
-                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
-                if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(
-                        f"Not allowed: Wheel {wheel_path} is larger "
-                        f"({wheel_size_mb:.2f} MB) than the limit "
-                        f"({VLLM_MAX_SIZE_MB} MB)."
-                    )
+        for f in files:
+            if f.endswith(".whl"):
+                wheel_path = os.path.join(root, f)
+                wheel_size = os.path.getsize(wheel_path)
+                wheel_size_mb = wheel_size / (1024 * 1024)
+                if wheel_size_mb > MAX_SIZE_MB:
+                    print(f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
+                          f"compare to the allowed size ({MAX_SIZE_MB} MB).")
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
-                    print(
-                        f"Wheel {wheel_path} is within the allowed size "
-                        f"({wheel_size_mb:.2f} MB)."
-                    )
+                    print(f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb} MB).")
     return 0
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python check-wheel-size.py <directory>")
-        sys.exit(1)
-
-    directory = sys.argv[1]
-    sys.exit(check_wheel_size(directory))
+    import sys
+    sys.exit(check_wheel_size(sys.argv[1]))
.buildkite/download-images.sh (new file, 18 lines)

@@ -0,0 +1,18 @@
#!/bin/bash

set -ex
set -o pipefail

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

cd -
@@ -1,46 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import os

template = """<!DOCTYPE html>
<html>
    <body>
    <h1>Links for vLLM</h1/>
        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
    </body>
</html>
"""

parser = argparse.ArgumentParser()
parser.add_argument("--wheel", help="The wheel path.", required=True)
args = parser.parse_args()

filename = os.path.basename(args.wheel)

with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
        template.format(
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )
@@ -1,13 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.671
  - name: "exact_match,flexible-extract"
    value: 0.664
limit: 1000
num_fewshot: 5
trust_remote_code: True

@@ -1,12 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.905
  - name: "exact_match,flexible-extract"
    value: 0.905
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.892
  - name: "exact_match,flexible-extract"
    value: 0.892
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.752
  - name: "exact_match,flexible-extract"
    value: 0.754
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.755
  - name: "exact_match,flexible-extract"
    value: 0.755
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.753
  - name: "exact_match,flexible-extract"
    value: 0.753
limit: 1000
num_fewshot: 5
@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.764
  - name: "exact_match,flexible-extract"
    value: 0.764
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.728
  - name: "exact_match,flexible-extract"
    value: 0.728
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.758
  - name: "exact_match,flexible-extract"
    value: 0.759
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.756
  - name: "exact_match,flexible-extract"
    value: 0.752
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
model_name: "HandH1998/QQQ-Llama-3-8b-g128"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
limit: 1000
num_fewshot: 5

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.335
  - name: "exact_match,flexible-extract"
    value: 0.323
limit: 1319
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.356
  - name: "exact_match,flexible-extract"
    value: 0.358
limit: 1000
num_fewshot: 5
@@ -1,12 +0,0 @@
# For vllm-vlm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
    value: 0.80
limit: 100
num_fewshot: 0

@@ -1,10 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
tasks:
- name: "mmlu_pro"
  metrics:
  - name: "exact_match,custom-extract"
    value: 0.80
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
model_name: "mgoin/Minitron-4B-Base-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.231
  - name: "exact_match,flexible-extract"
    value: 0.22
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.86
  - name: "exact_match,flexible-extract"
    value: 0.86
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.624
  - name: "exact_match,flexible-extract"
    value: 0.624
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For hf script, without -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.616
  - name: "exact_match,flexible-extract"
    value: 0.632
limit: 250
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.30
  - name: "exact_match,flexible-extract"
    value: 0.465
limit: 1319
num_fewshot: 5
@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.578
  - name: "exact_match,flexible-extract"
    value: 0.585
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.593
  - name: "exact_match,flexible-extract"
    value: 0.588
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.595
  - name: "exact_match,flexible-extract"
    value: 0.582
limit: 1000
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.792
  - name: "exact_match,flexible-extract"
    value: 0.824
limit: 250
num_fewshot: 5

@@ -1,11 +0,0 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
model_name: "Qwen/Qwen2.5-1.5B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.54
  - name: "exact_match,flexible-extract"
    value: 0.59
limit: 1319
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size)
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.47
  - name: "exact_match,flexible-extract"
    value: 0.64
limit: 1319
num_fewshot: 5

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1

model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
backend: "vllm-vlm"
tasks:
- name: "chartqa"
  metrics:
  - name: "relaxed_accuracy,none"
    value: 0.855
limit: 2500
num_fewshot: 0
@@ -1,14 +0,0 @@
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
tasks:
  - name: "mmlu_pro"
    metrics:
      - name: "exact_match,custom-extract"
        value: 0.82
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
enforce_eager: false # we use false to speed up the eval process
kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
max_model_len: 40960
apply_chat_template: true
fewshot_as_multiturn: true
gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"

@@ -1,12 +0,0 @@
# For vllm script, with -t option (tensor parallel size).
# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.6353
  - name: "exact_match,flexible-extract"
    value: 0.637
limit: null
num_fewshot: null

@@ -1 +0,0 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml

@@ -1,5 +0,0 @@
Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
DeepSeek-V2-Lite-Chat.yaml

@@ -1 +0,0 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

@@ -1 +0,0 @@
Qwen2.5-VL-7B-Instruct.yaml

@@ -1,6 +0,0 @@
Qwen2.5-1.5B-Instruct.yaml
Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
Qwen1.5-MoE-W4A16-compressed-tensors.yaml
@@ -1,44 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path

import pytest


def pytest_addoption(parser):
    parser.addoption(
        "--config-list-file",
        action="store",
        help="Path to the file listing model config YAMLs (one per line)",
    )
    parser.addoption(
        "--tp-size",
        action="store",
        default="1",
        help="Tensor parallel size to use for evaluation",
    )


@pytest.fixture(scope="session")
def config_list_file(pytestconfig, config_dir):
    rel_path = pytestconfig.getoption("--config-list-file")
    return config_dir / rel_path


@pytest.fixture(scope="session")
def tp_size(pytestconfig):
    return pytestconfig.getoption("--tp-size")


def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        rel_path = metafunc.config.getoption("--config-list-file")
        config_list_file = Path(rel_path).resolve()
        config_dir = config_list_file.parent
        with open(config_list_file, encoding="utf-8") as f:
            configs = [
                config_dir / line.strip()
                for line in f
                if line.strip() and not line.startswith("#")
            ]
        metafunc.parametrize("config_filename", configs)
@@ -1,44 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on chartqa for vllm.
#
# Make sure you have lm-eval-harness installed:
#   pip install lm-eval==0.4.9

usage() {
    echo
    echo "Runs lm eval harness on ChartQA using multimodal vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our correctness tests in vllm's CI."
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -t    - tensor parallel size to run at"
    echo
}

while getopts "m:l:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm-vlm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
  --tasks chartqa \
  --batch_size auto \
  --apply_chat_template \
  --limit $LIMIT
@@ -1,46 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for transformers.
#
# Make sure you have lm-eval-harness installed:
#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
    echo
    echo "Runs lm eval harness on GSM8k using huggingface transformers."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo
}

while getopts "m:b:l:f:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model hf \
  --model_args "pretrained=$MODEL,parallelize=True" \
  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
@@ -1,51 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
    echo
    echo "Runs lm eval harness on GSM8k using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
}

while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size "$BATCH_SIZE"
@@ -1,50 +0,0 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]

usage() {
    echo
    echo "Runs lm eval harness on MMLU Pro using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow"
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
}

while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
        MODEL="$OPTARG"
        ;;
    b )
        BATCH_SIZE="$OPTARG"
        ;;
    l )
        LIMIT="$OPTARG"
        ;;
    f )
        FEWSHOT="$OPTARG"
        ;;
    t )
        TP_SIZE="$OPTARG"
        ;;
    \? )
        usage
        exit 1
        ;;
  esac
done

lm_eval --model vllm \
  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
  --batch_size auto
@@ -1,71 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml

pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
"""

import lm_eval
import numpy as np
import yaml

RTOL = 0.08


def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)
    batch_size = eval_config.get("batch_size", "auto")
    backend = eval_config.get("backend", "vllm")
    enforce_eager = eval_config.get("enforce_eager", "true")
    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager={enforce_eager},"
        f"kv_cache_dtype={kv_cache_dtype},"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len},"
    )
    results = lm_eval.simple_evaluate(
        model=backend,
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        # TODO(yeq): using a chat template w/ fewshot_as_multiturn is supposed to help
        # text models. However, this regresses measured strict-match for existing
        # text models in CI, so only apply it for mm, or when explicitly set.
        apply_chat_template=eval_config.get(
            "apply_chat_template", backend == "vllm-vlm"
        ),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
        gen_kwargs=eval_config.get("gen_kwargs"),
        batch_size=batch_size,
    )
    return results


def test_lm_eval_correctness_param(config_filename, tp_size):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

    results = launch_lm_eval(eval_config, tp_size)

    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
                f"ground_truth={ground_truth} | measured={measured_value}"
            )
            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)

    assert success
@@ -1,134 +0,0 @@
# vLLM benchmark suite

## Introduction

This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves or degrades vLLM's performance.
vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted on the PyTorch CI HUD.

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.

**Benchmarking Duration**: about 1 hr.

**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.

## Trigger the benchmark

The benchmark needs to be triggered manually:

```bash
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
```

Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.

## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.

### Latency test

Here is an example of one test inside `latency-tests.json`:

```json
[
    {
        "test_name": "latency_llama8B_tp1",
        "parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
]
```

In this example:

- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command-line arguments used for `vllm bench latency`. Note that you should use an underscore `_` instead of a dash `-` when specifying the arguments, and `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `vllm bench latency`. For example, the corresponding command-line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`, as sketched below.
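
As a rough illustration, here is a minimal sketch (not the actual logic in `run-performance-benchmarks.sh`, which is a shell script) of how such a `parameters` entry maps onto the CLI flags; the dict literal below is copied from the example above:

```python
# Sketch only: build vllm bench latency flags from a "parameters" dict.
params = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}

args = []
for key, value in params.items():
    args.append(f"--{key.replace('_', '-')}")  # underscores become dashes
    args.append(str(value))

print("vllm bench latency " + " ".join(args))
# vllm bench latency --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1
#   --load-format dummy --num-iters-warmup 5 --num-iters 15
```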

Note that the performance numbers are highly sensitive to the values of these parameters. Please make sure the parameters are set correctly.

WARNING: The benchmarking script will save JSON results by itself, so please do not configure the `--output-json` parameter in the JSON file.

### Throughput test

The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters are fed forward to `vllm bench throughput`.

The numbers from this test are also stable across runs; however, a slight change in the parameter values can change the measured performance by a lot.

### Serving test

We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

```json
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "tensor_parallel_size": 1,
            "swap_space": 16,
            "disable_log_stats": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3-8B",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
]
```

Inside this example:

- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server_parameters` attribute includes the command-line arguments for the vLLM server.
- The `client_parameters` attribute includes the command-line arguments for `vllm bench serve`.
- The `qps_list` attribute controls the list of QPS values to test. It is used to configure the `--request-rate` parameter in `vllm bench serve`.

The numbers from this test are less stable than those of the latency and throughput benchmarks (due to the randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in the parameters (e.g., a 5% change) will still shift the results greatly.

WARNING: The benchmarking script will save JSON results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

### Visualizing the results

The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table by formatting [descriptions.md](performance-benchmarks-descriptions.md) with the real benchmarking results.
You can find the resulting table on the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The JSON version of the table (together with the JSON version of the benchmark) is also appended to the markdown file.
The raw benchmarking results (as JSON files) are available in the `Artifacts` tab of the benchmarking run.
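
The snippet below is an illustrative sketch (not the actual `convert-results-json-to-markdown.py`, which also builds the tables from the raw result files) of the formatting step described above: the placeholders in the descriptions file are filled in with already-rendered markdown tables. The table strings here are stand-ins.

```python
# Sketch only: fill the placeholders in performance-benchmarks-descriptions.md
# with pre-rendered markdown tables and the aggregated JSON string.
from pathlib import Path

description = Path("performance-benchmarks-descriptions.md").read_text()

benchmark_results_md = description.format(
    latency_tests_markdown_table="| Test name | Mean latency (ms) |\n|---|---|",
    throughput_tests_markdown_table="| Test name | Tput (tok/s) |\n|---|---|",
    serving_tests_markdown_table="| Test name | Median TTFT (ms) |\n|---|---|",
    platform_markdown_table="| GPU | Driver |\n|---|---|",
    benchmarking_results_in_json_string="{}",
)

Path("benchmark_results.md").write_text(benchmark_results_md)
```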

The `compare-json-results.py` script compares benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and reports the performance ratio, e.g., for Output Tput, Median TTFT and Median TPOT.
If only one `benchmark_results.json` is passed, `compare-json-results.py` instead compares the different TP and PP configurations within that file.

Here is an example of using the script to compare result_a and result_b on Model, Dataset Name, input/output length, max concurrency and qps:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|---|---|---|---|---|---|---|---|---|---|
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf | 241.620334 | 294.018783 | 1.216863 |

A comparison diagram will be generated below the table.
Here is an example comparing 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3:
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
@@ -1,65 +0,0 @@
# Performance benchmarks descriptions

## Latency tests

- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: end-to-end latency (mean, median, p99).

{latency_tests_markdown_table}

## Throughput tests

- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput.

{throughput_tests_markdown_table}

## Serving tests

- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed), as sketched after this list.
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
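
For intuition, here is a minimal sketch (assuming numpy; this is not the benchmark client's actual implementation) of how Poisson arrivals at a given average QPS can be generated from exponentially distributed inter-arrival gaps:

```python
# Sketch only: Poisson request arrivals for a given average QPS.
import numpy as np

rng = np.random.default_rng(seed=0)  # fixed random seed, as described above
qps = 4.0
num_requests = 200

# Inter-arrival gaps of a Poisson process are exponential with mean 1/qps;
# the cumulative sum gives each request's send time in seconds.
gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
arrival_times = np.cumsum(gaps)

print("last request arrives at ~", round(float(arrival_times[-1]), 1), "seconds")
```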

{serving_tests_markdown_table}

## Platform Information

{platform_markdown_table}

## json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:

```python
import json
import pandas as pd

benchmarking_results_json = """The json string"""
benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```

The json string for all benchmarking tables:

```json
{benchmarking_results_in_json_string}
```

You can also check the raw experiment data in the Artifact tab of the Buildkite page.
| @ -1,456 +0,0 @@ | |||||||
| # SPDX-License-Identifier: Apache-2.0 |  | ||||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project |  | ||||||
| import argparse |  | ||||||
| import json |  | ||||||
| import os |  | ||||||
| from importlib import util |  | ||||||
|  |  | ||||||
| import pandas as pd |  | ||||||
|  |  | ||||||
| pd.options.display.float_format = "{:.2f}".format |  | ||||||
| plotly_found = util.find_spec("plotly.express") is not None |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def compare_data_columns( |  | ||||||
|     files, name_column, data_column, info_cols, drop_column, debug=False |  | ||||||
| ): |  | ||||||
|     """ |  | ||||||
|     Align concatenation by keys derived from info_cols instead of row order. |  | ||||||
|     - Pick one canonical key list: subset of info_cols present in ALL files. |  | ||||||
|     - For each file: set index to those keys, aggregate duplicates |  | ||||||
|     - (mean for metric, first for names). |  | ||||||
|     - Concat along axis=1 (indexes align), then reset_index so callers can |  | ||||||
|     - group by columns. |  | ||||||
|     - If --debug, add a <file_label>_name column per file. |  | ||||||
|     """ |  | ||||||
|     print("\ncompare_data_column:", data_column) |  | ||||||
|  |  | ||||||
|     frames = [] |  | ||||||
|     raw_data_cols = [] |  | ||||||
|     compare_frames = [] |  | ||||||
|  |  | ||||||
|     # 1) choose a canonical key list from info_cols that exists in ALL files |  | ||||||
|     cols_per_file = [] |  | ||||||
|     for f in files: |  | ||||||
|         try: |  | ||||||
|             df_tmp = pd.read_json(f, orient="records") |  | ||||||
|         except Exception as err: |  | ||||||
|             raise ValueError(f"Failed to read {f}") from err |  | ||||||
|         cols_per_file.append(set(df_tmp.columns)) |  | ||||||
|  |  | ||||||
|     key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)] |  | ||||||
|     if not key_cols: |  | ||||||
|         # soft fallback: use any info_cols present in the first file |  | ||||||
|         key_cols = [c for c in info_cols if c in list(cols_per_file[0])] |  | ||||||
|     if not key_cols: |  | ||||||
|         raise ValueError( |  | ||||||
|             "No common key columns found from info_cols across the input files." |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     # 2) build a single "meta" block (keys as columns) once, aligned by the key index |  | ||||||
|     meta_added = False |  | ||||||
|  |  | ||||||
|     for file in files: |  | ||||||
|         df = pd.read_json(file, orient="records") |  | ||||||
|  |  | ||||||
|         # Keep rows that actually have the compared metric (same as original behavior) |  | ||||||
|         if drop_column in df.columns: |  | ||||||
|             df = df.dropna(subset=[drop_column], ignore_index=True) |  | ||||||
|  |  | ||||||
|         # Stabilize numeric key columns (harmless if missing) |  | ||||||
|         for c in ( |  | ||||||
|             "Input Len", |  | ||||||
|             "Output Len", |  | ||||||
|             "TP Size", |  | ||||||
|             "PP Size", |  | ||||||
|             "# of max concurrency.", |  | ||||||
|             "qps", |  | ||||||
|         ): |  | ||||||
|             if c in df.columns: |  | ||||||
|                 df[c] = pd.to_numeric(df[c], errors="coerce") |  | ||||||
|  |  | ||||||
|         # Ensure all key columns exist |  | ||||||
|         for c in key_cols: |  | ||||||
|             if c not in df.columns: |  | ||||||
|                 df[c] = pd.NA |  | ||||||
|  |  | ||||||
|         # Set index = key_cols and aggregate duplicates → unique MultiIndex |  | ||||||
|         df_idx = df.set_index(key_cols, drop=False) |  | ||||||
|  |  | ||||||
|         # meta (key columns), unique per key |  | ||||||
|         meta = df_idx[key_cols] |  | ||||||
|         if not meta.index.is_unique: |  | ||||||
|             meta = meta.groupby(level=key_cols, dropna=False).first() |  | ||||||
|  |  | ||||||
|         # metric series for this file, aggregated to one row per key |  | ||||||
|         file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file) |  | ||||||
|         s = df_idx[data_column] |  | ||||||
|         if not s.index.is_unique: |  | ||||||
|             s = s.groupby(level=key_cols, dropna=False).mean() |  | ||||||
|         s.name = file_label  # column label like original |  | ||||||
|  |  | ||||||
|         # add meta once (from first file) so keys are the leftmost columns |  | ||||||
|         if not meta_added: |  | ||||||
|             frames.append(meta) |  | ||||||
|             meta_added = True |  | ||||||
|  |  | ||||||
|         # (NEW) debug: aligned test-name column per file |  | ||||||
|         if debug and name_column in df_idx.columns: |  | ||||||
|             name_s = df_idx[name_column] |  | ||||||
|             if not name_s.index.is_unique: |  | ||||||
|                 name_s = name_s.groupby(level=key_cols, dropna=False).first() |  | ||||||
|             name_s.name = f"{file_label}_name" |  | ||||||
|             frames.append(name_s) |  | ||||||
|  |  | ||||||
|         frames.append(s) |  | ||||||
|         raw_data_cols.append(file_label) |  | ||||||
|         compare_frames.append(s) |  | ||||||
|  |  | ||||||
|         # Generalize ratio: for any file N>=2, add ratio (fileN / file1) |  | ||||||
|         if len(compare_frames) >= 2: |  | ||||||
|             base = compare_frames[0] |  | ||||||
|             current = compare_frames[-1] |  | ||||||
|             if "P99" in data_column or "Median" in data_column: |  | ||||||
|                 ratio = base / current  # for latency |  | ||||||
|             else: |  | ||||||
|                 ratio = current / base |  | ||||||
|             ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0 |  | ||||||
|             ratio.name = f"Ratio 1 vs {len(compare_frames)}" |  | ||||||
|             frames.append(ratio) |  | ||||||
|  |  | ||||||
|     # 4) concat on columns with aligned MultiIndex; |  | ||||||
|     # then reset_index to return keys as columns |  | ||||||
|     concat_df = pd.concat(frames, axis=1) |  | ||||||
|     concat_df = concat_df.reset_index(drop=True).reset_index() |  | ||||||
|     if "index" in concat_df.columns: |  | ||||||
|         concat_df = concat_df.drop(columns=["index"]) |  | ||||||
|  |  | ||||||
|     # Ensure key/info columns appear first (in your info_cols order) |  | ||||||
|     front = [c for c in info_cols if c in concat_df.columns] |  | ||||||
|     rest = [c for c in concat_df.columns if c not in front] |  | ||||||
|     concat_df = concat_df[front + rest] |  | ||||||
|  |  | ||||||
|     print(raw_data_cols) |  | ||||||
|     return concat_df, raw_data_cols |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def split_json_by_tp_pp( |  | ||||||
|     input_file: str = "benchmark_results.json", output_root: str = "." |  | ||||||
| ) -> list[str]: |  | ||||||
|     """ |  | ||||||
|     Split a benchmark JSON into separate folders by (TP Size, PP Size). |  | ||||||
|  |  | ||||||
|     Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json |  | ||||||
|     Returns: list of file paths written. |  | ||||||
|     """ |  | ||||||
|     # Load JSON data into DataFrame |  | ||||||
|     with open(input_file, encoding="utf-8") as f: |  | ||||||
|         data = json.load(f) |  | ||||||
|  |  | ||||||
|     # If the JSON is a dict with a list under common keys, use that list |  | ||||||
|     if isinstance(data, dict): |  | ||||||
|         for key in ("results", "serving_results", "benchmarks", "data"): |  | ||||||
|             if isinstance(data.get(key), list): |  | ||||||
|                 data = data[key] |  | ||||||
|                 break |  | ||||||
|  |  | ||||||
|     df = pd.DataFrame(data) |  | ||||||
|  |  | ||||||
|     # Keep only "serving" tests |  | ||||||
|     name_col = next( |  | ||||||
|         (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None |  | ||||||
|     ) |  | ||||||
|     if name_col: |  | ||||||
|         df = df[ |  | ||||||
|             df[name_col].astype(str).str.contains(r"serving", case=False, na=False) |  | ||||||
|         ].copy() |  | ||||||
|  |  | ||||||
|     # Handle alias column names |  | ||||||
|     rename_map = { |  | ||||||
|         "tp_size": "TP Size", |  | ||||||
|         "tensor_parallel_size": "TP Size", |  | ||||||
|         "pp_size": "PP Size", |  | ||||||
|         "pipeline_parallel_size": "PP Size", |  | ||||||
|     } |  | ||||||
|     df.rename( |  | ||||||
|         columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     # Ensure TP/PP columns exist (default to 1 if missing) |  | ||||||
|     if "TP Size" not in df.columns: |  | ||||||
|         df["TP Size"] = 1 |  | ||||||
|     if "PP Size" not in df.columns: |  | ||||||
|         df["PP Size"] = 1 |  | ||||||
|  |  | ||||||
|     # make sure TP/PP are numeric ints with no NaN |  | ||||||
|     df["TP Size"] = ( |  | ||||||
|         pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int) |  | ||||||
|     ) |  | ||||||
|     df["PP Size"] = ( |  | ||||||
|         pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int) |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     # Split into separate folders |  | ||||||
|     saved_paths: list[str] = [] |  | ||||||
|     for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False): |  | ||||||
|         folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}") |  | ||||||
|         os.makedirs(folder_name, exist_ok=True) |  | ||||||
|         filepath = os.path.join(folder_name, "benchmark_results.json") |  | ||||||
|         group_df.to_json(filepath, orient="records", indent=2, force_ascii=False) |  | ||||||
|         print(f"Saved: {filepath}") |  | ||||||
|         saved_paths.append(filepath) |  | ||||||
|  |  | ||||||
|     return saved_paths |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _add_limit_line(fig, y_value, label): |  | ||||||
|     # Visible dashed line + annotation |  | ||||||
|     fig.add_hline( |  | ||||||
|         y=y_value, |  | ||||||
|         line_dash="dash", |  | ||||||
|         line_color="red" if "ttft" in label.lower() else "blue", |  | ||||||
|         annotation_text=f"{label}: {y_value} ms", |  | ||||||
|         annotation_position="top left", |  | ||||||
|     ) |  | ||||||
|     # Optional: add a legend item (as a transparent helper trace) |  | ||||||
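|     # NOTE: "plot" and "plotly_found" are module-level flags (plot is set under __main__, plotly_found at import time). |  | ||||||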
|     if plot and plotly_found: |  | ||||||
|         import plotly.graph_objects as go |  | ||||||
|  |  | ||||||
|         fig.add_trace( |  | ||||||
|             go.Scatter( |  | ||||||
|                 x=[None], |  | ||||||
|                 y=[None], |  | ||||||
|                 mode="lines", |  | ||||||
|                 line=dict( |  | ||||||
|                     dash="dash", color="red" if "ttft" in label.lower() else "blue" |  | ||||||
|                 ), |  | ||||||
|                 name=f"{label}", |  | ||||||
|             ) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _find_concurrency_col(df: pd.DataFrame) -> str: |  | ||||||
|     for c in [ |  | ||||||
|         "# of max concurrency.", |  | ||||||
|         "# of max concurrency", |  | ||||||
|         "Max Concurrency", |  | ||||||
|         "max_concurrency", |  | ||||||
|         "Concurrency", |  | ||||||
|     ]: |  | ||||||
|         if c in df.columns: |  | ||||||
|             return c |  | ||||||
|     # Fallback: guess an integer-like column (harmless if unused) |  | ||||||
|     for c in df.columns: |  | ||||||
|         if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: |  | ||||||
|             return c |  | ||||||
|     return "# of max concurrency." |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _highlight_threshold( |  | ||||||
|     df: pd.DataFrame, threshold: float |  | ||||||
| ) -> "pd.io.formats.style.Styler": |  | ||||||
|     """Highlight numeric per-configuration columns with value <= threshold.""" |  | ||||||
|     conc_col = _find_concurrency_col(df) |  | ||||||
|     key_cols = [ |  | ||||||
|         c |  | ||||||
|         for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] |  | ||||||
|         if c in df.columns |  | ||||||
|     ] |  | ||||||
|     conf_cols = [ |  | ||||||
|         c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio") |  | ||||||
|     ] |  | ||||||
|     conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] |  | ||||||
|     return df.style.map( |  | ||||||
|         lambda v: "background-color:#e6ffe6;font-weight:bold;" |  | ||||||
|         if pd.notna(v) and v <= threshold |  | ||||||
|         else "", |  | ||||||
|         subset=conf_cols, |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| if __name__ == "__main__": |  | ||||||
|     parser = argparse.ArgumentParser() |  | ||||||
|     parser.add_argument( |  | ||||||
|         "-f", "--file", action="append", type=str, help="input file name" |  | ||||||
|     ) |  | ||||||
|     parser.add_argument( |  | ||||||
|         "--debug", action="store_true", help="show all information for debugging" |  | ||||||
|     ) |  | ||||||
|     parser.add_argument( |  | ||||||
|         "--plot", |  | ||||||
|         action=argparse.BooleanOptionalAction, |  | ||||||
|         default=True, |  | ||||||
|         help="plot perf diagrams or not --no-plot --plot", |  | ||||||
|     ) |  | ||||||
|     parser.add_argument( |  | ||||||
|         "-x", |  | ||||||
|         "--xaxis", |  | ||||||
|         type=str, |  | ||||||
|         default="# of max concurrency.", |  | ||||||
|         help="column name to use as X Axis in comparison graph", |  | ||||||
|     ) |  | ||||||
|     parser.add_argument( |  | ||||||
|         "-l", |  | ||||||
|         "--latency", |  | ||||||
|         type=str, |  | ||||||
|         default="p99", |  | ||||||
|         help="take median|p99 for latency like TTFT/TPOT", |  | ||||||
|     ) |  | ||||||
|     parser.add_argument( |  | ||||||
|         "--ttft-max-ms", |  | ||||||
|         type=float, |  | ||||||
|         default=3000.0, |  | ||||||
|         help="Reference limit for TTFT plots (ms)", |  | ||||||
|     ) |  | ||||||
|     parser.add_argument( |  | ||||||
|         "--tpot-max-ms", |  | ||||||
|         type=float, |  | ||||||
|         default=100.0, |  | ||||||
|         help="Reference limit for TPOT plots (ms)", |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     args = parser.parse_args() |  | ||||||
|  |  | ||||||
|     drop_column = "P99" |  | ||||||
|     name_column = "Test name" |  | ||||||
|     info_cols = [ |  | ||||||
|         "Model", |  | ||||||
|         "Dataset Name", |  | ||||||
|         "Input Len", |  | ||||||
|         "Output Len", |  | ||||||
|         "TP Size", |  | ||||||
|         "PP Size", |  | ||||||
|         "# of max concurrency.", |  | ||||||
|         "qps", |  | ||||||
|     ] |  | ||||||
|  |  | ||||||
|     if "median" in args.latency: |  | ||||||
|         data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] |  | ||||||
|         html_msgs_for_data_cols = [ |  | ||||||
|             "Compare Output Tokens /n", |  | ||||||
|             "Median TTFT /n", |  | ||||||
|             "Median TPOT /n", |  | ||||||
|         ] |  | ||||||
|         drop_column = "P99" |  | ||||||
|     elif "p99" in args.latency: |  | ||||||
|         data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] |  | ||||||
|         html_msgs_for_data_cols = [ |  | ||||||
|             "Compare Output Tokens /n", |  | ||||||
|             "P99 TTFT /n", |  | ||||||
|             "P99 TPOT /n", |  | ||||||
|         ] |  | ||||||
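|     else: |  | ||||||
|         # Defensive guard (added): only "median" and "p99" are handled above. |  | ||||||
|         raise ValueError(f"Unsupported --latency value: {args.latency!r}") |  | ||||||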
|  |  | ||||||
|     if len(args.file) == 1: |  | ||||||
|         files = split_json_by_tp_pp(args.file[0], output_root="splits") |  | ||||||
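|         # TP/PP are constant within each split folder, so drop them from the grouping keys |  | ||||||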
|         info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")] |  | ||||||
|     else: |  | ||||||
|         files = args.file |  | ||||||
|     print("comparing : " + ", ".join(files)) |  | ||||||
|     debug = args.debug |  | ||||||
|     plot = args.plot |  | ||||||
|     # For the plot feature, pick the info_cols column (args.xaxis) to plot against |  | ||||||
|     y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 |  | ||||||
|     with open("perf_comparison.html", "w") as text_file: |  | ||||||
|         for i in range(len(data_cols_to_compare)): |  | ||||||
|             output_df, raw_data_cols = compare_data_columns( |  | ||||||
|                 files, |  | ||||||
|                 name_column, |  | ||||||
|                 data_cols_to_compare[i], |  | ||||||
|                 info_cols, |  | ||||||
|                 drop_column, |  | ||||||
|                 debug=debug, |  | ||||||
|             ) |  | ||||||
|  |  | ||||||
|             # For the plot feature, prepend that axis column to the raw data columns |  | ||||||
|             raw_data_cols.insert(0, info_cols[y_axis_index]) |  | ||||||
|  |  | ||||||
|             filtered_info_cols = info_cols[:-2] |  | ||||||
|             existing_group_cols = [ |  | ||||||
|                 c for c in filtered_info_cols if c in output_df.columns |  | ||||||
|             ] |  | ||||||
|             if not existing_group_cols: |  | ||||||
|                 raise ValueError( |  | ||||||
|                     f"No valid group-by columns  " |  | ||||||
|                     f"Expected subset: {filtered_info_cols}, " |  | ||||||
|                     f"but DataFrame has: {list(output_df.columns)}" |  | ||||||
|                 ) |  | ||||||
|             # output_df_sorted = output_df.sort_values(by=existing_group_cols) |  | ||||||
|             output_df_sorted = output_df.sort_values(by=args.xaxis) |  | ||||||
|             output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) |  | ||||||
|             for name, group in output_groups: |  | ||||||
|                 group_name = ( |  | ||||||
|                     ",".join(map(str, name)).replace(",", "_").replace("/", "-") |  | ||||||
|                 ) |  | ||||||
|                 group_html_name = "perf_comparison_" + group_name + ".html" |  | ||||||
|  |  | ||||||
|                 metric_name = str(data_cols_to_compare[i]).lower() |  | ||||||
|                 if "tok/s" in metric_name: |  | ||||||
|                     html = group.to_html() |  | ||||||
|                 elif "ttft" in metric_name: |  | ||||||
|                     styler = _highlight_threshold(group, args.ttft_max_ms).format( |  | ||||||
|                         {c: "{:.2f}" for c in group.select_dtypes("number").columns}, |  | ||||||
|                         na_rep="—", |  | ||||||
|                     ) |  | ||||||
|                     html = styler.to_html( |  | ||||||
|                         table_attributes='border="1" class="dataframe"' |  | ||||||
|                     ) |  | ||||||
|                 elif ( |  | ||||||
|                     "tpot" in metric_name |  | ||||||
|                     or "median" in metric_name |  | ||||||
|                     or "p99" in metric_name |  | ||||||
|                 ): |  | ||||||
|                     styler = _highlight_threshold(group, args.tpot_max_ms).format( |  | ||||||
|                         {c: "{:.2f}" for c in group.select_dtypes("number").columns}, |  | ||||||
|                         na_rep="—", |  | ||||||
|                     ) |  | ||||||
|                     html = styler.to_html( |  | ||||||
|                         table_attributes='border="1" class="dataframe"' |  | ||||||
|                     ) |  | ||||||
|  |  | ||||||
|                 text_file.write(html_msgs_for_data_cols[i]) |  | ||||||
|                 text_file.write(html) |  | ||||||
|                 with open(group_html_name, "a+") as sub_text_file: |  | ||||||
|                     sub_text_file.write(html_msgs_for_data_cols[i]) |  | ||||||
|                     sub_text_file.write(html) |  | ||||||
|  |  | ||||||
|                     if plot and plotly_found: |  | ||||||
|                         import plotly.express as px |  | ||||||
|  |  | ||||||
|                         df = group[raw_data_cols] |  | ||||||
|                         df_sorted = df.sort_values(by=info_cols[y_axis_index]) |  | ||||||
|                         # Melt DataFrame for plotting |  | ||||||
|                         df_melted = df_sorted.melt( |  | ||||||
|                             id_vars=info_cols[y_axis_index], |  | ||||||
|                             var_name="Configuration", |  | ||||||
|                             value_name=data_cols_to_compare[i], |  | ||||||
|                         ) |  | ||||||
|                         title = ( |  | ||||||
|                             data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] |  | ||||||
|                         ) |  | ||||||
|                         # Create Plotly line chart |  | ||||||
|                         fig = px.line( |  | ||||||
|                             df_melted, |  | ||||||
|                             x=info_cols[y_axis_index], |  | ||||||
|                             y=data_cols_to_compare[i], |  | ||||||
|                             color="Configuration", |  | ||||||
|                             title=title, |  | ||||||
|                             markers=True, |  | ||||||
|                         ) |  | ||||||
|  |  | ||||||
|                         # ---- Add threshold lines based on metric name ---- |  | ||||||
|                         if "ttft" in metric_name: |  | ||||||
|                             _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") |  | ||||||
|                         elif ( |  | ||||||
|                             "tpot" in metric_name |  | ||||||
|                             or "median" in metric_name |  | ||||||
|                             or "p99" in metric_name |  | ||||||
|                         ): |  | ||||||
|                             _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") |  | ||||||
|  |  | ||||||
|                         # Export to HTML |  | ||||||
|                         text_file.write( |  | ||||||
|                             fig.to_html(full_html=True, include_plotlyjs="cdn") |  | ||||||
|                         ) |  | ||||||
|                         sub_text_file.write( |  | ||||||
|                             fig.to_html(full_html=True, include_plotlyjs="cdn") |  | ||||||
|                         ) |  | ||||||
| @ -1,414 +0,0 @@ | |||||||
| # SPDX-License-Identifier: Apache-2.0 |  | ||||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project |  | ||||||
|  |  | ||||||
| import argparse |  | ||||||
| import json |  | ||||||
| import os |  | ||||||
| import shlex |  | ||||||
| from importlib import util |  | ||||||
| from pathlib import Path |  | ||||||
| from typing import Any |  | ||||||
|  |  | ||||||
| import pandas as pd |  | ||||||
| import psutil |  | ||||||
| import regex as re |  | ||||||
| from tabulate import tabulate |  | ||||||
|  |  | ||||||
| # latency results and the keys that will be printed into markdown |  | ||||||
| latency_results = [] |  | ||||||
| latency_column_mapping = { |  | ||||||
|     "test_name": "Test name", |  | ||||||
|     "gpu_type": "GPU", |  | ||||||
|     "avg_latency": "Mean latency (ms)", |  | ||||||
|     # "P10": "P10 (s)", |  | ||||||
|     # "P25": "P25 (s)", |  | ||||||
|     "P50": "Median latency (ms)", |  | ||||||
|     # "P75": "P75 (s)", |  | ||||||
|     # "P90": "P90 (s)", |  | ||||||
|     "P99": "P99 latency (ms)", |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # throughput tests and the keys that will be printed into markdown |  | ||||||
| throughput_results = [] |  | ||||||
| throughput_results_column_mapping = { |  | ||||||
|     "test_name": "Test name", |  | ||||||
|     "gpu_type": "GPU", |  | ||||||
|     "num_requests": "# of req.", |  | ||||||
|     "total_num_tokens": "Total # of tokens", |  | ||||||
|     "elapsed_time": "Elapsed time (s)", |  | ||||||
|     "requests_per_second": "Tput (req/s)", |  | ||||||
|     "tokens_per_second": "Tput (tok/s)", |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # serving results and the keys that will be printed into markdown |  | ||||||
| serving_results = [] |  | ||||||
| serving_column_mapping = { |  | ||||||
|     "test_name": "Test name", |  | ||||||
|     "model_id": "Model", |  | ||||||
|     "dataset_name": "Dataset Name", |  | ||||||
|     "input_len": "Input Len", |  | ||||||
|     "output_len": "Output Len", |  | ||||||
|     "tp_size": "TP Size", |  | ||||||
|     "pp_size": "PP Size", |  | ||||||
|     "dtype": "dtype", |  | ||||||
|     "gpu_type": "GPU", |  | ||||||
|     "completed": "# of req.", |  | ||||||
|     "qps": "qps", |  | ||||||
|     "max_concurrency": "# of max concurrency.", |  | ||||||
|     "request_throughput": "Tput (req/s)", |  | ||||||
|     "total_token_throughput": "Total Token Tput (tok/s)", |  | ||||||
|     "output_throughput": "Output Tput (tok/s)", |  | ||||||
|     # "total_input_tokens": "Total input tokens", |  | ||||||
|     # "total_output_tokens": "Total output tokens", |  | ||||||
|     "mean_ttft_ms": "Mean TTFT (ms)", |  | ||||||
|     "median_ttft_ms": "Median TTFT (ms)", |  | ||||||
|     "p99_ttft_ms": "P99 TTFT (ms)", |  | ||||||
|     "std_ttft_ms": "STD TTFT (ms)", |  | ||||||
|     "mean_tpot_ms": "Mean TPOT (ms)", |  | ||||||
|     "median_tpot_ms": "Median", |  | ||||||
|     "p99_tpot_ms": "P99", |  | ||||||
|     "std_tpot_ms": "STD TPOT (ms)", |  | ||||||
|     "mean_itl_ms": "Mean ITL (ms)", |  | ||||||
|     "median_itl_ms": "Median ITL (ms)", |  | ||||||
|     "p99_itl_ms": "P99 ITL (ms)", |  | ||||||
| } |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def read_markdown(file): |  | ||||||
|     if os.path.exists(file): |  | ||||||
|         with open(file) as f: |  | ||||||
|             return f.read() + "\n" |  | ||||||
|     else: |  | ||||||
|         return f"{file} not found.\n" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def results_to_json(latency, throughput, serving): |  | ||||||
|     return json.dumps( |  | ||||||
|         { |  | ||||||
|             "latency": latency.to_dict(), |  | ||||||
|             "throughput": throughput.to_dict(), |  | ||||||
|             "serving": serving.to_dict(), |  | ||||||
|         } |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_size_with_unit(bytes, suffix="B"): |  | ||||||
|     """ |  | ||||||
|     Scale bytes to its proper format |  | ||||||
|     e.g: |  | ||||||
|         1253656 => '1.20MB' |  | ||||||
|         1253656678 => '1.17GB' |  | ||||||
|     """ |  | ||||||
|     factor = 1024 |  | ||||||
|     for unit in ["", "K", "M", "G", "T", "P"]: |  | ||||||
|         if bytes < factor: |  | ||||||
|             return f"{bytes:.2f}{unit}{suffix}" |  | ||||||
|         bytes /= factor |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def _coerce(val: str) -> Any: |  | ||||||
|     """Best-effort type coercion from string to Python types.""" |  | ||||||
|     low = val.lower() |  | ||||||
|     if low == "null": |  | ||||||
|         return None |  | ||||||
|     if low == "true": |  | ||||||
|         return True |  | ||||||
|     if low == "false": |  | ||||||
|         return False |  | ||||||
|     # integers |  | ||||||
|     if re.fullmatch(r"[+-]?\d+", val): |  | ||||||
|         try: |  | ||||||
|             return int(val) |  | ||||||
|         except ValueError: |  | ||||||
|             pass |  | ||||||
|     # floats (keep 'inf'/'-inf'/'nan' as strings) |  | ||||||
|     if re.fullmatch(r"[+-]?\d*\.\d+", val): |  | ||||||
|         try: |  | ||||||
|             return float(val) |  | ||||||
|         except ValueError: |  | ||||||
|             pass |  | ||||||
|     return val |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def parse_client_command(cmd: str) -> dict[str, Any]: |  | ||||||
|     """Parse the client_command shell string into {executable, script, args}.""" |  | ||||||
|     toks = shlex.split(cmd) |  | ||||||
|     if len(toks) < 2: |  | ||||||
|         raise ValueError("client_command must include an executable and a script") |  | ||||||
|     executable, script = toks[0], toks[1] |  | ||||||
|     args: dict[str, Any] = {} |  | ||||||
|  |  | ||||||
|     i = 2 |  | ||||||
|     while i < len(toks): |  | ||||||
|         t = toks[i] |  | ||||||
|         if t.startswith("--"): |  | ||||||
|             # --key=value or --key (value) or boolean flag |  | ||||||
|             if "=" in t: |  | ||||||
|                 key, val = t.split("=", 1) |  | ||||||
|                 if key == "--metadata": |  | ||||||
|                     md = {} |  | ||||||
|                     if val: |  | ||||||
|                         if "=" in val: |  | ||||||
|                             k, v = val.split("=", 1) |  | ||||||
|                             md[k] = _coerce(v) |  | ||||||
|                         else: |  | ||||||
|                             md[val] = True |  | ||||||
|                     args[key] = md |  | ||||||
|                 else: |  | ||||||
|                     args[key] = _coerce(val) |  | ||||||
|                 i += 1 |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             key = t |  | ||||||
|  |  | ||||||
|             # Special: consume metadata k=v pairs until next --flag |  | ||||||
|             if key == "--metadata": |  | ||||||
|                 i += 1 |  | ||||||
|                 md = {} |  | ||||||
|                 while i < len(toks) and not toks[i].startswith("--"): |  | ||||||
|                     pair = toks[i] |  | ||||||
|                     if "=" in pair: |  | ||||||
|                         k, v = pair.split("=", 1) |  | ||||||
|                         md[k] = _coerce(v) |  | ||||||
|                     else: |  | ||||||
|                         md[pair] = True |  | ||||||
|                     i += 1 |  | ||||||
|                 args[key] = md |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             # Standard: check if next token is a value (not a flag) |  | ||||||
|             if i + 1 < len(toks) and not toks[i + 1].startswith("--"): |  | ||||||
|                 args[key] = _coerce(toks[i + 1]) |  | ||||||
|                 i += 2 |  | ||||||
|             else: |  | ||||||
|                 # lone flag -> True |  | ||||||
|                 args[key] = True |  | ||||||
|                 i += 1 |  | ||||||
|         else: |  | ||||||
|             # unexpected positional; skip |  | ||||||
|             i += 1 |  | ||||||
|  |  | ||||||
|     return {"executable": executable, "script": script, "args": args} |  | ||||||
|  |  | ||||||
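| # Illustrative example (hypothetical command string): |  | ||||||
| #   parse_client_command("python3 bench_serve.py --model m --request-rate 4 --metadata a=1 b=2") |  | ||||||
| #   -> {"executable": "python3", "script": "bench_serve.py", |  | ||||||
| #       "args": {"--model": "m", "--request-rate": 4, "--metadata": {"a": 1, "b": 2}}} |  | ||||||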
|  |  | ||||||
| if __name__ == "__main__": |  | ||||||
|     parser = argparse.ArgumentParser() |  | ||||||
|     parser.add_argument( |  | ||||||
|         "-r", |  | ||||||
|         "--result", |  | ||||||
|         type=str, |  | ||||||
|         default="results", |  | ||||||
|         help="Folder name for benchmark output results.", |  | ||||||
|     ) |  | ||||||
|     args = parser.parse_args() |  | ||||||
|     results_folder = Path(args.result) |  | ||||||
|     if not results_folder.exists(): |  | ||||||
|         raise FileNotFoundError(f"results folder does not exist: {results_folder}") |  | ||||||
|     # collect results |  | ||||||
|     for test_file in results_folder.glob("*.json"): |  | ||||||
|         with open(test_file) as f: |  | ||||||
|             raw_result = json.loads(f.read()) |  | ||||||
|  |  | ||||||
|         if "serving" in str(test_file): |  | ||||||
|             # this result is generated via `vllm bench serve` command |  | ||||||
|             # attach the benchmarking command to raw_result |  | ||||||
|             try: |  | ||||||
|                 with open(test_file.with_suffix(".commands")) as f: |  | ||||||
|                     command = json.loads(f.read()) |  | ||||||
|             except OSError as e: |  | ||||||
|                 print(e) |  | ||||||
|                 continue |  | ||||||
|             # Parse Server Command Arg |  | ||||||
|             out: dict[str, Any] = { |  | ||||||
|                 "server_command": parse_client_command(command["server_command"]) |  | ||||||
|             } |  | ||||||
|             parse_args = [ |  | ||||||
|                 "--tensor-parallel-size", |  | ||||||
|                 "--pipeline-parallel-size", |  | ||||||
|                 "--dtype", |  | ||||||
|             ] |  | ||||||
|             col_mapping = ["tp_size", "pp_size", "dtype"] |  | ||||||
|             for index, arg in enumerate(parse_args): |  | ||||||
|                 if arg in out["server_command"]["args"]: |  | ||||||
|                     raw_result.update( |  | ||||||
|                         {col_mapping[index]: out["server_command"]["args"][arg]} |  | ||||||
|                     ) |  | ||||||
|  |  | ||||||
|             # Parse Client Command Arg |  | ||||||
|             out: dict[str, Any] = { |  | ||||||
|                 "client_command": parse_client_command(command["client_command"]) |  | ||||||
|             } |  | ||||||
|             parse_args = [ |  | ||||||
|                 "--dataset-name", |  | ||||||
|                 "--random-input-len", |  | ||||||
|                 "--random-output-len", |  | ||||||
|                 "--request-rate", |  | ||||||
|             ] |  | ||||||
|             col_mapping = ["dataset_name", "input_len", "output_len", "qps"] |  | ||||||
|  |  | ||||||
|             for index, arg in enumerate(parse_args): |  | ||||||
|                 if arg in out["client_command"]["args"]: |  | ||||||
|                     raw_result.update( |  | ||||||
|                         {col_mapping[index]: out["client_command"]["args"][arg]} |  | ||||||
|                     ) |  | ||||||
|             # Add Server, Client command |  | ||||||
|             raw_result.update(command) |  | ||||||
|  |  | ||||||
|             # update the test name of this result |  | ||||||
|             raw_result.update({"test_name": test_file.stem}) |  | ||||||
|             # add the result to raw_result |  | ||||||
|             serving_results.append(raw_result) |  | ||||||
|             continue |  | ||||||
|  |  | ||||||
|         elif "latency" in f.name: |  | ||||||
|             # this result is generated via `vllm bench latency` command |  | ||||||
|  |  | ||||||
|             # attach the benchmarking command to raw_result |  | ||||||
|             try: |  | ||||||
|                 with open(test_file.with_suffix(".commands")) as f: |  | ||||||
|                     command = json.loads(f.read()) |  | ||||||
|             except OSError as e: |  | ||||||
|                 print(e) |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             raw_result.update(command) |  | ||||||
|  |  | ||||||
|             # update the test name of this result |  | ||||||
|             raw_result.update({"test_name": test_file.stem}) |  | ||||||
|  |  | ||||||
|             # get different percentiles |  | ||||||
|             for perc in [10, 25, 50, 75, 90, 99]: |  | ||||||
|                 # Multiply 1000 to convert the time unit from s to ms |  | ||||||
|                 raw_result.update( |  | ||||||
|                     {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]} |  | ||||||
|                 ) |  | ||||||
|             raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 |  | ||||||
|  |  | ||||||
|             # add the result to raw_result |  | ||||||
|             latency_results.append(raw_result) |  | ||||||
|             continue |  | ||||||
|  |  | ||||||
|         elif "throughput" in f.name: |  | ||||||
|             # this result is generated via `vllm bench throughput` command |  | ||||||
|  |  | ||||||
|             # attach the benchmarking command to raw_result |  | ||||||
|             try: |  | ||||||
|                 with open(test_file.with_suffix(".commands")) as f: |  | ||||||
|                     command = json.loads(f.read()) |  | ||||||
|             except OSError as e: |  | ||||||
|                 print(e) |  | ||||||
|                 continue |  | ||||||
|  |  | ||||||
|             raw_result.update(command) |  | ||||||
|  |  | ||||||
|             # update the test name of this result |  | ||||||
|             raw_result.update({"test_name": test_file.stem}) |  | ||||||
|  |  | ||||||
|             # add the result to raw_result |  | ||||||
|             throughput_results.append(raw_result) |  | ||||||
|             continue |  | ||||||
|  |  | ||||||
|         print(f"Skipping {test_file}") |  | ||||||
|  |  | ||||||
|     latency_results = pd.DataFrame.from_dict(latency_results) |  | ||||||
|     serving_results = pd.DataFrame.from_dict(serving_results) |  | ||||||
|     throughput_results = pd.DataFrame.from_dict(throughput_results) |  | ||||||
|  |  | ||||||
|     svmem = psutil.virtual_memory() |  | ||||||
|     platform_data = { |  | ||||||
|         "Physical cores": [psutil.cpu_count(logical=False)], |  | ||||||
|         "Total cores": [psutil.cpu_count(logical=True)], |  | ||||||
|         "Total Memory": [get_size_with_unit(svmem.total)], |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     if util.find_spec("numa") is not None: |  | ||||||
|         from numa import info |  | ||||||
|  |  | ||||||
|         platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()] |  | ||||||
|  |  | ||||||
|     if util.find_spec("cpuinfo") is not None: |  | ||||||
|         from cpuinfo import get_cpu_info |  | ||||||
|  |  | ||||||
|         platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]] |  | ||||||
|  |  | ||||||
|     platform_results = pd.DataFrame.from_dict( |  | ||||||
|         platform_data, orient="index", columns=["Platform Info"] |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     raw_results_json = results_to_json( |  | ||||||
|         latency_results, throughput_results, serving_results |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     # remapping the key, for visualization purpose |  | ||||||
|     if not latency_results.empty: |  | ||||||
|         latency_results = latency_results[list(latency_column_mapping.keys())].rename( |  | ||||||
|             columns=latency_column_mapping |  | ||||||
|         ) |  | ||||||
|     if not serving_results.empty: |  | ||||||
|         valid_columns = [ |  | ||||||
|             col for col in serving_column_mapping if col in serving_results.columns |  | ||||||
|         ] |  | ||||||
|         serving_results = serving_results[valid_columns].rename( |  | ||||||
|             columns=serving_column_mapping |  | ||||||
|         ) |  | ||||||
|     if not throughput_results.empty: |  | ||||||
|         throughput_results = throughput_results[ |  | ||||||
|             list(throughput_results_column_mapping.keys()) |  | ||||||
|         ].rename(columns=throughput_results_column_mapping) |  | ||||||
|  |  | ||||||
|     processed_results_json = results_to_json( |  | ||||||
|         latency_results, throughput_results, serving_results |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     for df in [latency_results, serving_results, throughput_results]: |  | ||||||
|         if df.empty: |  | ||||||
|             continue |  | ||||||
|  |  | ||||||
|         # Sort all dataframes by their respective "Test name" columns |  | ||||||
|         df.sort_values(by="Test name", inplace=True) |  | ||||||
|  |  | ||||||
|         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", |  | ||||||
|         # we want to turn it into "8xGPUTYPE" |  | ||||||
|         df["GPU"] = df["GPU"].apply( |  | ||||||
|             lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0]) |  | ||||||
|         ) |  | ||||||
|  |  | ||||||
|     # get markdown tables |  | ||||||
|     latency_md_table = tabulate( |  | ||||||
|         latency_results, headers="keys", tablefmt="pipe", showindex=False |  | ||||||
|     ) |  | ||||||
|     serving_md_table = tabulate( |  | ||||||
|         serving_results, headers="keys", tablefmt="pipe", showindex=False |  | ||||||
|     ) |  | ||||||
|     throughput_md_table = tabulate( |  | ||||||
|         throughput_results, headers="keys", tablefmt="pipe", showindex=False |  | ||||||
|     ) |  | ||||||
|     platform_md_table = tabulate( |  | ||||||
|         platform_results, headers="keys", tablefmt="pipe", showindex=True |  | ||||||
|     ) |  | ||||||
|  |  | ||||||
|     # document the result |  | ||||||
|     md_file = "benchmark_results.md" |  | ||||||
|     json_file = "benchmark_results.json" |  | ||||||
|     with open(results_folder / md_file, "w") as f: |  | ||||||
|         results = read_markdown( |  | ||||||
|             "../.buildkite/performance-benchmarks/" |  | ||||||
|             + "performance-benchmarks-descriptions.md" |  | ||||||
|         ) |  | ||||||
|         results = results.format( |  | ||||||
|             latency_tests_markdown_table=latency_md_table, |  | ||||||
|             throughput_tests_markdown_table=throughput_md_table, |  | ||||||
|             serving_tests_markdown_table=serving_md_table, |  | ||||||
|             platform_markdown_table=platform_md_table, |  | ||||||
|             benchmarking_results_in_json_string=processed_results_json, |  | ||||||
|         ) |  | ||||||
|         f.write(results) |  | ||||||
|  |  | ||||||
|     # document benchmarking results in json |  | ||||||
|     with open(results_folder / json_file, "w") as f: |  | ||||||
|         results = ( |  | ||||||
|             latency_results.to_dict(orient="records") |  | ||||||
|             + throughput_results.to_dict(orient="records") |  | ||||||
|             + serving_results.to_dict(orient="records") |  | ||||||
|         ) |  | ||||||
|         f.write(json.dumps(results)) |  | ||||||
| @ -1,224 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # Currently FP8 benchmark is NOT enabled. |  | ||||||
|  |  | ||||||
| set -x |  | ||||||
| server_params=$1 |  | ||||||
| common_params=$2 |  | ||||||
|  |  | ||||||
| json2args() { |  | ||||||
|   # transforms the JSON string to command line args, with '_' replaced by '-' |  | ||||||
|   # example: |  | ||||||
|   # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } |  | ||||||
|   # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 |  | ||||||
|   local json_string=$1 |  | ||||||
|   local args=$( |  | ||||||
|     echo "$json_string" | jq -r ' |  | ||||||
|       to_entries | |  | ||||||
|       map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | |  | ||||||
|       join(" ") |  | ||||||
|     ' |  | ||||||
|   ) |  | ||||||
|   echo "$args" |  | ||||||
| } |  | ||||||
|  |  | ||||||
| launch_trt_server() { |  | ||||||
|  |  | ||||||
|   model_path=$(echo "$common_params" | jq -r '.model') |  | ||||||
|   model_name="${model_path#*/}" |  | ||||||
|   model_type=$(echo "$server_params" | jq -r '.model_type') |  | ||||||
|   model_dtype=$(echo "$server_params" | jq -r '.model_dtype') |  | ||||||
|   model_tp_size=$(echo "$common_params" | jq -r '.tp') |  | ||||||
|   max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') |  | ||||||
|   max_input_len=$(echo "$server_params" | jq -r '.max_input_len') |  | ||||||
|   max_seq_len=$(echo "$server_params" | jq -r '.max_seq_len') |  | ||||||
|   max_num_tokens=$(echo "$server_params" | jq -r '.max_num_tokens') |  | ||||||
|   trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') |  | ||||||
|  |  | ||||||
|   # create model caching directory |  | ||||||
|   cd ~ |  | ||||||
|   rm -rf models |  | ||||||
|   mkdir -p models |  | ||||||
|   cd models |  | ||||||
|   models_dir=$(pwd) |  | ||||||
|   trt_model_path=${models_dir}/${model_name}-trt-ckpt |  | ||||||
|   trt_engine_path=${models_dir}/${model_name}-trt-engine |  | ||||||
|  |  | ||||||
|   # clone tensorrt backend |  | ||||||
|   cd / |  | ||||||
|   rm -rf tensorrtllm_backend |  | ||||||
|   git clone https://github.com/triton-inference-server/tensorrtllm_backend.git |  | ||||||
|   git lfs install |  | ||||||
|   cd tensorrtllm_backend |  | ||||||
|   git checkout "$trt_llm_version" |  | ||||||
|   git submodule update --init --recursive |  | ||||||
|  |  | ||||||
|   # build trtllm engine |  | ||||||
|   cd /tensorrtllm_backend |  | ||||||
|   cd "./tensorrt_llm/examples/${model_type}" |  | ||||||
|   python3 convert_checkpoint.py \ |  | ||||||
|     --model_dir "${model_path}" \ |  | ||||||
|     --dtype "${model_dtype}" \ |  | ||||||
|     --tp_size "${model_tp_size}" \ |  | ||||||
|     --output_dir "${trt_model_path}" |  | ||||||
|   trtllm-build \ |  | ||||||
|     --checkpoint_dir "${trt_model_path}" \ |  | ||||||
|     --use_fused_mlp \ |  | ||||||
|     --reduce_fusion disable \ |  | ||||||
|     --workers 8 \ |  | ||||||
|     --gpt_attention_plugin "${model_dtype}" \ |  | ||||||
|     --gemm_plugin "${model_dtype}" \ |  | ||||||
|     --tp_size "${model_tp_size}" \ |  | ||||||
|     --max_batch_size "${max_batch_size}" \ |  | ||||||
|     --max_input_len "${max_input_len}" \ |  | ||||||
|     --max_seq_len "${max_seq_len}" \ |  | ||||||
|     --max_num_tokens "${max_num_tokens}" \ |  | ||||||
|     --output_dir "${trt_engine_path}" |  | ||||||
|  |  | ||||||
|   # handle triton protobuf files and launch triton server |  | ||||||
|   cd /tensorrtllm_backend |  | ||||||
|   mkdir triton_model_repo |  | ||||||
|   cp -r all_models/inflight_batcher_llm/* triton_model_repo/ |  | ||||||
|   cd triton_model_repo |  | ||||||
|   rm -rf ./tensorrt_llm/1/* |  | ||||||
|   cp -r "${trt_engine_path}"/* ./tensorrt_llm/1 |  | ||||||
|   python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false |  | ||||||
|   python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5" |  | ||||||
|   python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false" |  | ||||||
|   python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size" |  | ||||||
|   python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1" |  | ||||||
|   cd /tensorrtllm_backend |  | ||||||
|   python3 scripts/launch_triton_server.py \ |  | ||||||
|     --world_size="${model_tp_size}" \ |  | ||||||
|     --model_repo=/tensorrtllm_backend/triton_model_repo & |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| launch_tgi_server() { |  | ||||||
|   model=$(echo "$common_params" | jq -r '.model') |  | ||||||
|   tp=$(echo "$common_params" | jq -r '.tp') |  | ||||||
|   port=$(echo "$common_params" | jq -r '.port') |  | ||||||
|   server_args=$(json2args "$server_params") |  | ||||||
|  |  | ||||||
|   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then |  | ||||||
|     echo "Key 'fp8' exists in common params." |  | ||||||
|     server_command="/tgi-entrypoint.sh \ |  | ||||||
|                 --model-id $model \ |  | ||||||
|                 --num-shard $tp \ |  | ||||||
|                 --port $port \ |  | ||||||
|                 --quantize fp8 \ |  | ||||||
|                 $server_args" |  | ||||||
|   else |  | ||||||
|     echo "Key 'fp8' does not exist in common params." |  | ||||||
|     server_command="/tgi-entrypoint.sh \ |  | ||||||
|                 --model-id $model \ |  | ||||||
|                 --num-shard $tp \ |  | ||||||
|                 --port $port \ |  | ||||||
|                 $server_args" |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   echo "Server command: $server_command" |  | ||||||
|   eval "$server_command" & |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| launch_lmdeploy_server() { |  | ||||||
|   model=$(echo "$common_params" | jq -r '.model') |  | ||||||
|   tp=$(echo "$common_params" | jq -r '.tp') |  | ||||||
|   port=$(echo "$common_params" | jq -r '.port') |  | ||||||
|   server_args=$(json2args "$server_params") |  | ||||||
|  |  | ||||||
|   server_command="lmdeploy serve api_server $model \ |  | ||||||
|     --tp $tp \ |  | ||||||
|     --server-port $port \ |  | ||||||
|     $server_args" |  | ||||||
|  |  | ||||||
|   # run the server |  | ||||||
|   echo "Server command: $server_command" |  | ||||||
|   bash -c "$server_command" & |  | ||||||
| } |  | ||||||
|  |  | ||||||
| launch_sglang_server() { |  | ||||||
|  |  | ||||||
|   model=$(echo "$common_params" | jq -r '.model') |  | ||||||
|   tp=$(echo "$common_params" | jq -r '.tp') |  | ||||||
|   port=$(echo "$common_params" | jq -r '.port') |  | ||||||
|   server_args=$(json2args "$server_params") |  | ||||||
|  |  | ||||||
|   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then |  | ||||||
|     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." |  | ||||||
|     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') |  | ||||||
|     server_command="python3 \ |  | ||||||
|         -m sglang.launch_server \ |  | ||||||
|         --tp $tp \ |  | ||||||
|         --model-path $model \ |  | ||||||
|         --port $port \ |  | ||||||
|         $server_args" |  | ||||||
|   else |  | ||||||
|     echo "Key 'fp8' does not exist in common params." |  | ||||||
|     server_command="python3 \ |  | ||||||
|         -m sglang.launch_server \ |  | ||||||
|         --tp $tp \ |  | ||||||
|         --model-path $model \ |  | ||||||
|         --port $port \ |  | ||||||
|         $server_args" |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   # run the server |  | ||||||
|   echo "Server command: $server_command" |  | ||||||
|   eval "$server_command" & |  | ||||||
| } |  | ||||||
|  |  | ||||||
| launch_vllm_server() { |  | ||||||
|  |  | ||||||
|   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') |  | ||||||
|  |  | ||||||
|   model=$(echo "$common_params" | jq -r '.model') |  | ||||||
|   tp=$(echo "$common_params" | jq -r '.tp') |  | ||||||
|   port=$(echo "$common_params" | jq -r '.port') |  | ||||||
|   server_args=$(json2args "$server_params") |  | ||||||
|  |  | ||||||
|   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then |  | ||||||
|     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." |  | ||||||
|     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') |  | ||||||
|     server_command="vllm serve $model \ |  | ||||||
|         -tp $tp \ |  | ||||||
|         --port $port \ |  | ||||||
|         $server_args" |  | ||||||
|   else |  | ||||||
|     echo "Key 'fp8' does not exist in common params." |  | ||||||
|     server_command="vllm serve $model \ |  | ||||||
|         -tp $tp \ |  | ||||||
|         --port $port \ |  | ||||||
|         $server_args" |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   # run the server |  | ||||||
|   echo "Server command: $server_command" |  | ||||||
|   eval "$server_command" & |  | ||||||
| } |  | ||||||
|  |  | ||||||
| main() { |  | ||||||
|  |  | ||||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then |  | ||||||
|     launch_trt_server |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then |  | ||||||
|     launch_tgi_server |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then |  | ||||||
|     launch_lmdeploy_server |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then |  | ||||||
|     launch_sglang_server |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   if [[ "$CURRENT_LLM_SERVING_ENGINE" == *"vllm"* ]]; then |  | ||||||
|     launch_vllm_server |  | ||||||
|   fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| main |  | ||||||
| @ -1,504 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # This script should be run inside the CI process |  | ||||||
| # This script assumes that we are already inside the vllm/ directory |  | ||||||
| # Benchmarking results will be available inside vllm/benchmarks/results/ |  | ||||||
|  |  | ||||||
| # Do not set -e, as the mixtral 8x22B model tends to crash occasionally |  | ||||||
| # and we still want to see other benchmarking results even when mixtral crashes. |  | ||||||
| set -x |  | ||||||
| set -o pipefail |  | ||||||
|  |  | ||||||
| check_gpus() { |  | ||||||
|   if command -v nvidia-smi; then |  | ||||||
|     # check the number of GPUs and GPU type. |  | ||||||
|     declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) |  | ||||||
|   elif command -v amd-smi; then |  | ||||||
|     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) |  | ||||||
|   elif command -v hl-smi; then |  | ||||||
|     declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l) |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   if [[ $gpu_count -gt 0 ]]; then |  | ||||||
|     echo "GPU found." |  | ||||||
|   else |  | ||||||
|     echo "Need at least 1 GPU to run benchmarking." |  | ||||||
|     exit 1 |  | ||||||
|   fi |  | ||||||
|    |  | ||||||
|   declare -g arch_suffix='' |  | ||||||
|    |  | ||||||
|   if command -v nvidia-smi; then |  | ||||||
|     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') |  | ||||||
|   elif command -v amd-smi; then |  | ||||||
|     declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}') |  | ||||||
|   elif command -v hl-smi; then |  | ||||||
|     declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//') |  | ||||||
|     arch_suffix='-hpu' |  | ||||||
|   fi |  | ||||||
|   echo "GPU type is $gpu_type" |  | ||||||
| } |  | ||||||
|  |  | ||||||
| check_cpus() { |  | ||||||
|   # check the number of CPUs and NUMA nodes, and set the GPU type for CPU runs. |  | ||||||
|   declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') |  | ||||||
|   if [[ $numa_count -gt 0 ]]; then |  | ||||||
|     echo "NUMA found." |  | ||||||
|     echo $numa_count |  | ||||||
|   else |  | ||||||
|     echo "Need at least 1 NUMA to run benchmarking." |  | ||||||
|     exit 1 |  | ||||||
|   fi |  | ||||||
|   declare -g gpu_type="cpu" |  | ||||||
|   echo "GPU type is $gpu_type" |  | ||||||
| } |  | ||||||
|  |  | ||||||
| check_hf_token() { |  | ||||||
|   # check if HF_TOKEN is available and valid |  | ||||||
|   if [[ -z "$HF_TOKEN" ]]; then |  | ||||||
|     echo "Error: HF_TOKEN is not set." |  | ||||||
|     exit 1 |  | ||||||
|   elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then |  | ||||||
|     echo "Error: HF_TOKEN does not start with 'hf_'." |  | ||||||
|     exit 1 |  | ||||||
|   else |  | ||||||
|     echo "HF_TOKEN is set and valid." |  | ||||||
|   fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| ensure_sharegpt_downloaded() { |  | ||||||
|   local FILE=ShareGPT_V3_unfiltered_cleaned_split.json |  | ||||||
|   if [ ! -f "$FILE" ]; then |  | ||||||
|     wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$FILE |  | ||||||
|   else |  | ||||||
|     echo "$FILE already exists." |  | ||||||
|   fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| json2args() { |  | ||||||
|   # transforms the JSON string to command line args, with '_' replaced by '-' |  | ||||||
|   # example: |  | ||||||
|   # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } |  | ||||||
|   # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 |  | ||||||
|   local json_string=$1 |  | ||||||
|   local args=$( |  | ||||||
|     echo "$json_string" | jq -r ' |  | ||||||
|       to_entries | |  | ||||||
|       map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | |  | ||||||
|       join(" ") |  | ||||||
|     ' |  | ||||||
|   ) |  | ||||||
|   echo "$args" |  | ||||||
| } |  | ||||||
|  |  | ||||||
| json2envs() { |  | ||||||
|   # transforms the JSON string to environment variables. |  | ||||||
|   # example: |  | ||||||
|   # input: { "VLLM_CPU_KVCACHE_SPACE": 5 } |  | ||||||
|   # output: VLLM_CPU_KVCACHE_SPACE=5 |  | ||||||
|   local json_string=$1 |  | ||||||
|   local args=$( |  | ||||||
|     echo "$json_string" | jq -r ' |  | ||||||
|       to_entries | |  | ||||||
|       map((.key ) + "=" + (.value | tostring)) | |  | ||||||
|       join(" ") |  | ||||||
|     ' |  | ||||||
|   ) |  | ||||||
|   echo "$args" |  | ||||||
| } |  | ||||||
|  |  | ||||||
| wait_for_server() { |  | ||||||
|   # wait for the vllm server to start |  | ||||||
|   # return 1 if the server does not come up within the timeout (e.g. it crashed) |  | ||||||
|   timeout 1200 bash -c ' |  | ||||||
|     until curl -X POST localhost:8000/v1/completions; do |  | ||||||
|       sleep 1 |  | ||||||
|     done' && return 0 || return 1 |  | ||||||
| } |  | ||||||
|  |  | ||||||
| kill_processes_launched_by_current_bash() { |  | ||||||
|   # Kill all python processes launched from current bash script |  | ||||||
|   current_shell_pid=$$ |  | ||||||
|   processes=$(ps -eo pid,ppid,command | awk -v ppid="$current_shell_pid" -v proc="$1" '$2 == ppid && $3 ~ proc {print $1}') |  | ||||||
|   if [ -n "$processes" ]; then |  | ||||||
|     echo "Killing the following processes matching '$1':" |  | ||||||
|     echo "$processes" |  | ||||||
|     echo "$processes" | xargs kill -9 |  | ||||||
|   else |  | ||||||
|     echo "No processes found matching '$1'." |  | ||||||
|   fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| kill_gpu_processes() { |  | ||||||
|  |  | ||||||
|   ps -aux |  | ||||||
|   lsof -t -i:8000 | xargs -r kill -9 |  | ||||||
|   pgrep python3 | xargs -r kill -9 |  | ||||||
|   # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 |  | ||||||
|   pgrep VLLM | xargs -r kill -9 |  | ||||||
|  |  | ||||||
|   # wait until GPU memory usage smaller than 1GB |  | ||||||
|   if command -v nvidia-smi; then |  | ||||||
|     while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do |  | ||||||
|       sleep 1 |  | ||||||
|     done |  | ||||||
|   elif command -v amd-smi; then |  | ||||||
|     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do |  | ||||||
|       sleep 1 |  | ||||||
|     done |  | ||||||
|   elif command -v hl-smi; then |  | ||||||
|     while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do |  | ||||||
|       sleep 1 |  | ||||||
|     done |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   # remove vllm config file |  | ||||||
|   rm -rf ~/.config/vllm |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| upload_to_buildkite() { |  | ||||||
|   # upload the benchmarking results to buildkite |  | ||||||
|  |  | ||||||
|   # if the agent binary is not found, skip uploading the results and return 0 |  | ||||||
|   # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent |  | ||||||
|   if command -v buildkite-agent >/dev/null 2>&1; then |  | ||||||
|     BUILDKITE_AGENT_COMMAND="buildkite-agent" |  | ||||||
|   elif [ -f /workspace/buildkite-agent ]; then |  | ||||||
|     BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent" |  | ||||||
|   else |  | ||||||
|     echo "buildkite-agent binary not found. Skip uploading the results." |  | ||||||
|     return 0 |  | ||||||
|   fi |  | ||||||
|  |  | ||||||
|   # Use the determined command to annotate and upload artifacts |  | ||||||
|   $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" |  | ||||||
|   $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" |  | ||||||
| } |  | ||||||
|  |  | ||||||
| run_latency_tests() { |  | ||||||
|   # run latency tests using `vllm bench latency` command |  | ||||||
|   # $1: a json file specifying latency test cases |  | ||||||
|  |  | ||||||
|   local latency_test_file |  | ||||||
|   latency_test_file=$1 |  | ||||||
|  |  | ||||||
|   # Iterate over latency tests |  | ||||||
|   jq -c '.[]' "$latency_test_file" | while read -r params; do |  | ||||||
|     # get the test name, and append the GPU type back to it. |  | ||||||
|     test_name=$(echo "$params" | jq -r '.test_name') |  | ||||||
|     if [[ ! "$test_name" =~ ^latency_ ]]; then |  | ||||||
|       echo "In latency-test.json, test_name must start with \"latency_\"." |  | ||||||
|       exit 1 |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     # if TEST_SELECTOR is set, only run the test cases that match the selector |  | ||||||
|     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then |  | ||||||
|       echo "Skip test case $test_name." |  | ||||||
|       continue |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     # get arguments |  | ||||||
|     latency_params=$(echo "$params" | jq -r '.parameters') |  | ||||||
|     latency_args=$(json2args "$latency_params") |  | ||||||
|     latency_environment_variables=$(echo "$params" | jq -r '.environment_variables') |  | ||||||
|     latency_envs=$(json2envs "$latency_environment_variables") |  | ||||||
|  |  | ||||||
|     # check if there are enough GPUs to run the test |  | ||||||
|     tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') |  | ||||||
|     if [ "$ON_CPU" == "1" ]; then |  | ||||||
|       pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size') |  | ||||||
|       world_size=$(($tp*$pp)) |  | ||||||
|       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then |  | ||||||
|         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." |  | ||||||
|         continue |  | ||||||
|       fi |  | ||||||
|     else |  | ||||||
|       if [[ $gpu_count -lt $tp ]]; then |  | ||||||
|         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." |  | ||||||
|         continue |  | ||||||
|       fi |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     latency_command=" $latency_envs vllm bench latency \ |  | ||||||
|       --output-json $RESULTS_FOLDER/${test_name}.json \ |  | ||||||
|       $latency_args" |  | ||||||
|  |  | ||||||
|     echo "Running test case $test_name" |  | ||||||
|     echo "Latency command: $latency_command" |  | ||||||
|  |  | ||||||
|     # record the benchmarking command and GPU type |  | ||||||
|     jq_output=$(jq -n \ |  | ||||||
|       --arg latency "$latency_command" \ |  | ||||||
|       --arg gpu "$gpu_type" \ |  | ||||||
|       '{ |  | ||||||
|         latency_command: $latency, |  | ||||||
|         gpu_type: $gpu |  | ||||||
|       }') |  | ||||||
|     echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" |  | ||||||
|  |  | ||||||
|     # run the benchmark |  | ||||||
|     eval "$latency_command" |  | ||||||
|  |  | ||||||
|     kill_gpu_processes |  | ||||||
|  |  | ||||||
|   done |  | ||||||
| } |  | ||||||
|  |  | ||||||
| run_throughput_tests() { |  | ||||||
|   # run throughput tests using `vllm bench throughput` |  | ||||||
|   # $1: a json file specifying throughput test cases |  | ||||||
|  |  | ||||||
|   local throughput_test_file |  | ||||||
|   throughput_test_file=$1 |  | ||||||
|  |  | ||||||
|   # Iterate over throughput tests |  | ||||||
|   jq -c '.[]' "$throughput_test_file" | while read -r params; do |  | ||||||
|     # get the test name, and append the GPU type back to it. |  | ||||||
|     test_name=$(echo "$params" | jq -r '.test_name') |  | ||||||
|     if [[ ! "$test_name" =~ ^throughput_ ]]; then |  | ||||||
|       echo "In throughput-test.json, test_name must start with \"throughput_\"." |  | ||||||
|       exit 1 |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     # if TEST_SELECTOR is set, only run the test cases that match the selector |  | ||||||
|     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then |  | ||||||
|       echo "Skip test case $test_name." |  | ||||||
|       continue |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     # get arguments |  | ||||||
|     throughput_params=$(echo "$params" | jq -r '.parameters') |  | ||||||
|     throughput_args=$(json2args "$throughput_params") |  | ||||||
|     throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables') |  | ||||||
|     throughput_envs=$(json2envs "$throughput_environment_variables") |  | ||||||
|  |  | ||||||
|     # check if there are enough GPUs to run the test |  | ||||||
|     tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') |  | ||||||
|     if [ "$ON_CPU" == "1" ]; then |  | ||||||
|       pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size') |  | ||||||
|       world_size=$(($tp*$pp)) |  | ||||||
|       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then |  | ||||||
|         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." |  | ||||||
|         continue |  | ||||||
|       fi |  | ||||||
|     else |  | ||||||
|       if [[ $gpu_count -lt $tp ]]; then |  | ||||||
|         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." |  | ||||||
|         continue |  | ||||||
|       fi |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     throughput_command=" $throughput_envs vllm bench throughput \ |  | ||||||
|       --output-json $RESULTS_FOLDER/${test_name}.json \ |  | ||||||
|       $throughput_args" |  | ||||||
|  |  | ||||||
|     echo "Running test case $test_name" |  | ||||||
|     echo "Throughput command: $throughput_command" |  | ||||||
|     # record the benchmarking command and GPU type |  | ||||||
|     jq_output=$(jq -n \ |  | ||||||
|       --arg command "$throughput_command" \ |  | ||||||
|       --arg gpu "$gpu_type" \ |  | ||||||
|       '{ |  | ||||||
|         throughput_command: $command, |  | ||||||
|         gpu_type: $gpu |  | ||||||
|       }') |  | ||||||
|     echo "$jq_output" >"$RESULTS_FOLDER/$test_name.commands" |  | ||||||
|  |  | ||||||
|     # run the benchmark |  | ||||||
|     eval "$throughput_command" |  | ||||||
|  |  | ||||||
|     kill_gpu_processes |  | ||||||
|  |  | ||||||
|   done |  | ||||||
| } |  | ||||||
|  |  | ||||||
| run_serving_tests() { |  | ||||||
|   # run serving tests using `vllm bench serve` command |  | ||||||
|   # $1: a json file specifying serving test cases |  | ||||||
|  |  | ||||||
|   local serving_test_file |  | ||||||
|   serving_test_file=$1 |  | ||||||
|  |  | ||||||
|   # Iterate over serving tests |  | ||||||
|   jq -c '.[]' "$serving_test_file" | while read -r params; do |  | ||||||
|     # get the test name, and append the GPU type back to it. |  | ||||||
|     test_name=$(echo "$params" | jq -r '.test_name') |  | ||||||
|     if [[ ! "$test_name" =~ ^serving_ ]]; then |  | ||||||
|       echo "In serving-test.json, test_name must start with \"serving_\"." |  | ||||||
|       exit 1 |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     # if TEST_SELECTOR is set, only run the test cases that match the selector |  | ||||||
|     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then |  | ||||||
|       echo "Skip test case $test_name." |  | ||||||
|       continue |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     # get client and server arguments |  | ||||||
|     server_params=$(echo "$params" | jq -r '.server_parameters') |  | ||||||
|     server_envs=$(echo "$params" | jq -r '.server_environment_variables') |  | ||||||
|     client_params=$(echo "$params" | jq -r '.client_parameters') |  | ||||||
|     server_args=$(json2args "$server_params") |  | ||||||
|     server_envs=$(json2envs "$server_envs") |  | ||||||
|     client_args=$(json2args "$client_params") |  | ||||||
|     qps_list=$(echo "$params" | jq -r '.qps_list') |  | ||||||
|     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') |  | ||||||
|     echo "Running over qps list $qps_list" |  | ||||||
|     max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') |  | ||||||
|     if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then |  | ||||||
|         num_prompts=$(echo "$client_params" | jq -r '.num_prompts') |  | ||||||
|         max_concurrency_list="[$num_prompts]" |  | ||||||
|     fi |  | ||||||
|     max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') |  | ||||||
|     echo "Running over max concurrency list $max_concurrency_list" |  | ||||||
|  |  | ||||||
|     # check if there are enough resources to run the test |  | ||||||
|     tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') |  | ||||||
|     if [ "$ON_CPU" == "1" ]; then |  | ||||||
|       pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size') |  | ||||||
|       world_size=$(($tp*$pp)) |  | ||||||
|       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then |  | ||||||
|         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." |  | ||||||
|         continue |  | ||||||
|       fi |  | ||||||
|     else |  | ||||||
|       if [[ $gpu_count -lt $tp ]]; then |  | ||||||
|         echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." |  | ||||||
|         continue |  | ||||||
|       fi |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     # check if the server model and client model are aligned |  | ||||||
|     server_model=$(echo "$server_params" | jq -r '.model') |  | ||||||
|     client_model=$(echo "$client_params" | jq -r '.model') |  | ||||||
|     if [[ $server_model != "$client_model" ]]; then |  | ||||||
|       echo "Server model and client model must be the same. Skip testcase $test_name." |  | ||||||
|       continue |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     server_command="$server_envs vllm serve \ |  | ||||||
|       $server_args" |  | ||||||
|  |  | ||||||
|     # run the server |  | ||||||
|     echo "Running test case $test_name" |  | ||||||
|     echo "Server command: $server_command" |  | ||||||
|     # support remote vllm server |  | ||||||
|     client_remote_args="" |  | ||||||
|     if [[ -z "${REMOTE_HOST}" ]]; then |  | ||||||
|       bash -c "$server_command" & |  | ||||||
|       server_pid=$! |  | ||||||
|       # wait until the server is alive |  | ||||||
|       if wait_for_server; then |  | ||||||
|         echo "" |  | ||||||
|         echo "vLLM server is up and running." |  | ||||||
|       else |  | ||||||
|         echo "" |  | ||||||
|         echo "vLLM failed to start within the timeout period." |  | ||||||
|       fi |  | ||||||
|     else |  | ||||||
|       server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" |  | ||||||
|       if [[ ${REMOTE_PORT} ]]; then |  | ||||||
|         client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT " |  | ||||||
|       else |  | ||||||
|         client_remote_args=" --host=$REMOTE_HOST " |  | ||||||
|       fi |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     # iterate over different QPS |  | ||||||
|     for qps in $qps_list; do |  | ||||||
|       # normalize the @sh-quoted "inf" value to a plain "inf" |  | ||||||
|       if [[ "$qps" == *"inf"* ]]; then |  | ||||||
|         echo "qps was $qps" |  | ||||||
|         qps="inf" |  | ||||||
|         echo "now qps is $qps" |  | ||||||
|       fi |  | ||||||
|  |  | ||||||
|       # iterate over different max_concurrency |  | ||||||
|       for max_concurrency in $max_concurrency_list; do |  | ||||||
|         new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency |  | ||||||
|         echo " new test name $new_test_name" |  | ||||||
|         # pass the tensor parallel size to the client so that it can be displayed |  | ||||||
|         # on the benchmark dashboard |  | ||||||
|         client_command="vllm bench serve \ |  | ||||||
|           --save-result \ |  | ||||||
|           --result-dir $RESULTS_FOLDER \ |  | ||||||
|           --result-filename ${new_test_name}.json \ |  | ||||||
|           --request-rate $qps \ |  | ||||||
|           --max-concurrency $max_concurrency \ |  | ||||||
|           --metadata "tensor_parallel_size=$tp" \ |  | ||||||
|           $client_args $client_remote_args " |  | ||||||
|  |  | ||||||
|         echo "Running test case $test_name with qps $qps" |  | ||||||
|         echo "Client command: $client_command" |  | ||||||
|  |  | ||||||
|         bash -c "$client_command" |  | ||||||
|  |  | ||||||
|         # record the benchmarking commands |  | ||||||
|         jq_output=$(jq -n \ |  | ||||||
|           --arg server "$server_command" \ |  | ||||||
|           --arg client "$client_command" \ |  | ||||||
|           --arg gpu "$gpu_type" \ |  | ||||||
|           '{ |  | ||||||
|             server_command: $server, |  | ||||||
|             client_command: $client, |  | ||||||
|             gpu_type: $gpu |  | ||||||
|           }') |  | ||||||
|         echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" |  | ||||||
|  |  | ||||||
|       done |  | ||||||
|     done |  | ||||||
|  |  | ||||||
|     # clean up: kill the locally started server (if any) and any leftover GPU processes |  | ||||||
|     [[ -n "$server_pid" ]] && kill -9 $server_pid |  | ||||||
|     kill_gpu_processes |  | ||||||
|   done |  | ||||||
| } |  | ||||||
|  |  | ||||||
| main() { |  | ||||||
|   local ARCH |  | ||||||
|   ARCH='' |  | ||||||
|   if [ "$ON_CPU" == "1" ];then |  | ||||||
|      check_cpus |  | ||||||
|      ARCH='-cpu' |  | ||||||
|   else |  | ||||||
|      check_gpus |  | ||||||
|      ARCH="$arch_suffix" |  | ||||||
|   fi |  | ||||||
|   check_hf_token |  | ||||||
|  |  | ||||||
|   # dependencies |  | ||||||
|   (which wget && which curl) || (apt-get update && apt-get install -y wget curl) |  | ||||||
|   (which jq) || (apt-get update && apt-get -y install jq) |  | ||||||
|   (which lsof) || (apt-get update && apt-get install -y lsof) |  | ||||||
|  |  | ||||||
|   # get the current IP address, required by `vllm bench serve` command |  | ||||||
|   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') |  | ||||||
|   # turn off per-request status reporting to keep the terminal output clean |  | ||||||
|   export VLLM_LOGGING_LEVEL="WARNING" |  | ||||||
|  |  | ||||||
|   # prepare for benchmarking |  | ||||||
|   cd benchmarks || exit 1 |  | ||||||
|   ensure_sharegpt_downloaded |  | ||||||
|   declare -g RESULTS_FOLDER=results/ |  | ||||||
|   mkdir -p $RESULTS_FOLDER |  | ||||||
|   QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/ |  | ||||||
|  |  | ||||||
|   # dump vllm info via vllm collect-env |  | ||||||
|   env_output=$(vllm collect-env) |  | ||||||
|  |  | ||||||
|   echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" |  | ||||||
|  |  | ||||||
|   # benchmarking |  | ||||||
|   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" |  | ||||||
|   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" |  | ||||||
|   run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" |  | ||||||
|  |  | ||||||
|   # postprocess benchmarking results |  | ||||||
|   pip install tabulate pandas |  | ||||||
|   python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py |  | ||||||
|  |  | ||||||
|   upload_to_buildkite |  | ||||||
| } |  | ||||||
|  |  | ||||||
| main "$@" |  | ||||||
| @ -1,21 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "llama8B_tp1_genai_perf", |  | ||||||
|         "qps_list": [4,8,16,32], |  | ||||||
|         "common_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3-8B-Instruct", |  | ||||||
|             "tp": 1, |  | ||||||
|             "port": 8000, |  | ||||||
|             "num_prompts": 500, |  | ||||||
|             "reuse_server": false |  | ||||||
|         }, |  | ||||||
|         "vllm_server_parameters": { |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "gpu_memory_utilization": 0.9, |  | ||||||
|             "max_num_seqs": 512, |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "genai_perf_input_parameters": { |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
| @ -1,26 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "latency_llama8B_tp2", |  | ||||||
|         "environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "num_iters_warmup": 5, |  | ||||||
|             "num_iters": 15 |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
| @ -1,55 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "latency_llama8B_tp1", |  | ||||||
|         "environment_variables": { |  | ||||||
|             "PT_HPU_LAZY_MODE": 1, |  | ||||||
|             "VLLM_CONTIGUOUS_PA": 1, |  | ||||||
|             "VLLM_DEFRAG": 1 |  | ||||||
|         }, |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "num-iters-warmup": 5, |  | ||||||
|             "num-iters": 15, |  | ||||||
|             "max-model-len": 256, |  | ||||||
|             "async-scheduling": "" |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "latency_llama70B_tp4", |  | ||||||
|         "environment_variables": { |  | ||||||
|             "PT_HPU_LAZY_MODE": 1, |  | ||||||
|             "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, |  | ||||||
|             "VLLM_CONTIGUOUS_PA": 1, |  | ||||||
|             "VLLM_DEFRAG": 1 |  | ||||||
|         }, |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "num-iters-warmup": 5, |  | ||||||
|             "num-iters": 15, |  | ||||||
|             "max-model-len": 256, |  | ||||||
|             "async-scheduling": "" |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "latency_mixtral8x7B_tp2", |  | ||||||
|         "environment_variables": { |  | ||||||
|             "PT_HPU_LAZY_MODE": 1, |  | ||||||
|             "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, |  | ||||||
|             "VLLM_CONTIGUOUS_PA": 1, |  | ||||||
|             "VLLM_DEFRAG": 1 |  | ||||||
|         }, |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "num-iters-warmup": 5, |  | ||||||
|             "num-iters": 15, |  | ||||||
|             "max-model-len": 256, |  | ||||||
|             "async-scheduling": "" |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
| @ -1,32 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "latency_llama8B_tp1", |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "num_iters_warmup": 5, |  | ||||||
|             "num_iters": 15 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "latency_llama70B_tp4", |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "num-iters-warmup": 5, |  | ||||||
|             "num-iters": 15 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "latency_mixtral8x7B_tp2", |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "num-iters-warmup": 5, |  | ||||||
|             "num-iters": 15 |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
| @ -1,311 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "llama8B_tp1_sharegpt", |  | ||||||
|         "qps_list": [4,8,16,32,"inf"], |  | ||||||
|         "common_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3-8B-Instruct", |  | ||||||
|             "tp": 1, |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 500, |  | ||||||
|             "port": 8000, |  | ||||||
|             "reuse_server": false |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_server_parameters": { |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_server_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_client_parameters": { |  | ||||||
|             "endpoint": "/generate_stream" |  | ||||||
|         }, |  | ||||||
|         "trt_server_parameters": { |  | ||||||
|             "model_type": "llama", |  | ||||||
|             "model_dtype": "bfloat16", |  | ||||||
|             "max_batch_size": 2048, |  | ||||||
|             "max_input_len": 4096, |  | ||||||
|             "max_seq_len": 6144, |  | ||||||
|             "max_num_tokens": 16384, |  | ||||||
|             "trt_llm_version": "v0.11.0" |  | ||||||
|         }, |  | ||||||
|         "trt_client_parameters": { |  | ||||||
|             "endpoint": "/v2/models/ensemble/generate_stream" |  | ||||||
|         },  |  | ||||||
|         "vllm_server_parameters": { |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "gpu_memory_utilization": 0.9, |  | ||||||
|             "max_num_seqs": 512, |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "vllm_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "sglang_server_parameters": { |  | ||||||
|             "disable_radix_cache": "", |  | ||||||
|             "enable_torch_compile": "", |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "sglang_client_parameters": { |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "llama8B_tp1_sonnet_512_16", |  | ||||||
|         "qps_list": [4,8,16,32,"inf"], |  | ||||||
|         "common_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3-8B-Instruct", |  | ||||||
|             "tp": 1, |  | ||||||
|             "dataset_name": "sonnet", |  | ||||||
|             "dataset_path": "./sonnet_4x.txt", |  | ||||||
|             "num_prompts": 500, |  | ||||||
|             "port": 8000, |  | ||||||
|             "sonnet_input_len": 512, |  | ||||||
|             "sonnet_output_len": 16, |  | ||||||
|             "sonnet_prefix_len": 50, |  | ||||||
|             "reuse_server": true |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_server_parameters": { |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_server_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_client_parameters": { |  | ||||||
|             "endpoint": "/generate_stream" |  | ||||||
|         }, |  | ||||||
|         "trt_server_parameters": { |  | ||||||
|             "model_type": "llama", |  | ||||||
|             "model_dtype": "bfloat16", |  | ||||||
|             "max_batch_size": 2048, |  | ||||||
|             "max_input_len": 4096, |  | ||||||
|             "max_seq_len": 6144, |  | ||||||
|             "max_num_tokens": 16384, |  | ||||||
|             "trt_llm_version": "v0.11.0" |  | ||||||
|         }, |  | ||||||
|         "trt_client_parameters": { |  | ||||||
|             "endpoint": "/v2/models/ensemble/generate_stream" |  | ||||||
|         },  |  | ||||||
|         "vllm_server_parameters": { |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "gpu_memory_utilization": 0.9, |  | ||||||
|             "max_num_seqs": 512, |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "vllm_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "sglang_server_parameters": { |  | ||||||
|             "disable_radix_cache": "", |  | ||||||
|             "enable_torch_compile": "", |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "sglang_client_parameters": { |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "llama8B_tp1_sonnet_512_256", |  | ||||||
|         "qps_list": [4,8,16,32,"inf"], |  | ||||||
|         "common_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3-8B-Instruct", |  | ||||||
|             "tp": 1, |  | ||||||
|             "dataset_name": "sonnet", |  | ||||||
|             "dataset_path": "./sonnet_4x.txt", |  | ||||||
|             "num_prompts": 500, |  | ||||||
|             "port": 8000, |  | ||||||
|             "sonnet_input_len": 512, |  | ||||||
|             "sonnet_output_len": 256, |  | ||||||
|             "sonnet_prefix_len": 50, |  | ||||||
|             "reuse_server": true |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_server_parameters": { |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_server_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_client_parameters": { |  | ||||||
|             "endpoint": "/generate_stream" |  | ||||||
|         }, |  | ||||||
|         "trt_server_parameters": { |  | ||||||
|             "model_type": "llama", |  | ||||||
|             "model_dtype": "bfloat16", |  | ||||||
|             "max_batch_size": 2048, |  | ||||||
|             "max_input_len": 4096, |  | ||||||
|             "max_seq_len": 6144, |  | ||||||
|             "max_num_tokens": 16384, |  | ||||||
|             "trt_llm_version": "v0.11.0" |  | ||||||
|         }, |  | ||||||
|         "trt_client_parameters": { |  | ||||||
|             "endpoint": "/v2/models/ensemble/generate_stream" |  | ||||||
|         },  |  | ||||||
|         "vllm_server_parameters": { |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "gpu_memory_utilization": 0.9, |  | ||||||
|             "max_num_seqs": 512, |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "vllm_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "sglang_server_parameters": { |  | ||||||
|             "disable_radix_cache": "", |  | ||||||
|             "enable_torch_compile": "", |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "sglang_client_parameters": { |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "llama70B_tp4_sharegpt", |  | ||||||
|         "qps_list": [4,8,16,32,"inf"], |  | ||||||
|         "common_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3-70B-Instruct", |  | ||||||
|             "tp": 4, |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 500, |  | ||||||
|             "port": 8000, |  | ||||||
|             "reuse_server": false |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_server_parameters": { |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_server_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_client_parameters": { |  | ||||||
|             "endpoint": "/generate_stream" |  | ||||||
|         }, |  | ||||||
|         "trt_server_parameters": { |  | ||||||
|             "model_type": "llama", |  | ||||||
|             "model_dtype": "bfloat16", |  | ||||||
|             "max_batch_size": 2048, |  | ||||||
|             "max_input_len": 4096, |  | ||||||
|             "max_seq_len": 6144, |  | ||||||
|             "max_num_tokens": 16384, |  | ||||||
|             "trt_llm_version": "v0.11.0" |  | ||||||
|         }, |  | ||||||
|         "trt_client_parameters": { |  | ||||||
|             "endpoint": "/v2/models/ensemble/generate_stream" |  | ||||||
|         },  |  | ||||||
|         "vllm_server_parameters": { |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "gpu_memory_utilization": 0.9, |  | ||||||
|             "max_num_seqs": 512, |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "vllm_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "sglang_server_parameters": { |  | ||||||
|             "disable_radix_cache": "", |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "sglang_client_parameters": { |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "llama70B_tp4_sonnet_512_16", |  | ||||||
|         "qps_list": [4,8,16,32,"inf"], |  | ||||||
|         "common_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3-70B-Instruct", |  | ||||||
|             "tp": 4, |  | ||||||
|             "dataset_name": "sonnet", |  | ||||||
|             "dataset_path": "./sonnet_4x.txt", |  | ||||||
|             "num_prompts": 500, |  | ||||||
|             "port": 8000, |  | ||||||
|             "sonnet_input_len": 512, |  | ||||||
|             "sonnet_output_len": 16, |  | ||||||
|             "sonnet_prefix_len": 50, |  | ||||||
|             "reuse_server": true |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_server_parameters": { |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_server_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_client_parameters": { |  | ||||||
|             "endpoint": "/generate_stream" |  | ||||||
|         }, |  | ||||||
|         "trt_server_parameters": { |  | ||||||
|             "model_type": "llama", |  | ||||||
|             "model_dtype": "bfloat16", |  | ||||||
|             "max_batch_size": 2048, |  | ||||||
|             "max_input_len": 4096, |  | ||||||
|             "max_seq_len": 6144, |  | ||||||
|             "max_num_tokens": 16384, |  | ||||||
|             "trt_llm_version": "v0.11.0" |  | ||||||
|         }, |  | ||||||
|         "trt_client_parameters": { |  | ||||||
|             "endpoint": "/v2/models/ensemble/generate_stream" |  | ||||||
|         },  |  | ||||||
|         "vllm_server_parameters": { |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "gpu_memory_utilization": 0.9, |  | ||||||
|             "max_num_seqs": 512, |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "vllm_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "sglang_server_parameters": { |  | ||||||
|             "disable_radix_cache": "", |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "sglang_client_parameters": { |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "llama70B_tp4_sonnet_512_256", |  | ||||||
|         "qps_list": [4,8,16,32,"inf"], |  | ||||||
|         "common_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3-70B-Instruct", |  | ||||||
|             "tp": 4, |  | ||||||
|             "dataset_name": "sonnet", |  | ||||||
|             "dataset_path": "./sonnet_4x.txt", |  | ||||||
|             "num_prompts": 500, |  | ||||||
|             "port": 8000, |  | ||||||
|             "sonnet_input_len": 512, |  | ||||||
|             "sonnet_output_len": 256, |  | ||||||
|             "sonnet_prefix_len": 50, |  | ||||||
|             "reuse_server": true |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_server_parameters": { |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "lmdeploy_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_server_parameters": { |  | ||||||
|         }, |  | ||||||
|         "tgi_client_parameters": { |  | ||||||
|             "endpoint": "/generate_stream" |  | ||||||
|         }, |  | ||||||
|         "trt_server_parameters": { |  | ||||||
|             "model_type": "llama", |  | ||||||
|             "model_dtype": "bfloat16", |  | ||||||
|             "max_batch_size": 2048, |  | ||||||
|             "max_input_len": 4096, |  | ||||||
|             "max_seq_len": 6144, |  | ||||||
|             "max_num_tokens": 16384, |  | ||||||
|             "trt_llm_version": "v0.11.0" |  | ||||||
|         }, |  | ||||||
|         "trt_client_parameters": { |  | ||||||
|             "endpoint": "/v2/models/ensemble/generate_stream" |  | ||||||
|         },  |  | ||||||
|         "vllm_server_parameters": { |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "gpu_memory_utilization": 0.9, |  | ||||||
|             "max_num_seqs": 512, |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "vllm_client_parameters": { |  | ||||||
|         }, |  | ||||||
|         "sglang_server_parameters": { |  | ||||||
|             "disable_radix_cache": "", |  | ||||||
|             "dtype": "bfloat16" |  | ||||||
|         }, |  | ||||||
|         "sglang_client_parameters": { |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
| @ -1,610 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_bf16_tp1_sharegpt", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_bf16_tp2_sharegpt", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_bf16_tp4_sharegpt", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_bf16_tp1_random_128_128", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 1000 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_bf16_tp2_random_128_128", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 1000 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_bf16_tp4_random_128_128", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
|             "num_prompts": 1000 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int8_tp1_sharegpt", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int8_tp2_sharegpt", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int8_tp4_sharegpt", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int8_tp1_random_128_128", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 1000 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int8_tp2_random_128_128", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 1000 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int8_tp4_random_128_128", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 1000 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int4_tp1_sharegpt", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
| 	    "quantization": "awq", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int4_tp2_sharegpt", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
| 	    "quantization": "awq", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int4_tp4_sharegpt", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
| 	    "quantization": "awq", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int4_tp1_random_128_128", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
| 	    "quantization": "awq", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 1000 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int4_tp2_random_128_128", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
| 	    "quantization": "awq", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 1000 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_int4_tp4_random_128_128", |  | ||||||
|         "qps_list": ["inf"], |  | ||||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
| 	    "quantization": "awq", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 1000 |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
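For illustration only: the runner that consumes these test entries is not shown in this diff, but the minimal sketch below suggests how a "server_parameters" block might be translated into a vllm serve command line. The file name, the underscore-to-hyphen flag conversion, and the treatment of empty-string values as boolean switches are assumptions for this sketch, not the project's actual implementation.

import json
import shlex

# Hypothetical file name; the real test file path is not shown in this diff.
with open("serving-tests.json") as f:
    test = json.load(f)[0]

params = dict(test["server_parameters"])
model = params.pop("model")
args = ["vllm", "serve", model]
for key, value in params.items():
    flag = "--" + key.replace("_", "-")   # assumed key-to-flag convention
    if value == "":
        args.append(flag)                 # assumed: empty string marks a boolean switch
    else:
        args.extend([flag, str(value)])

print(shlex.join(args))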
										
											
File diff suppressed because it is too large
							| @ -1,276 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp1_sharegpt", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "max_concurrency_list": [32], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 32 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp2_sharegpt", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "max_concurrency_list": [32], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 32 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp1_random_128_128", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "max_concurrency_list": [32], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 32 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp2_random_128_128", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "max_concurrency_list": [32], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 32 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp1_random_128_2048", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "max_concurrency_list": [32], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 2048, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 32 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp2_random_128_2048", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "max_concurrency_list": [32], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 128, |  | ||||||
| 	    "random-output-len": 2048, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 32 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp1_random_2048_128", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "max_concurrency_list": [32], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 2048, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 32 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp2_random_2048_128", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "max_concurrency_list": [32], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
| 	    "enable_chunked_prefill": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "random", |  | ||||||
| 	    "random-input-len": 2048, |  | ||||||
| 	    "random-output-len": 128, |  | ||||||
| 	    "ignore-eos": "", |  | ||||||
|             "num_prompts": 32 |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
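As a rough illustration of the sweep each entry above implies (qps_list of [1, 4, 16, "inf"] crossed with max_concurrency_list of [32]), the sketch below expands the combinations into individual client runs; the actual runner may iterate differently.

from itertools import product

qps_list = [1, 4, 16, "inf"]
max_concurrency_list = [32]

# 4 QPS values x 1 concurrency value = 4 client runs per test entry
for qps, concurrency in product(qps_list, max_concurrency_list):
    print(f"request-rate={qps} max-concurrency={concurrency}")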
| @ -1,82 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp1_sharegpt", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "PT_HPU_LAZY_MODE": 1, |  | ||||||
|             "VLLM_CONTIGUOUS_PA": 1, |  | ||||||
|             "VLLM_DEFRAG": 1 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
|             "swap_space": 16, |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "max-model-len": 2048, |  | ||||||
|             "max-num-seqs": 256, |  | ||||||
|             "async-scheduling": "" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama70B_tp4_sharegpt", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "PT_HPU_LAZY_MODE": 1, |  | ||||||
|             "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, |  | ||||||
|             "VLLM_CONTIGUOUS_PA": 1, |  | ||||||
|             "VLLM_DEFRAG": 1 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
|             "swap_space": 16, |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "max-model-len": 2048, |  | ||||||
|             "max-num-seqs": 256, |  | ||||||
|             "async-scheduling": "" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_mixtral8x7B_tp2_sharegpt", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "server_environment_variables": { |  | ||||||
|             "PT_HPU_LAZY_MODE": 1, |  | ||||||
|             "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, |  | ||||||
|             "VLLM_CONTIGUOUS_PA": 1, |  | ||||||
|             "VLLM_DEFRAG": 1 |  | ||||||
|         }, |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
|             "swap_space": 16, |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "max-model-len": 2048, |  | ||||||
|             "max-num-seqs": 256, |  | ||||||
|             "async-scheduling": "" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
| @ -1,77 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama8B_tp1_sharegpt", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
|             "swap_space": 16, |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama70B_tp4_sharegpt", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
|             "swap_space": 16, |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_mixtral8x7B_tp2_sharegpt", |  | ||||||
|         "qps_list": [1, 4, 16, "inf"], |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
|             "swap_space": 16, |  | ||||||
|             "disable_log_stats": "", |  | ||||||
|             "load_format": "dummy" |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200 |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "serving_llama70B_tp4_sharegpt_specdecode", |  | ||||||
|         "qps_list": [2], |  | ||||||
|         "server_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",  |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
|             "swap_space": 16, |  | ||||||
|             "speculative_config": { |  | ||||||
|                 "model": "turboderp/Qwama-0.5B-Instruct", |  | ||||||
|                 "num_speculative_tokens": 4, |  | ||||||
|                 "draft_tensor_parallel_size": 1 |  | ||||||
|             } |  | ||||||
|         }, |  | ||||||
|         "client_parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "dataset_name": "sharegpt", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200  |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
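Unlike the flat server parameters, "speculative_config" in the last entry above is a nested object. As a hedged sketch only, such a value would typically be serialized to a JSON string before being passed on a command line; the flag name used below is an assumption for illustration, not a confirmed vLLM option.

import json

speculative_config = {
    "model": "turboderp/Qwama-0.5B-Instruct",
    "num_speculative_tokens": 4,
    "draft_tensor_parallel_size": 1,
}

# Assumed flag name, for illustration only.
cli_args = ["--speculative-config", json.dumps(speculative_config)]
print(cli_args)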
| @ -1,27 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "throughput_llama8B_tp2", |  | ||||||
|         "environment_variables": { |  | ||||||
|             "VLLM_RPC_TIMEOUT": 100000, |  | ||||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |  | ||||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |  | ||||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, |  | ||||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 |  | ||||||
|         }, |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
| 	    "dtype": "bfloat16", |  | ||||||
| 	    "distributed_executor_backend": "mp", |  | ||||||
| 	    "block_size": 128, |  | ||||||
| 	    "trust_remote_code": "", |  | ||||||
|             "disable_log_stats": "", |  | ||||||
| 	    "enforce_eager": "", |  | ||||||
| 	    "max_num_batched_tokens": 2048, |  | ||||||
| 	    "max_num_seqs": 256, |  | ||||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200, |  | ||||||
|             "backend": "vllm" |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
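For illustration, the sketch below shows one way the "environment_variables" block of a throughput entry could be applied to the environment of a benchmark subprocess; the command is a placeholder and not the project's actual runner.

import os
import subprocess

environment_variables = {
    "VLLM_RPC_TIMEOUT": 100000,
    "VLLM_CPU_KVCACHE_SPACE": 40,
}

# Environment values must be strings when passed to a child process.
env = dict(os.environ)
env.update({k: str(v) for k, v in environment_variables.items()})

# Placeholder command; the real runner would build the full flag list from "parameters".
subprocess.run(["vllm", "bench", "throughput", "--help"], env=env, check=False)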
| @ -1,61 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "throughput_llama8B_tp1", |  | ||||||
|         "environment_variables": { |  | ||||||
|             "PT_HPU_LAZY_MODE": 1, |  | ||||||
|             "VLLM_CONTIGUOUS_PA": 1, |  | ||||||
|             "VLLM_DEFRAG": 1 |  | ||||||
|         }, |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 1000, |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "max-model-len": 2048, |  | ||||||
|             "max-num-seqs": 512, |  | ||||||
|             "async-scheduling": "" |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "throughput_llama70B_tp4", |  | ||||||
|         "environment_variables": { |  | ||||||
|             "PT_HPU_LAZY_MODE": 1, |  | ||||||
|             "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, |  | ||||||
|             "VLLM_CONTIGUOUS_PA": 1, |  | ||||||
|             "VLLM_DEFRAG": 1 |  | ||||||
|         }, |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 1000, |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "max-model-len": 2048, |  | ||||||
|             "max-num-seqs": 512, |  | ||||||
|             "async-scheduling": "" |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "throughput_mixtral8x7B_tp2", |  | ||||||
|         "environment_variables": { |  | ||||||
|             "PT_HPU_LAZY_MODE": 1, |  | ||||||
|             "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, |  | ||||||
|             "VLLM_CONTIGUOUS_PA": 1, |  | ||||||
|             "VLLM_DEFRAG": 1 |  | ||||||
|         }, |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 1000, |  | ||||||
|             "backend": "vllm", |  | ||||||
|             "max-model-len": 2048, |  | ||||||
|             "max-num-seqs": 512, |  | ||||||
|             "async-scheduling": "" |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
| @ -1,35 +0,0 @@ | |||||||
| [ |  | ||||||
|     { |  | ||||||
|         "test_name": "throughput_llama8B_tp1", |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", |  | ||||||
|             "tensor_parallel_size": 1, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200, |  | ||||||
|             "backend": "vllm" |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "throughput_llama70B_tp4", |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", |  | ||||||
|             "tensor_parallel_size": 4, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200, |  | ||||||
|             "backend": "vllm" |  | ||||||
|         } |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         "test_name": "throughput_mixtral8x7B_tp2", |  | ||||||
|         "parameters": { |  | ||||||
|             "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", |  | ||||||
|             "tensor_parallel_size": 2, |  | ||||||
|             "load_format": "dummy", |  | ||||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", |  | ||||||
|             "num_prompts": 200, |  | ||||||
|             "backend": "vllm" |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| ] |  | ||||||
| @ -1,201 +0,0 @@ | |||||||
| steps: |  | ||||||
|   # aarch64 + CUDA builds |  | ||||||
|   - label: "Build arm64 wheel - CUDA 12.9" |  | ||||||
|     depends_on: ~ |  | ||||||
|     id: build-wheel-arm64-cuda-12-9 |  | ||||||
|     agents: |  | ||||||
|       queue: arm64_cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: |  | ||||||
|       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." |  | ||||||
|       - "mkdir artifacts" |  | ||||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" |  | ||||||
|       - "bash .buildkite/scripts/upload-wheels.sh" |  | ||||||
|     env: |  | ||||||
|       DOCKER_BUILDKIT: "1" |  | ||||||
|  |  | ||||||
|   # aarch64 build |  | ||||||
|   - label: "Build arm64 CPU wheel" |  | ||||||
|     depends_on: ~ |  | ||||||
|     id: build-wheel-arm64-cpu |  | ||||||
|     agents: |  | ||||||
|       queue: arm64_cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ." |  | ||||||
|       - "mkdir artifacts" |  | ||||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" |  | ||||||
|       - "bash .buildkite/scripts/upload-wheels.sh" |  | ||||||
|     env: |  | ||||||
|       DOCKER_BUILDKIT: "1" |  | ||||||
|  |  | ||||||
|   # x86 + CUDA builds |  | ||||||
|   - label: "Build wheel - CUDA 12.8" |  | ||||||
|     depends_on: ~ |  | ||||||
|     id: build-wheel-cuda-12-8 |  | ||||||
|     agents: |  | ||||||
|       queue: cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." |  | ||||||
|       - "mkdir artifacts" |  | ||||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" |  | ||||||
|       - "bash .buildkite/scripts/upload-wheels.sh" |  | ||||||
|     env: |  | ||||||
|       DOCKER_BUILDKIT: "1" |  | ||||||
|  |  | ||||||
|   - label: "Build wheel - CUDA 12.9" |  | ||||||
|     depends_on: ~ |  | ||||||
|     id: build-wheel-cuda-12-9 |  | ||||||
|     agents: |  | ||||||
|       queue: cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." |  | ||||||
|       - "mkdir artifacts" |  | ||||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" |  | ||||||
|       - "bash .buildkite/scripts/upload-wheels.sh" |  | ||||||
|     env: |  | ||||||
|       DOCKER_BUILDKIT: "1" |  | ||||||
|  |  | ||||||
|   - label: "Build wheel - CUDA 13.0" |  | ||||||
|     depends_on: ~ |  | ||||||
|     id: build-wheel-cuda-13-0 |  | ||||||
|     agents: |  | ||||||
|       queue: cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." |  | ||||||
|       - "mkdir artifacts" |  | ||||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" |  | ||||||
|       - "bash .buildkite/scripts/upload-wheels.sh" |  | ||||||
|     env: |  | ||||||
|       DOCKER_BUILDKIT: "1" |  | ||||||
|  |  | ||||||
|   # Build release images (12.9) |  | ||||||
|   - label: "Build release image (x86)" |  | ||||||
|     depends_on: ~ |  | ||||||
|     id: build-release-image-x86 |  | ||||||
|     agents: |  | ||||||
|       queue: cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." |  | ||||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" |  | ||||||
|       # re-tag to default image tag and push, just in case arm64 build fails |  | ||||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" |  | ||||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" |  | ||||||
|  |  | ||||||
|   - label: "Build release image (arm64)" |  | ||||||
|     depends_on: ~ |  | ||||||
|     id: build-release-image-arm64 |  | ||||||
|     agents: |  | ||||||
|       queue: arm64_cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." |  | ||||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" |  | ||||||
|  |  | ||||||
|   # Add job to create multi-arch manifest |  | ||||||
|   - label: "Create multi-arch manifest" |  | ||||||
|     depends_on: |  | ||||||
|       - build-release-image-x86 |  | ||||||
|       - build-release-image-arm64 |  | ||||||
|     id: create-multi-arch-manifest |  | ||||||
|     agents: |  | ||||||
|       queue: cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" |  | ||||||
|       - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend" |  | ||||||
|       - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" |  | ||||||
|  |  | ||||||
|   - label: "Annotate release workflow" |  | ||||||
|     depends_on: |  | ||||||
|       - create-multi-arch-manifest |  | ||||||
|       - build-wheel-cuda-12-8 |  | ||||||
|     id: annotate-release-workflow |  | ||||||
|     agents: |  | ||||||
|       queue: cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "bash .buildkite/scripts/annotate-release.sh" |  | ||||||
|  |  | ||||||
|   - label: "Build and publish TPU release image" |  | ||||||
|     depends_on: ~ |  | ||||||
|     if: build.env("NIGHTLY") == "1" |  | ||||||
|     agents: |  | ||||||
|       queue: tpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "yes | docker system prune -a" |  | ||||||
|       - "git fetch --all" |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." |  | ||||||
|       - "docker push vllm/vllm-tpu:nightly" |  | ||||||
|       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" |  | ||||||
|     plugins: |  | ||||||
|       - docker-login#v3.0.0: |  | ||||||
|           username: vllmbot |  | ||||||
|           password-env: DOCKERHUB_TOKEN |  | ||||||
|     env: |  | ||||||
|       DOCKER_BUILDKIT: "1" |  | ||||||
|  |  | ||||||
|   - input: "Provide Release version here" |  | ||||||
|     id: input-release-version |  | ||||||
|     fields: |  | ||||||
|       - text: "What is the release version?" |  | ||||||
|         key: release-version |  | ||||||
|  |  | ||||||
|   - block: "Build CPU release image" |  | ||||||
|     key: block-cpu-release-image-build |  | ||||||
|     depends_on: ~ |  | ||||||
|  |  | ||||||
|   - label: "Build and publish CPU release image" |  | ||||||
|     depends_on: block-cpu-release-image-build |  | ||||||
|     agents: |  | ||||||
|       queue: cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." |  | ||||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" |  | ||||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" |  | ||||||
|     env: |  | ||||||
|       DOCKER_BUILDKIT: "1" |  | ||||||
|  |  | ||||||
|   - block: "Build arm64 CPU release image" |  | ||||||
|     key: block-arm64-cpu-release-image-build |  | ||||||
|     depends_on: ~ |  | ||||||
|  |  | ||||||
|   - label: "Build and publish arm64 CPU release image" |  | ||||||
|     depends_on: block-arm64-cpu-release-image-build |  | ||||||
|     agents: |  | ||||||
|       queue: arm64_cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" |  | ||||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." |  | ||||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest" |  | ||||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)" |  | ||||||
|     env: |  | ||||||
|       DOCKER_BUILDKIT: "1" |  | ||||||
|  |  | ||||||
|   - label: "Build and publish nightly multi-arch image to DockerHub" |  | ||||||
|     depends_on: |  | ||||||
|       - create-multi-arch-manifest |  | ||||||
|     if: build.env("NIGHTLY") == "1" |  | ||||||
|     agents: |  | ||||||
|       queue: cpu_queue_postmerge |  | ||||||
|     commands: |  | ||||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" |  | ||||||
|       - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64" |  | ||||||
|       - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64" |  | ||||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64" |  | ||||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64" |  | ||||||
|       - "docker push vllm/vllm-openai:nightly-x86_64" |  | ||||||
|       - "docker push vllm/vllm-openai:nightly-aarch64" |  | ||||||
|       - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" |  | ||||||
|       - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" |  | ||||||
|       - "docker manifest push vllm/vllm-openai:nightly" |  | ||||||
|       - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" |  | ||||||
|       # Clean up old nightly builds (keep only last 14) |  | ||||||
|       - "bash .buildkite/scripts/cleanup-nightly-builds.sh" |  | ||||||
|     plugins: |  | ||||||
|       - docker-login#v3.0.0: |  | ||||||
|           username: vllmbot |  | ||||||
|           password-env: DOCKERHUB_TOKEN |  | ||||||
|     env: |  | ||||||
|       DOCKER_BUILDKIT: "1" |  | ||||||
|       DOCKERHUB_USERNAME: "vllmbot" |  | ||||||
							
								
								
									
.buildkite/run-amd-test.sh (73 lines, Normal file)
							| @ -0,0 +1,73 @@ | |||||||
|  | # This script runs tests inside the corresponding ROCm docker container. | ||||||
|  | set -ex | ||||||
|  |  | ||||||
|  | # Print ROCm version | ||||||
|  | echo "--- ROCm info" | ||||||
|  | rocminfo | ||||||
|  |  | ||||||
|  | # cleanup older docker images | ||||||
|  | cleanup_docker() { | ||||||
|  |   # Get Docker's root directory | ||||||
|  |   docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||||
|  |   if [ -z "$docker_root" ]; then | ||||||
|  |     echo "Failed to determine Docker root directory." | ||||||
|  |     exit 1 | ||||||
|  |   fi | ||||||
|  |   echo "Docker root directory: $docker_root" | ||||||
|  |   # Check disk usage of the filesystem where Docker's root directory is located | ||||||
|  |   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||||
|  |   # Define the threshold | ||||||
|  |   threshold=70 | ||||||
|  |   if [ "$disk_usage" -gt "$threshold" ]; then | ||||||
|  |     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||||
|  |     # Remove dangling images (those that are not tagged and not used by any container) | ||||||
|  |     docker image prune -f | ||||||
|  |     # Remove unused volumes | ||||||
|  |     docker volume prune -f | ||||||
|  |     echo "Docker images and volumes cleanup completed." | ||||||
|  |   else | ||||||
|  |     echo "Disk usage is below $threshold%. No cleanup needed." | ||||||
|  |   fi | ||||||
|  | } | ||||||
|  |  | ||||||
|  | # Call the cleanup docker function | ||||||
|  | cleanup_docker | ||||||
|  |  | ||||||
|  | echo "--- Resetting GPUs" | ||||||
|  |  | ||||||
|  | echo "reset" > /opt/amdgpu/etc/gpu_state | ||||||
|  |  | ||||||
|  | while true; do | ||||||
|  |         sleep 3 | ||||||
|  |         if grep -q clean /opt/amdgpu/etc/gpu_state; then | ||||||
|  |                 echo "GPUs state is \"clean\"" | ||||||
|  |                 break | ||||||
|  |         fi | ||||||
|  | done | ||||||
|  |  | ||||||
|  | echo "--- Building container" | ||||||
|  | sha=$(git rev-parse --short HEAD) | ||||||
|  | image_name=rocm_${sha} | ||||||
|  | container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo) | ||||||
|  | docker build \ | ||||||
|  |         -t ${image_name} \ | ||||||
|  |         -f Dockerfile.rocm \ | ||||||
|  |         --progress plain \ | ||||||
|  |         . | ||||||
|  |  | ||||||
|  | remove_docker_container() { | ||||||
|  |    docker rm -f ${container_name} || docker image rm -f ${image_name} || true | ||||||
|  | } | ||||||
|  | trap remove_docker_container EXIT | ||||||
|  |  | ||||||
|  | echo "--- Running container" | ||||||
|  |  | ||||||
|  | docker run \ | ||||||
|  |         --device /dev/kfd --device /dev/dri \ | ||||||
|  |         --network host \ | ||||||
|  |         --rm \ | ||||||
|  |         -e HF_TOKEN \ | ||||||
|  |         --name ${container_name} \ | ||||||
|  |         ${image_name} \ | ||||||
|  |         /bin/bash -c "${@}" | ||||||
|  |  | ||||||
| @ -1,30 +1,28 @@ | |||||||
| #!/bin/bash |  | ||||||
| 
 |  | ||||||
| # This script is run by buildkite to run the benchmarks and upload the results to buildkite | # This script is run by buildkite to run the benchmarks and upload the results to buildkite | ||||||
| 
 | 
 | ||||||
| set -ex | set -ex | ||||||
| set -o pipefail | set -o pipefail | ||||||
| 
 | 
 | ||||||
| # cd 2 levels into the working directory | # cd into parent directory of this file | ||||||
| cd "$(dirname "${BASH_SOURCE[0]}")/../.." | cd "$(dirname "${BASH_SOURCE[0]}")/.." | ||||||
| 
 | 
 | ||||||
| (which wget && which curl) || (apt-get update && apt-get install -y wget curl) | (which wget && which curl) || (apt-get update && apt-get install -y wget curl) | ||||||
| 
 | 
 | ||||||
| # run python-based benchmarks and upload the result to buildkite | # run python-based benchmarks and upload the result to buildkite | ||||||
| vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt | python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt | ||||||
| bench_latency_exit_code=$? | bench_latency_exit_code=$? | ||||||
| 
 | 
 | ||||||
| vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt | python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt | ||||||
| bench_throughput_exit_code=$? | bench_throughput_exit_code=$? | ||||||
| 
 | 
 | ||||||
| # run server-based benchmarks and upload the result to buildkite | # run server-based benchmarks and upload the result to buildkite | ||||||
| vllm serve meta-llama/Llama-2-7b-chat-hf & | python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & | ||||||
| server_pid=$! | server_pid=$! | ||||||
| wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json | ||||||
| 
 | 
 | ||||||
| # wait for server to start, timeout after 600 seconds | # wait for server to start, timeout after 600 seconds | ||||||
| timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 | timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 | ||||||
| vllm bench serve \ | python3 benchmarks/benchmark_serving.py \ | ||||||
|     --backend vllm \ |     --backend vllm \ | ||||||
|     --dataset-name sharegpt \ |     --dataset-name sharegpt \ | ||||||
|     --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ |     --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ | ||||||
| @ -52,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md | |||||||
| sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line | sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line | ||||||
| echo "" >> benchmark_results.md | echo "" >> benchmark_results.md | ||||||
| echo '```' >> benchmark_results.md | echo '```' >> benchmark_results.md | ||||||
| tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines | tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines | ||||||
| echo '```' >> benchmark_results.md | echo '```' >> benchmark_results.md | ||||||
| 
 | 
 | ||||||
| # if the agent binary is not found, skip uploading the results, exit 0 | # if the agent binary is not found, skip uploading the results, exit 0 | ||||||
| if [ ! -f /usr/bin/buildkite-agent ]; then | if [ ! -f /workspace/buildkite-agent ]; then | ||||||
|     exit 0 |     exit 0 | ||||||
| fi | fi | ||||||
| 
 | 
 | ||||||
| # upload the results to buildkite | # upload the results to buildkite | ||||||
| buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md | /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md | ||||||
| 
 | 
 | ||||||
| # exit with the exit code of the benchmarks | # exit with the exit code of the benchmarks | ||||||
| if [ $bench_latency_exit_code -ne 0 ]; then | if [ $bench_latency_exit_code -ne 0 ]; then | ||||||
| @ -77,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then | |||||||
| fi | fi | ||||||
| 
 | 
 | ||||||
| rm ShareGPT_V3_unfiltered_cleaned_split.json | rm ShareGPT_V3_unfiltered_cleaned_split.json | ||||||
| buildkite-agent artifact upload "*.json" | /workspace/buildkite-agent artifact upload "*.json" | ||||||
							
								
								
									
.buildkite/run-cpu-test.sh (new file, 14 lines)
							| @ -0,0 +1,14 @@ | |||||||
|  | # This script builds the CPU docker image and runs the offline inference inside the container. | ||||||
|  | # It serves as a sanity check for compilation and basic model usage. | ||||||
|  | set -ex | ||||||
|  |  | ||||||
|  | # Try building the docker image | ||||||
|  | docker build -t cpu-test -f Dockerfile.cpu . | ||||||
|  |  | ||||||
|  | # Setup cleanup | ||||||
|  | remove_docker_container() { docker rm -f cpu-test || true; } | ||||||
|  | trap remove_docker_container EXIT | ||||||
|  | remove_docker_container | ||||||
|  |  | ||||||
|  | # Run the image and launch offline inference | ||||||
|  | docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py | ||||||
							
								
								
									
.buildkite/run-neuron-test.sh (new file, 51 lines)
							| @ -0,0 +1,51 @@ | |||||||
|  | # This script builds the Neuron docker image and runs the API server inside the container. | ||||||
|  | # It serves as a sanity check for compilation and basic model usage. | ||||||
|  | set -e | ||||||
|  |  | ||||||
|  | # Try building the docker image | ||||||
|  | aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com | ||||||
|  |  | ||||||
|  | # Prune old images and containers to save disk space, but only once a day, | ||||||
|  | # tracked via a timestamp file in /tmp. | ||||||
|  | if [ -f /tmp/neuron-docker-build-timestamp ]; then | ||||||
|  |     last_build=$(cat /tmp/neuron-docker-build-timestamp) | ||||||
|  |     current_time=$(date +%s) | ||||||
|  |     if [ $((current_time - last_build)) -gt 86400 ]; then | ||||||
|  |         docker system prune -f | ||||||
|  |         echo $current_time > /tmp/neuron-docker-build-timestamp | ||||||
|  |     fi | ||||||
|  | else | ||||||
|  |     echo $(date +%s) > /tmp/neuron-docker-build-timestamp | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | docker build -t neuron -f Dockerfile.neuron . | ||||||
|  |  | ||||||
|  | # Setup cleanup | ||||||
|  | remove_docker_container() { docker rm -f neuron || true; } | ||||||
|  | trap remove_docker_container EXIT | ||||||
|  | remove_docker_container | ||||||
|  |  | ||||||
|  | # Run the image | ||||||
|  | docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ | ||||||
|  |        --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & | ||||||
|  |  | ||||||
|  | # Wait for the server to start | ||||||
|  | wait_for_server_to_start() { | ||||||
|  |     timeout=300 | ||||||
|  |     counter=0 | ||||||
|  |  | ||||||
|  |     while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do | ||||||
|  |         sleep 1 | ||||||
|  |         counter=$((counter + 1)) | ||||||
|  |         if [ $counter -ge $timeout ]; then | ||||||
|  |             echo "Timeout after $timeout seconds" | ||||||
|  |             break | ||||||
|  |         fi | ||||||
|  |     done | ||||||
|  | } | ||||||
|  | wait_for_server_to_start | ||||||
|  |  | ||||||
|  | # Test a simple prompt | ||||||
|  | curl -X POST -H "Content-Type: application/json" \ | ||||||
|  |     localhost:8000/generate \ | ||||||
|  |     -d '{"prompt": "San Francisco is a"}' | ||||||
| @ -1,46 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| set -ex |  | ||||||
|  |  | ||||||
| # Get release version and strip leading 'v' if present |  | ||||||
| RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//') |  | ||||||
|  |  | ||||||
| if [ -z "$RELEASE_VERSION" ]; then |  | ||||||
|   echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid." |  | ||||||
|   exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF |  | ||||||
| To download the wheel: |  | ||||||
| \`\`\` |  | ||||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . |  | ||||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . |  | ||||||
|  |  | ||||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . |  | ||||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . |  | ||||||
| \`\`\` |  | ||||||
|  |  | ||||||
| To download and upload the image: |  | ||||||
|  |  | ||||||
| \`\`\` |  | ||||||
| docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 |  | ||||||
| docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 |  | ||||||
|  |  | ||||||
| docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 |  | ||||||
| docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 |  | ||||||
| docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 |  | ||||||
| docker push vllm/vllm-openai:latest-x86_64 |  | ||||||
| docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 |  | ||||||
|  |  | ||||||
| docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 |  | ||||||
| docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 |  | ||||||
| docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 |  | ||||||
| docker push vllm/vllm-openai:latest-aarch64 |  | ||||||
| docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 |  | ||||||
|  |  | ||||||
| docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend |  | ||||||
| docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend |  | ||||||
| docker manifest push vllm/vllm-openai:latest |  | ||||||
| docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} |  | ||||||
| \`\`\` |  | ||||||
| EOF  |  | ||||||
| @ -1,17 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
| # Usage: ./ci_clean_log.sh ci.log |  | ||||||
| # This script strips timestamps and color codes from CI log files. |  | ||||||
|  |  | ||||||
| # Check if argument is given |  | ||||||
| if [ $# -lt 1 ]; then |  | ||||||
|     echo "Usage: $0 ci.log" |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| INPUT_FILE="$1" |  | ||||||
|  |  | ||||||
| # Strip timestamps |  | ||||||
| sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE" |  | ||||||
|  |  | ||||||
| # Strip colorization |  | ||||||
| sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE" |  | ||||||
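As a hedged illustration of what the two sed passes above strip out, here is a fabricated log line (not taken from a real CI run) before and after cleaning:

    # Hypothetical example showing the effect of the timestamp- and color-stripping commands.
    printf '[2024-01-02T03:04:05Z] \033[32mPASS\033[0m tests/test_example.py\n' > sample.log
    sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' sample.log
    sed -i -r 's/\x1B\[[0-9;]*[mK]//g' sample.log
    cat sample.log   # prints: PASS tests/test_example.py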
| @ -1,120 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| set -ex |  | ||||||
|  |  | ||||||
| # Clean up old nightly builds from DockerHub, keeping only the last 14 builds |  | ||||||
| # This script uses DockerHub API to list and delete old tags with "nightly-" prefix |  | ||||||
|  |  | ||||||
| # DockerHub API endpoint for vllm/vllm-openai repository |  | ||||||
| REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" |  | ||||||
|  |  | ||||||
| # Get DockerHub credentials from environment |  | ||||||
| if [ -z "$DOCKERHUB_TOKEN" ]; then |  | ||||||
|     echo "Error: DOCKERHUB_TOKEN environment variable is not set" |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| if [ -z "$DOCKERHUB_USERNAME" ]; then |  | ||||||
|     echo "Error: DOCKERHUB_USERNAME environment variable is not set" |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Get DockerHub bearer token |  | ||||||
| echo "Getting DockerHub bearer token..." |  | ||||||
| set +x |  | ||||||
| BEARER_TOKEN=$(curl -s -X POST \ |  | ||||||
|     -H "Content-Type: application/json" \ |  | ||||||
|     -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \ |  | ||||||
|     "https://hub.docker.com/v2/users/login" | jq -r '.token') |  | ||||||
| set -x |  | ||||||
|  |  | ||||||
| if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then |  | ||||||
|     echo "Error: Failed to get DockerHub bearer token" |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Function to get all tags from DockerHub |  | ||||||
| get_all_tags() { |  | ||||||
|     local page=1 |  | ||||||
|     local all_tags="" |  | ||||||
|      |  | ||||||
|     while true; do |  | ||||||
|         set +x |  | ||||||
|         local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \ |  | ||||||
|             "$REPO_API_URL?page=$page&page_size=100") |  | ||||||
|         set -x |  | ||||||
|          |  | ||||||
|         # Get both last_updated timestamp and tag name, separated by | |  | ||||||
|         local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"') |  | ||||||
|          |  | ||||||
|         if [ -z "$tags" ]; then |  | ||||||
|             break |  | ||||||
|         fi |  | ||||||
|          |  | ||||||
|         all_tags="$all_tags$tags"$'\n' |  | ||||||
|         page=$((page + 1)) |  | ||||||
|     done |  | ||||||
|      |  | ||||||
|     # Sort by timestamp (newest first) and extract just the tag names |  | ||||||
|     echo "$all_tags" | sort -r | cut -d'|' -f2 |  | ||||||
| } |  | ||||||
|  |  | ||||||
| delete_tag() { |  | ||||||
|     local tag_name="$1" |  | ||||||
|     echo "Deleting tag: $tag_name" |  | ||||||
|      |  | ||||||
|     local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" |  | ||||||
|     set +x |  | ||||||
|     local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url") |  | ||||||
|     set -x |  | ||||||
|      |  | ||||||
|     if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then |  | ||||||
|         echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')" |  | ||||||
|     else |  | ||||||
|         echo "Successfully deleted tag: $tag_name" |  | ||||||
|     fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first) |  | ||||||
| echo "Fetching all tags from DockerHub..." |  | ||||||
| all_tags=$(get_all_tags) |  | ||||||
|  |  | ||||||
| if [ -z "$all_tags" ]; then |  | ||||||
|     echo "No tags found to clean up" |  | ||||||
|     exit 0 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Count total tags |  | ||||||
| total_tags=$(echo "$all_tags" | wc -l) |  | ||||||
| echo "Found $total_tags tags" |  | ||||||
|  |  | ||||||
| # Keep only the last 14 builds (including the current one) |  | ||||||
| tags_to_keep=14 |  | ||||||
| tags_to_delete=$((total_tags - tags_to_keep)) |  | ||||||
|  |  | ||||||
| if [ $tags_to_delete -le 0 ]; then |  | ||||||
|     echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)" |  | ||||||
|     exit 0 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep" |  | ||||||
|  |  | ||||||
| # Get tags to delete (skip the first $tags_to_keep tags) |  | ||||||
| tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1))) |  | ||||||
|  |  | ||||||
| if [ -z "$tags_to_delete_list" ]; then |  | ||||||
|     echo "No tags to delete" |  | ||||||
|     exit 0 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Delete old tags |  | ||||||
| echo "Deleting old tags..." |  | ||||||
| while IFS= read -r tag; do |  | ||||||
|     if [ -n "$tag" ]; then |  | ||||||
|         delete_tag "$tag" |  | ||||||
|         # Add a small delay to avoid rate limiting |  | ||||||
|         sleep 1 |  | ||||||
|     fi |  | ||||||
| done <<< "$tags_to_delete_list" |  | ||||||
|  |  | ||||||
| echo "Cleanup completed successfully" |  | ||||||
| @ -1,231 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # This script runs tests inside the corresponding ROCm docker container. |  | ||||||
| set -o pipefail |  | ||||||
|  |  | ||||||
| # Export Python path |  | ||||||
| export PYTHONPATH=".." |  | ||||||
|  |  | ||||||
| # Confirm the GPUs start in a clean state |  | ||||||
| echo "--- Confirming Clean Initial State" |  | ||||||
| while true; do |  | ||||||
|         sleep 3 |  | ||||||
|         if grep -q clean /opt/amdgpu/etc/gpu_state; then |  | ||||||
|                 echo "GPUs state is \"clean\"" |  | ||||||
|                 break |  | ||||||
|         fi |  | ||||||
| done |  | ||||||
|  |  | ||||||
| echo "--- ROCm info" |  | ||||||
| rocminfo |  | ||||||
|  |  | ||||||
| # cleanup older docker images |  | ||||||
| cleanup_docker() { |  | ||||||
|   # Get Docker's root directory |  | ||||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') |  | ||||||
|   if [ -z "$docker_root" ]; then |  | ||||||
|     echo "Failed to determine Docker root directory." |  | ||||||
|     exit 1 |  | ||||||
|   fi |  | ||||||
|   echo "Docker root directory: $docker_root" |  | ||||||
|   # Check disk usage of the filesystem where Docker's root directory is located |  | ||||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') |  | ||||||
|   # Define the threshold |  | ||||||
|   threshold=70 |  | ||||||
|   if [ "$disk_usage" -gt "$threshold" ]; then |  | ||||||
|     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." |  | ||||||
|     # Remove dangling images (those that are not tagged and not used by any container) |  | ||||||
|     docker image prune -f |  | ||||||
|     # Remove unused volumes / force the system prune for old images as well. |  | ||||||
|     docker volume prune -f && docker system prune --force --filter "until=72h" --all |  | ||||||
|     echo "Docker images and volumes cleanup completed." |  | ||||||
|   else |  | ||||||
|     echo "Disk usage is below $threshold%. No cleanup needed." |  | ||||||
|   fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # Call the cleanup docker function |  | ||||||
| cleanup_docker |  | ||||||
|  |  | ||||||
| echo "--- Resetting GPUs" |  | ||||||
|  |  | ||||||
| echo "reset" > /opt/amdgpu/etc/gpu_state |  | ||||||
|  |  | ||||||
| while true; do |  | ||||||
|         sleep 3 |  | ||||||
|         if grep -q clean /opt/amdgpu/etc/gpu_state; then |  | ||||||
|                 echo "GPUs state is \"clean\"" |  | ||||||
|                 break |  | ||||||
|         fi |  | ||||||
| done |  | ||||||
|  |  | ||||||
| echo "--- Pulling container"  |  | ||||||
| image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" |  | ||||||
| container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" |  | ||||||
| docker pull "${image_name}" |  | ||||||
|  |  | ||||||
| remove_docker_container() { |  | ||||||
|    docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true |  | ||||||
| } |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
|  |  | ||||||
| echo "--- Running container" |  | ||||||
|  |  | ||||||
| HF_CACHE="$(realpath ~)/huggingface" |  | ||||||
| mkdir -p "${HF_CACHE}" |  | ||||||
| HF_MOUNT="/root/.cache/huggingface" |  | ||||||
|  |  | ||||||
| commands=$@ |  | ||||||
| echo "Commands:$commands" |  | ||||||
|  |  | ||||||
| if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then |  | ||||||
|   commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"} |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then |  | ||||||
|   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then |  | ||||||
|   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| if [[ $commands == *"pytest -v -s lora"* ]]; then |  | ||||||
|   commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Ignore certain kernel tests |  | ||||||
| if [[ $commands == *" kernels/core"* ]]; then |  | ||||||
|   commands="${commands} \ |  | ||||||
|   --ignore=kernels/core/test_fused_quant_layernorm.py \ |  | ||||||
|   --ignore=kernels/core/test_permute_cols.py" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| if [[ $commands == *" kernels/attention"* ]]; then |  | ||||||
|   commands="${commands} \ |  | ||||||
|   --ignore=kernels/attention/test_attention_selector.py \ |  | ||||||
|   --ignore=kernels/attention/test_encoder_decoder_attn.py \ |  | ||||||
|   --ignore=kernels/attention/test_flash_attn.py \ |  | ||||||
|   --ignore=kernels/attention/test_flashinfer.py \ |  | ||||||
|   --ignore=kernels/attention/test_prefix_prefill.py \ |  | ||||||
|   --ignore=kernels/attention/test_cascade_flash_attn.py \ |  | ||||||
|   --ignore=kernels/attention/test_mha_attn.py \ |  | ||||||
|   --ignore=kernels/attention/test_lightning_attn.py \ |  | ||||||
|   --ignore=kernels/attention/test_attention.py" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| if [[ $commands == *" kernels/quantization"* ]]; then |  | ||||||
|   commands="${commands} \ |  | ||||||
|   --ignore=kernels/quantization/test_int8_quant.py \ |  | ||||||
|   --ignore=kernels/quantization/test_machete_mm.py \ |  | ||||||
|   --ignore=kernels/quantization/test_block_fp8.py \ |  | ||||||
|   --ignore=kernels/quantization/test_block_int8.py \ |  | ||||||
|   --ignore=kernels/quantization/test_marlin_gemm.py \ |  | ||||||
|   --ignore=kernels/quantization/test_cutlass_scaled_mm.py \ |  | ||||||
|   --ignore=kernels/quantization/test_int8_kernel.py" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| if [[ $commands == *" kernels/mamba"* ]]; then |  | ||||||
|   commands="${commands} \ |  | ||||||
|   --ignore=kernels/mamba/test_mamba_mixer2.py \ |  | ||||||
|   --ignore=kernels/mamba/test_causal_conv1d.py \ |  | ||||||
|   --ignore=kernels/mamba/test_mamba_ssm_ssd.py" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| if [[ $commands == *" kernels/moe"* ]]; then |  | ||||||
|   commands="${commands} \ |  | ||||||
|   --ignore=kernels/moe/test_moe.py \ |  | ||||||
|   --ignore=kernels/moe/test_cutlass_moe.py \ |  | ||||||
|   --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Ignore certain entrypoints/openai tests |  | ||||||
| if [[ $commands == *" entrypoints/openai "* ]]; then |  | ||||||
|   commands=${commands//" entrypoints/openai "/" entrypoints/openai \ |  | ||||||
|   --ignore=entrypoints/openai/test_audio.py \ |  | ||||||
|   --ignore=entrypoints/openai/test_shutdown.py \ |  | ||||||
|   --ignore=entrypoints/openai/test_completion.py \ |  | ||||||
|   --ignore=entrypoints/openai/test_sleep.py \ |  | ||||||
|   --ignore=entrypoints/openai/test_models.py \ |  | ||||||
|   --ignore=entrypoints/openai/test_lora_adapters.py \ |  | ||||||
|   --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ |  | ||||||
|   --ignore=entrypoints/openai/test_root_path.py \ |  | ||||||
|   --ignore=entrypoints/openai/test_tokenization.py \ |  | ||||||
|   --ignore=entrypoints/openai/test_prompt_validation.py "} |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Ignore certain entrypoints/llm tests |  | ||||||
| if [[ $commands == *" entrypoints/llm "* ]]; then |  | ||||||
|   commands=${commands//" entrypoints/llm "/" entrypoints/llm \ |  | ||||||
|   --ignore=entrypoints/llm/test_chat.py \ |  | ||||||
|   --ignore=entrypoints/llm/test_accuracy.py \ |  | ||||||
|   --ignore=entrypoints/llm/test_init.py \ |  | ||||||
|   --ignore=entrypoints/llm/test_prompt_validation.py "} |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # --ignore=entrypoints/openai/test_encoder_decoder.py \ |  | ||||||
| # --ignore=entrypoints/openai/test_embedding.py \ |  | ||||||
| # --ignore=entrypoints/openai/test_oot_registration.py |  | ||||||
| # --ignore=entrypoints/openai/test_accuracy.py \ |  | ||||||
| # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| PARALLEL_JOB_COUNT=8 |  | ||||||
| MYPYTHONPATH=".." |  | ||||||
|  |  | ||||||
| # Check if the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs. |  | ||||||
| if [[ $commands == *"--shard-id="* ]]; then |  | ||||||
|   # assign job count as the number of shards used    |  | ||||||
|   commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} |  | ||||||
|   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do |  | ||||||
|     # assign shard-id for each shard |  | ||||||
|     commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} |  | ||||||
|     echo "Shard ${GPU} commands:$commands_gpu" |  | ||||||
|     echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" |  | ||||||
|     docker run \ |  | ||||||
|         --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ |  | ||||||
|         --network=host \ |  | ||||||
|         --shm-size=16gb \ |  | ||||||
|         --rm \ |  | ||||||
|         -e HIP_VISIBLE_DEVICES="${GPU}" \ |  | ||||||
|         -e HF_TOKEN \ |  | ||||||
|         -e AWS_ACCESS_KEY_ID \ |  | ||||||
|         -e AWS_SECRET_ACCESS_KEY \ |  | ||||||
|         -v "${HF_CACHE}:${HF_MOUNT}" \ |  | ||||||
|         -e "HF_HOME=${HF_MOUNT}" \ |  | ||||||
|         -e "PYTHONPATH=${MYPYTHONPATH}" \ |  | ||||||
|         --name "${container_name}_${GPU}" \ |  | ||||||
|         "${image_name}" \ |  | ||||||
|         /bin/bash -c "${commands_gpu}" \ |  | ||||||
|         |& while read -r line; do echo ">>Shard $GPU: $line"; done & |  | ||||||
|     PIDS+=($!) |  | ||||||
|   done |  | ||||||
|   #wait for all processes to finish and collect exit codes |  | ||||||
|   for pid in "${PIDS[@]}"; do |  | ||||||
|     wait "${pid}" |  | ||||||
|     STATUS+=($?) |  | ||||||
|   done |  | ||||||
|   for st in "${STATUS[@]}"; do |  | ||||||
|     if [[ ${st} -ne 0 ]]; then |  | ||||||
|       echo "One of the processes failed with $st" |  | ||||||
|       exit "${st}" |  | ||||||
|     fi |  | ||||||
|   done |  | ||||||
| else |  | ||||||
|   echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" |  | ||||||
|   docker run \ |  | ||||||
|           --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ |  | ||||||
|           --network=host \ |  | ||||||
|           --shm-size=16gb \ |  | ||||||
|           --rm \ |  | ||||||
|           -e HIP_VISIBLE_DEVICES=0 \ |  | ||||||
|           -e HF_TOKEN \ |  | ||||||
|           -e AWS_ACCESS_KEY_ID \ |  | ||||||
|           -e AWS_SECRET_ACCESS_KEY \ |  | ||||||
|           -v "${HF_CACHE}:${HF_MOUNT}" \ |  | ||||||
|           -e "HF_HOME=${HF_MOUNT}" \ |  | ||||||
|           -e "PYTHONPATH=${MYPYTHONPATH}" \ |  | ||||||
|           --name "${container_name}" \ |  | ||||||
|           "${image_name}" \ |  | ||||||
|           /bin/bash -c "${commands}" |  | ||||||
| fi |  | ||||||
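The shard fan-out above relies on plain bash string substitution over the incoming command template. A hedged sketch with a made-up pytest command shows how one template becomes one command per GPU:

    # Hypothetical command template with empty shard flags, as the pipeline would pass it in.
    commands="pytest -v -s basic_correctness --shard-id= --num-shards= "
    PARALLEL_JOB_COUNT=8
    commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
    for GPU in $(seq 0 $((PARALLEL_JOB_COUNT - 1))); do
        commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
        echo "$commands_gpu"   # e.g. "... --shard-id=3 --num-shards=8 " when GPU is 3
    done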
| @ -1,52 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # This script builds the CPU docker image and runs the offline inference inside the container. |  | ||||||
| # It serves as a sanity check for compilation and basic model usage. |  | ||||||
| set -ex |  | ||||||
|  |  | ||||||
| # Setup cleanup |  | ||||||
| remove_docker_container() { |  | ||||||
|   if [[ -n "$container_id" ]]; then |  | ||||||
|       podman stop --all -t0 |  | ||||||
|       podman rm -f "$container_id" || true |  | ||||||
|   fi |  | ||||||
|   podman system prune -f |  | ||||||
| } |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
| remove_docker_container |  | ||||||
|  |  | ||||||
| # Try building the docker image |  | ||||||
| podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le . |  | ||||||
|  |  | ||||||
| # Run the image |  | ||||||
| container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc) |  | ||||||
|  |  | ||||||
| function cpu_tests() { |  | ||||||
|  |  | ||||||
|   # offline inference |  | ||||||
|   podman exec -it "$container_id" bash -c " |  | ||||||
|     set -xve |  | ||||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log |  | ||||||
|  |  | ||||||
|   # Run basic model test |  | ||||||
|   podman exec -it "$container_id" bash -c " |  | ||||||
|     set -evx |  | ||||||
|     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib |  | ||||||
|     pip install sentence-transformers datamodel_code_generator |  | ||||||
|  |  | ||||||
|     # Note: disable Bart until supports V1 |  | ||||||
|     # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model |  | ||||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] |  | ||||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] |  | ||||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] |  | ||||||
|     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] |  | ||||||
|     # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. |  | ||||||
|     # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # All CPU tests are expected to finish in under 40 minutes. |  | ||||||
|  |  | ||||||
| export container_id |  | ||||||
| export -f cpu_tests |  | ||||||
| timeout 120m bash -c cpu_tests |  | ||||||
|  |  | ||||||
| @ -1,13 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # This script builds the CPU docker image and runs the offline inference inside the container. |  | ||||||
| # It serves as a sanity check for compilation and basic model usage. |  | ||||||
| set -ex |  | ||||||
|  |  | ||||||
| # Setup cleanup |  | ||||||
| remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
| remove_docker_container |  | ||||||
|  |  | ||||||
| # Try building the docker image |  | ||||||
| docker build -t cpu-test -f docker/Dockerfile.s390x . |  | ||||||
| @ -1,119 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # This script builds the CPU docker image and runs the offline inference inside the container. |  | ||||||
| # It serves as a sanity check for compilation and basic model usage. |  | ||||||
| set -ex |  | ||||||
|  |  | ||||||
| # Allow binding to different cores |  | ||||||
| CORE_RANGE=${CORE_RANGE:-48-95} |  | ||||||
| # used for TP/PP E2E test |  | ||||||
| OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} |  | ||||||
| NUMA_NODE=${NUMA_NODE:-1} |  | ||||||
|  |  | ||||||
| export CMAKE_BUILD_PARALLEL_LEVEL=32 |  | ||||||
|  |  | ||||||
| # Setup cleanup |  | ||||||
| remove_docker_container() { |  | ||||||
|     set -e; |  | ||||||
|     docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true; |  | ||||||
| } |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
| remove_docker_container |  | ||||||
|  |  | ||||||
| # Try building the docker image |  | ||||||
| numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . |  | ||||||
| numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . |  | ||||||
|  |  | ||||||
| # Run the image, setting --shm-size=4g for tensor parallelism. |  | ||||||
| docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" |  | ||||||
| docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 |  | ||||||
|  |  | ||||||
| function cpu_tests() { |  | ||||||
|   set -e |  | ||||||
|   export NUMA_NODE=$2 |  | ||||||
|  |  | ||||||
|   # list packages |  | ||||||
|   docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " |  | ||||||
|     set -e |  | ||||||
|     pip list" |  | ||||||
|  |  | ||||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " |  | ||||||
|     set -e |  | ||||||
|     pip list" |  | ||||||
|  |  | ||||||
|   # offline inference |  | ||||||
|   docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " |  | ||||||
|     set -e |  | ||||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" |  | ||||||
|  |  | ||||||
|   # Run kernel tests |  | ||||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " |  | ||||||
|     set -e |  | ||||||
|     pytest -x -v -s tests/kernels/test_onednn.py" |  | ||||||
|  |  | ||||||
|   # Run basic model test |  | ||||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " |  | ||||||
|     set -e |  | ||||||
|     # Note: disable until supports V1 |  | ||||||
|     # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model |  | ||||||
|     # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model |  | ||||||
|  |  | ||||||
|     pytest -x -v -s tests/models/language/generation -m cpu_model |  | ||||||
|     VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model |  | ||||||
|  |  | ||||||
|     pytest -x -v -s tests/models/language/pooling -m cpu_model |  | ||||||
|     pytest -x -v -s tests/models/multimodal/generation \ |  | ||||||
|                 --ignore=tests/models/multimodal/generation/test_pixtral.py \ |  | ||||||
|                 -m cpu_model" |  | ||||||
|  |  | ||||||
|   # Run compressed-tensor test |  | ||||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " |  | ||||||
|     set -e |  | ||||||
|     pytest -x -s -v \ |  | ||||||
|     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" |  | ||||||
|  |  | ||||||
|   # Note: disable it until supports V1 |  | ||||||
|   # Run AWQ test |  | ||||||
|   # docker exec cpu-test-"$NUMA_NODE" bash -c " |  | ||||||
|   #   set -e |  | ||||||
|   #   VLLM_USE_V1=0 pytest -x -s -v \ |  | ||||||
|   #   tests/quantization/test_ipex_quant.py" |  | ||||||
|  |  | ||||||
|   # Run multi-lora tests |  | ||||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " |  | ||||||
|     set -e |  | ||||||
|     pytest -x -s -v \ |  | ||||||
|     tests/lora/test_qwen2vl.py" |  | ||||||
|  |  | ||||||
|   # online serving: tp+pp |  | ||||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c ' |  | ||||||
|     set -e |  | ||||||
|     VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & |  | ||||||
|     server_pid=$! |  | ||||||
|     timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 |  | ||||||
|     vllm bench serve \ |  | ||||||
|       --backend vllm \ |  | ||||||
|       --dataset-name random \ |  | ||||||
|       --model meta-llama/Llama-3.2-3B-Instruct \ |  | ||||||
|       --num-prompts 20 \ |  | ||||||
|       --endpoint /v1/completions |  | ||||||
|     kill -s SIGTERM $server_pid &' |  | ||||||
|  |  | ||||||
|   # online serving: tp+dp |  | ||||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c ' |  | ||||||
|     set -e |  | ||||||
|     VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 & |  | ||||||
|     server_pid=$! |  | ||||||
|     timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 |  | ||||||
|     vllm bench serve \ |  | ||||||
|       --backend vllm \ |  | ||||||
|       --dataset-name random \ |  | ||||||
|       --model meta-llama/Llama-3.2-3B-Instruct \ |  | ||||||
|       --num-prompts 20 \ |  | ||||||
|       --endpoint /v1/completions |  | ||||||
|     kill -s SIGTERM $server_pid &' |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # All CPU tests are expected to finish in under 40 minutes. |  | ||||||
| export -f cpu_tests |  | ||||||
| timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" |  | ||||||
| @ -1,29 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # This script builds the GH200 docker image and runs the offline inference inside the container. |  | ||||||
| # It serves as a sanity check for compilation and basic model usage. |  | ||||||
| set -ex |  | ||||||
|  |  | ||||||
| # Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile |  | ||||||
| python3 use_existing_torch.py |  | ||||||
|  |  | ||||||
| # Try building the docker image |  | ||||||
| DOCKER_BUILDKIT=1 docker build . \ |  | ||||||
|   --file docker/Dockerfile \ |  | ||||||
|   --target vllm-openai \ |  | ||||||
|   --platform "linux/arm64" \ |  | ||||||
|   -t gh200-test \ |  | ||||||
|   --build-arg max_jobs=66 \ |  | ||||||
|   --build-arg nvcc_threads=2 \ |  | ||||||
|   --build-arg RUN_WHEEL_CHECK=false \ |  | ||||||
|   --build-arg torch_cuda_arch_list="9.0+PTX" |  | ||||||
|  |  | ||||||
| # Setup cleanup |  | ||||||
| remove_docker_container() { docker rm -f gh200-test || true; } |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
| remove_docker_container |  | ||||||
|  |  | ||||||
| # Run the image and test offline inference |  | ||||||
| docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' |  | ||||||
|     python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B |  | ||||||
| ' |  | ||||||
| @ -1,56 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # This script builds the HPU plugin docker image and runs the plugin tests inside the container. |  | ||||||
| # It serves as a sanity check for compilation and basic model usage. |  | ||||||
| set -exuo pipefail |  | ||||||
|  |  | ||||||
| # Try building the docker image |  | ||||||
| cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - . |  | ||||||
| FROM gaudi-base-image:latest |  | ||||||
|  |  | ||||||
| COPY ./ /workspace/vllm |  | ||||||
|  |  | ||||||
| WORKDIR /workspace/vllm |  | ||||||
|  |  | ||||||
| ENV no_proxy=localhost,127.0.0.1 |  | ||||||
| ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true |  | ||||||
|  |  | ||||||
| RUN VLLM_TARGET_DEVICE=empty pip install . |  | ||||||
| RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git |  | ||||||
|  |  | ||||||
| # install development dependencies (for testing) |  | ||||||
| RUN python3 -m pip install -e tests/vllm_test_utils |  | ||||||
|  |  | ||||||
| WORKDIR /workspace/ |  | ||||||
|  |  | ||||||
| RUN git clone https://github.com/vllm-project/vllm-gaudi.git |  | ||||||
|  |  | ||||||
| RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks |  | ||||||
|  |  | ||||||
| EOF |  | ||||||
|  |  | ||||||
| # Setup cleanup |  | ||||||
| # Certain versions of the HPU software stack have a bug that can |  | ||||||
| # override the exit code of the script, so we need to use separate |  | ||||||
| # remove_docker_containers and remove_docker_containers_and_exit |  | ||||||
| # functions, while other platforms only need one remove_docker_container |  | ||||||
| # function. |  | ||||||
| EXITCODE=1 |  | ||||||
| remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; } |  | ||||||
| trap 'remove_docker_containers; exit $EXITCODE;' EXIT |  | ||||||
| remove_docker_containers |  | ||||||
|  |  | ||||||
| echo "Running HPU plugin v1 test" |  | ||||||
| docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \ |  | ||||||
|   -e HABANA_VISIBLE_DEVICES=all \ |  | ||||||
|   hpu-plugin-v1-test-env \ |  | ||||||
|   /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh" |  | ||||||
|  |  | ||||||
| EXITCODE=$? |  | ||||||
| if [ $EXITCODE -eq 0 ]; then |  | ||||||
|   echo "Test with basic model passed" |  | ||||||
| else |  | ||||||
|   echo "Test with basic model FAILED with exit code: $EXITCODE" >&2 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # The trap will handle the container removal and final exit. |  | ||||||
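The cleanup arrangement above preserves the test's exit status even if a later command misbehaves: the EXIT trap both removes the container and re-exits with the saved code. A minimal sketch of the same pattern with toy commands:

    # Hedged sketch of the exit-code-preserving trap pattern (toy cleanup and test command).
    EXITCODE=1
    cleanup() { echo "cleaning up"; }
    trap 'cleanup; exit $EXITCODE;' EXIT
    true                # stand-in for the real test command
    EXITCODE=$?         # captured before the EXIT trap fires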
| @ -1,191 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # This script builds the Ascend NPU docker image and runs the offline inference inside the container. |  | ||||||
| # It serves as a sanity check for compilation and basic model usage. |  | ||||||
| set -ex |  | ||||||
|  |  | ||||||
| # Base ubuntu image with basic ascend development libraries and python installed |  | ||||||
| VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git" |  | ||||||
| CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg" |  | ||||||
| TEST_RUN_CONFIG_FILE="vllm_test.cfg" |  | ||||||
| VLLM_ASCEND_TMP_DIR= |  | ||||||
| # Get the test run configuration file from the vllm-ascend repository |  | ||||||
| fetch_vllm_test_cfg() { |  | ||||||
|     VLLM_ASCEND_TMP_DIR=$(mktemp -d) |  | ||||||
|     # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval |  | ||||||
|     cleanup() { |  | ||||||
|         rm -rf "${VLLM_ASCEND_TMP_DIR}" |  | ||||||
|     } |  | ||||||
|     trap cleanup EXIT |  | ||||||
|  |  | ||||||
|     GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}" |  | ||||||
|     if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then |  | ||||||
|         echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the repository" >&2 |  | ||||||
|         exit 1 |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     # If the file already exists locally, just overwrite it |  | ||||||
|     cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}" |  | ||||||
|     echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}" |  | ||||||
|  |  | ||||||
|     # The trap set above will be overwritten later; by this point it has already |  | ||||||
|     # done its cleanup-on-error job, so delete the temporary resources manually |  | ||||||
|     # and clear the trap here. |  | ||||||
|     rm -rf "${VLLM_ASCEND_TMP_DIR}" |  | ||||||
|     trap - EXIT |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # Loads the test run configuration file fetched above into the |  | ||||||
| # current script environment. |  | ||||||
| get_config() { |  | ||||||
|     if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then |  | ||||||
|         echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist" >&2 |  | ||||||
|         exit 1 |  | ||||||
|     fi |  | ||||||
|     source "${TEST_RUN_CONFIG_FILE}" |  | ||||||
|     echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" |  | ||||||
|     return 0 |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # get test running configuration. |  | ||||||
| fetch_vllm_test_cfg |  | ||||||
| get_config |  | ||||||
| # Check if the function call was successful. If not, exit the script. |  | ||||||
| if [ $? -ne 0 ]; then |  | ||||||
|   exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}" |  | ||||||
| container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" |  | ||||||
|  |  | ||||||
| # BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards |  | ||||||
| agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}') |  | ||||||
| echo "agent_idx: ${agent_idx}" |  | ||||||
| builder_name="cachebuilder${agent_idx}" |  | ||||||
| builder_cache_dir="/mnt/docker-cache${agent_idx}" |  | ||||||
| mkdir -p ${builder_cache_dir} |  | ||||||
|  |  | ||||||
| # Try building the docker image |  | ||||||
| cat <<EOF | DOCKER_BUILDKIT=1 docker build \ |  | ||||||
|     --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \ |  | ||||||
|     --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \ |  | ||||||
|                            --cache-to type=local,dest=${builder_cache_dir},mode=max \ |  | ||||||
|     --progress=plain --load -t ${image_name} -f - . |  | ||||||
| FROM ${BASE_IMAGE_NAME} |  | ||||||
|  |  | ||||||
| # Define environments |  | ||||||
| ENV DEBIAN_FRONTEND=noninteractive |  | ||||||
|  |  | ||||||
| RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \ |  | ||||||
|     pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \ |  | ||||||
|     apt-get update -y && \ |  | ||||||
|     apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ |  | ||||||
|     rm -rf /var/cache/apt/* && \ |  | ||||||
|     rm -rf /var/lib/apt/lists/* |  | ||||||
|  |  | ||||||
| # Install pytest up front so that this docker build cache layer stays valid |  | ||||||
| RUN --mount=type=cache,target=/root/.cache/pip \ |  | ||||||
|     pip install pytest>=6.0  modelscope |  | ||||||
|  |  | ||||||
| WORKDIR /workspace/vllm |  | ||||||
|  |  | ||||||
| # Install vLLM dependencies in advance so that, as long as common.txt remains unchanged, this docker cache layer stays valid. |  | ||||||
| COPY requirements/common.txt /workspace/vllm/requirements/common.txt |  | ||||||
| RUN --mount=type=cache,target=/root/.cache/pip \ |  | ||||||
|     pip install -r requirements/common.txt |  | ||||||
|  |  | ||||||
| COPY . . |  | ||||||
|  |  | ||||||
| # Install vLLM |  | ||||||
| RUN --mount=type=cache,target=/root/.cache/pip \ |  | ||||||
|     VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ |  | ||||||
|     python3 -m pip uninstall -y triton |  | ||||||
|  |  | ||||||
| # Install vllm-ascend |  | ||||||
| WORKDIR /workspace |  | ||||||
| ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git |  | ||||||
| ARG VLLM_ASCEND_TAG=main |  | ||||||
| RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \ |  | ||||||
|     git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend |  | ||||||
|  |  | ||||||
| # Install vllm-ascend dependencies in advance so that, as long as requirements.txt remains unchanged, this docker cache layer stays valid. |  | ||||||
| RUN --mount=type=cache,target=/root/.cache/pip \ |  | ||||||
|     pip install -r /workspace/vllm-ascend/requirements.txt |  | ||||||
|  |  | ||||||
| RUN --mount=type=cache,target=/root/.cache/pip \ |  | ||||||
|     export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ |  | ||||||
|     source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ |  | ||||||
|     source /usr/local/Ascend/nnal/atb/set_env.sh && \ |  | ||||||
|     export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ |  | ||||||
|     python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ |  | ||||||
|  |  | ||||||
| ENV VLLM_WORKER_MULTIPROC_METHOD=spawn |  | ||||||
| ENV VLLM_USE_MODELSCOPE=True |  | ||||||
|  |  | ||||||
| WORKDIR /workspace/vllm-ascend |  | ||||||
|  |  | ||||||
| CMD ["/bin/bash"] |  | ||||||
|  |  | ||||||
| EOF |  | ||||||
|  |  | ||||||
| # Setup cleanup |  | ||||||
| remove_docker_container() { |  | ||||||
|   docker rm -f "${container_name}" || true; |  | ||||||
|   docker image rm -f "${image_name}" || true; |  | ||||||
|   docker system prune -f || true; |  | ||||||
| } |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
|  |  | ||||||
| # Generate the corresponding --device args based on BUILDKITE_AGENT_NAME |  | ||||||
| # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1. |  | ||||||
| #   e.g. atlas-a2-001-1-2cards means this is the first agent on the atlas-a2-001 host, and it has 2 NPU cards. |  | ||||||
| #   returns --device /dev/davinci0 --device /dev/davinci1 |  | ||||||
| parse_and_gen_devices() { |  | ||||||
|     local input="$1" |  | ||||||
|     local index cards_num |  | ||||||
|     if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then |  | ||||||
|         index="${BASH_REMATCH[1]}" |  | ||||||
|         cards_num="${BASH_REMATCH[2]}" |  | ||||||
|     else |  | ||||||
|         echo "parse error" >&2 |  | ||||||
|         return 1 |  | ||||||
|     fi |  | ||||||
|  |  | ||||||
|     local devices="" |  | ||||||
|     local i=0 |  | ||||||
|     while (( i < cards_num )); do |  | ||||||
|         local dev_idx=$(((index - 1)*cards_num + i )) |  | ||||||
|         devices="$devices --device /dev/davinci${dev_idx}" |  | ||||||
|         ((i++)) |  | ||||||
|     done |  | ||||||
|  |  | ||||||
|     # trim leading space |  | ||||||
|     devices="${devices#"${devices%%[![:space:]]*}"}" |  | ||||||
|     # Output devices: assigned to the caller variable |  | ||||||
|     printf '%s' "$devices" |  | ||||||
| } |  | ||||||
|  |  | ||||||
| devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1 |  | ||||||
|  |  | ||||||
| # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware. |  | ||||||
| # This test checks whether the OOT platform interface is functioning properly in conjunction with |  | ||||||
| # the hardware plugin vllm-ascend. |  | ||||||
| model_cache_dir=/mnt/modelscope${agent_idx} |  | ||||||
| mkdir -p ${model_cache_dir} |  | ||||||
| docker run \ |  | ||||||
|     ${devices} \ |  | ||||||
|     --device /dev/davinci_manager \ |  | ||||||
|     --device /dev/devmm_svm \ |  | ||||||
|     --device /dev/hisi_hdc \ |  | ||||||
|     -v /usr/local/dcmi:/usr/local/dcmi \ |  | ||||||
|     -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ |  | ||||||
|     -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ |  | ||||||
|     -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ |  | ||||||
|     -v /etc/ascend_install.info:/etc/ascend_install.info \ |  | ||||||
|     -v ${model_cache_dir}:/root/.cache/modelscope \ |  | ||||||
|     --entrypoint="" \ |  | ||||||
|     --name "${container_name}" \ |  | ||||||
|     "${image_name}" \ |  | ||||||
|     bash -c ' |  | ||||||
|     set -e |  | ||||||
|     pytest -v -s tests/e2e/vllm_interface/ |  | ||||||
| ' |  | ||||||
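A hedged usage sketch of the parse_and_gen_devices helper defined above, using the agent name given as an example in its comment (the expected output comes from that comment, not from a real run):

    # Hypothetical invocation on the documented example agent name.
    devices=$(parse_and_gen_devices "atlas-a2-001-1-2cards")
    echo "$devices"   # expected: --device /dev/davinci0 --device /dev/davinci1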
| @ -1,166 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| set -xu |  | ||||||
|  |  | ||||||
|  |  | ||||||
| remove_docker_container() {  |  | ||||||
|     docker rm -f tpu-test || true; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
|  |  | ||||||
| # Remove the container that might not be cleaned up in the previous run. |  | ||||||
| remove_docker_container |  | ||||||
|  |  | ||||||
| # Build the docker image. |  | ||||||
| docker build -f docker/Dockerfile.tpu -t vllm-tpu . |  | ||||||
|  |  | ||||||
| # Set up cleanup. |  | ||||||
| cleanup_docker() { |  | ||||||
|   # Get Docker's root directory |  | ||||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') |  | ||||||
|   if [ -z "$docker_root" ]; then |  | ||||||
|     echo "Failed to determine Docker root directory." |  | ||||||
|     exit 1 |  | ||||||
|   fi |  | ||||||
|   echo "Docker root directory: $docker_root" |  | ||||||
|   # Check disk usage of the filesystem where Docker's root directory is located |  | ||||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') |  | ||||||
|   # Define the threshold |  | ||||||
|   threshold=70 |  | ||||||
|   if [ "$disk_usage" -gt "$threshold" ]; then |  | ||||||
|     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." |  | ||||||
|     # Remove dangling images (those that are not tagged and not used by any container) |  | ||||||
|     docker image prune -f |  | ||||||
|     # Remove unused volumes / force the system prune for old images as well. |  | ||||||
|     docker volume prune -f && docker system prune --force --filter "until=72h" --all |  | ||||||
|     echo "Docker images and volumes cleanup completed." |  | ||||||
|   else |  | ||||||
|     echo "Disk usage is below $threshold%. No cleanup needed." |  | ||||||
|   fi |  | ||||||
| } |  | ||||||
| cleanup_docker |  | ||||||
|  |  | ||||||
| # For HF_TOKEN. |  | ||||||
| source /etc/environment |  | ||||||
|  |  | ||||||
| docker run --privileged --net host --shm-size=16G -it \ |  | ||||||
|     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ |  | ||||||
|     vllm-tpu /bin/bash -c ' |  | ||||||
| set -e # Exit immediately if a command exits with a non-zero status. |  | ||||||
| set -u # Treat unset variables as an error. |  | ||||||
|  |  | ||||||
| echo "--- Starting script inside Docker container ---" |  | ||||||
|  |  | ||||||
| # Create results directory |  | ||||||
| RESULTS_DIR=$(mktemp -d) |  | ||||||
| # If mktemp fails, set -e will cause the script to exit. |  | ||||||
| echo "Results will be stored in: $RESULTS_DIR" |  | ||||||
|  |  | ||||||
| # Install dependencies |  | ||||||
| echo "--- Installing Python dependencies ---" |  | ||||||
| python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ |  | ||||||
|     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ |  | ||||||
|     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ |  | ||||||
|     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 |  | ||||||
| echo "--- Python dependencies installed ---" |  | ||||||
|  |  | ||||||
| export VLLM_XLA_CHECK_RECOMPILATION=1 |  | ||||||
| export VLLM_XLA_CACHE_PATH= |  | ||||||
|  |  | ||||||
| echo "--- Hardware Information ---" |  | ||||||
| # tpu-info |  | ||||||
| echo "--- Starting Tests ---" |  | ||||||
| set +e |  | ||||||
| overall_script_exit_code=0 |  | ||||||
|  |  | ||||||
| # --- Test Definitions --- |  | ||||||
| # If a test fails, this function will print logs and will not cause the main script to exit. |  | ||||||
| run_test() { |  | ||||||
|     local test_num=$1 |  | ||||||
|     local test_name=$2 |  | ||||||
|     local test_command=$3 |  | ||||||
|     local log_file="$RESULTS_DIR/test_${test_num}.log" |  | ||||||
|     local actual_exit_code |  | ||||||
|  |  | ||||||
|     echo "--- TEST_$test_num: Running $test_name ---" |  | ||||||
|      |  | ||||||
|     # Execute the test command. |  | ||||||
|     eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) |  | ||||||
|     actual_exit_code=$? |  | ||||||
|  |  | ||||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log |  | ||||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log |  | ||||||
|  |  | ||||||
|     if [ "$actual_exit_code" -ne 0 ]; then |  | ||||||
|         echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 |  | ||||||
|         echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 |  | ||||||
|         if [ -f "$log_file" ]; then |  | ||||||
|             cat "$log_file" >&2 |  | ||||||
|         else |  | ||||||
|             echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 |  | ||||||
|         fi |  | ||||||
|         echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 |  | ||||||
|         return "$actual_exit_code" # Return the failure code |  | ||||||
|     else |  | ||||||
|         echo "TEST_$test_num ($test_name) PASSED." |  | ||||||
|         return 0 # Return success |  | ||||||
|     fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # Helper function to call run_test and update the overall script exit code |  | ||||||
| run_and_track_test() { |  | ||||||
|     local test_num_arg="$1" |  | ||||||
|     local test_name_arg="$2" |  | ||||||
|     local test_command_arg="$3" |  | ||||||
|  |  | ||||||
|     # Run the test |  | ||||||
|     run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" |  | ||||||
|     local test_specific_exit_code=$? |  | ||||||
|  |  | ||||||
|     # If the test failed, set the overall script exit code to 1 |  | ||||||
|     if [ "$test_specific_exit_code" -ne 0 ]; then |  | ||||||
|         # No need for extra echo here, run_test already logged the failure. |  | ||||||
|         overall_script_exit_code=1 |  | ||||||
|     fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # --- Actual Test Execution --- |  | ||||||
| run_and_track_test 1 "test_struct_output_generate.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" |  | ||||||
| run_and_track_test 2 "test_moe_pallas.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" |  | ||||||
| run_and_track_test 3 "test_lora.py" \ |  | ||||||
|     "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" |  | ||||||
| run_and_track_test 4 "test_tpu_qkv_linear.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" |  | ||||||
| run_and_track_test 5 "test_spmd_model_weight_loading.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" |  | ||||||
| run_and_track_test 6 "test_kv_cache_update_kernel.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" |  | ||||||
| run_and_track_test 7 "test_tpu_int8.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py" |  | ||||||
|  |  | ||||||
| # After all tests have been attempted, exit with the overall status. |  | ||||||
| if [ "$overall_script_exit_code" -ne 0 ]; then |  | ||||||
|     echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" |  | ||||||
| else |  | ||||||
|     echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" |  | ||||||
| fi |  | ||||||
| exit "$overall_script_exit_code" |  | ||||||
| ' # IMPORTANT: This is the closing single quote for the bash -c '...' command above. Ensure it is present and correct. |  | ||||||
|  |  | ||||||
| # Capture the exit code of the docker run command |  | ||||||
| DOCKER_RUN_EXIT_CODE=$? |  | ||||||
|  |  | ||||||
| # The trap will run for cleanup. |  | ||||||
| # Exit the main script with the Docker run command's exit code. |  | ||||||
| if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then |  | ||||||
|     echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." |  | ||||||
|     exit "$DOCKER_RUN_EXIT_CODE" |  | ||||||
| else |  | ||||||
|     echo "Docker run command completed successfully." |  | ||||||
|     exit 0 |  | ||||||
| fi |  | ||||||
| # TODO: This test fails because it uses RANDOM_SEED sampling |  | ||||||
| # pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ |  | ||||||
| @ -1,174 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| set -xu |  | ||||||
|  |  | ||||||
|  |  | ||||||
| remove_docker_container() {  |  | ||||||
|     docker rm -f tpu-test || true;  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
|  |  | ||||||
| # Remove the container that might not be cleaned up in the previous run. |  | ||||||
| remove_docker_container |  | ||||||
|  |  | ||||||
| # Build the docker image. |  | ||||||
| docker build -f docker/Dockerfile.tpu -t vllm-tpu . |  | ||||||
|  |  | ||||||
| # Set up cleanup. |  | ||||||
| cleanup_docker() { |  | ||||||
|   # Get Docker's root directory |  | ||||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') |  | ||||||
|   if [ -z "$docker_root" ]; then |  | ||||||
|     echo "Failed to determine Docker root directory." |  | ||||||
|     exit 1 |  | ||||||
|   fi |  | ||||||
|   echo "Docker root directory: $docker_root" |  | ||||||
|   # Check disk usage of the filesystem where Docker's root directory is located |  | ||||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') |  | ||||||
|   # Define the threshold |  | ||||||
|   threshold=70 |  | ||||||
|   if [ "$disk_usage" -gt "$threshold" ]; then |  | ||||||
|     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." |  | ||||||
|     # Remove dangling images (those that are not tagged and not used by any container) |  | ||||||
|     docker image prune -f |  | ||||||
|     # Remove unused volumes / force the system prune for old images as well. |  | ||||||
|     docker volume prune -f && docker system prune --force --filter "until=72h" --all |  | ||||||
|     echo "Docker images and volumes cleanup completed." |  | ||||||
|   else |  | ||||||
|     echo "Disk usage is below $threshold%. No cleanup needed." |  | ||||||
|   fi |  | ||||||
| } |  | ||||||
| cleanup_docker |  | ||||||
|  |  | ||||||
| # For HF_TOKEN. |  | ||||||
| source /etc/environment |  | ||||||
|  |  | ||||||
| docker run --privileged --net host --shm-size=16G -it \ |  | ||||||
|     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ |  | ||||||
|     vllm-tpu /bin/bash -c ' |  | ||||||
| set -e # Exit immediately if a command exits with a non-zero status. |  | ||||||
| set -u # Treat unset variables as an error. |  | ||||||
|  |  | ||||||
| echo "--- Starting script inside Docker container ---" |  | ||||||
|  |  | ||||||
| # Create results directory |  | ||||||
| RESULTS_DIR=$(mktemp -d) |  | ||||||
| # If mktemp fails, set -e will cause the script to exit. |  | ||||||
| echo "Results will be stored in: $RESULTS_DIR" |  | ||||||
|  |  | ||||||
| # Install dependencies |  | ||||||
| echo "--- Installing Python dependencies ---" |  | ||||||
| python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ |  | ||||||
|     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ |  | ||||||
|     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ |  | ||||||
|     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 |  | ||||||
| echo "--- Python dependencies installed ---" |  | ||||||
|  |  | ||||||
| export VLLM_XLA_CHECK_RECOMPILATION=1 |  | ||||||
| export VLLM_XLA_CACHE_PATH= |  | ||||||
|  |  | ||||||
| echo "--- Hardware Information ---" |  | ||||||
| # tpu-info |  | ||||||
| echo "--- Starting Tests ---" |  | ||||||
| set +e |  | ||||||
| overall_script_exit_code=0 |  | ||||||
|  |  | ||||||
| # --- Test Definitions --- |  | ||||||
| # If a test fails, this function will print logs and will not cause the main script to exit. |  | ||||||
| run_test() { |  | ||||||
|     local test_num=$1 |  | ||||||
|     local test_name=$2 |  | ||||||
|     local test_command=$3 |  | ||||||
|     local log_file="$RESULTS_DIR/test_${test_num}.log" |  | ||||||
|     local actual_exit_code |  | ||||||
|  |  | ||||||
|     echo "--- TEST_$test_num: Running $test_name ---" |  | ||||||
|      |  | ||||||
|     # Execute the test command. |  | ||||||
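|     # Note: process substitution (rather than a pipe) tees stdout/stderr into the per-test |  | ||||||
|     # log while still streaming to the console, so $? below is the test command's own exit code. |  | ||||||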
|     eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) |  | ||||||
|     actual_exit_code=$? |  | ||||||
|  |  | ||||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log |  | ||||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log |  | ||||||
|  |  | ||||||
|     if [ "$actual_exit_code" -ne 0 ]; then |  | ||||||
|         echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 |  | ||||||
|         echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 |  | ||||||
|         if [ -f "$log_file" ]; then |  | ||||||
|             cat "$log_file" >&2 |  | ||||||
|         else |  | ||||||
|             echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 |  | ||||||
|         fi |  | ||||||
|         echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 |  | ||||||
|         return "$actual_exit_code" # Return the failure code |  | ||||||
|     else |  | ||||||
|         echo "TEST_$test_num ($test_name) PASSED." |  | ||||||
|         return 0 # Return success |  | ||||||
|     fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # Helper function to call run_test and update the overall script exit code |  | ||||||
| run_and_track_test() { |  | ||||||
|     local test_num_arg="$1" |  | ||||||
|     local test_name_arg="$2" |  | ||||||
|     local test_command_arg="$3" |  | ||||||
|  |  | ||||||
|     # Run the test |  | ||||||
|     run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" |  | ||||||
|     local test_specific_exit_code=$? |  | ||||||
|  |  | ||||||
|     # If the test failed, set the overall script exit code to 1 |  | ||||||
|     if [ "$test_specific_exit_code" -ne 0 ]; then |  | ||||||
|         # No need for extra echo here, run_test already logged the failure. |  | ||||||
|         overall_script_exit_code=1 |  | ||||||
|     fi |  | ||||||
| } |  | ||||||
|  |  | ||||||
| # --- Actual Test Execution --- |  | ||||||
| run_and_track_test 0 "test_perf.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py" |  | ||||||
| run_and_track_test 1 "test_compilation.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py" |  | ||||||
| run_and_track_test 2 "test_basic.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" |  | ||||||
| run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" |  | ||||||
| run_and_track_test 4 "test_quantization_accuracy.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" |  | ||||||
| run_and_track_test 5 "examples/offline_inference/tpu.py" \ |  | ||||||
|     "python3 /workspace/vllm/examples/offline_inference/tpu.py" |  | ||||||
| run_and_track_test 6 "test_tpu_model_runner.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" |  | ||||||
| run_and_track_test 7 "test_sampler.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" |  | ||||||
| run_and_track_test 8 "test_topk_topp_sampler.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py" |  | ||||||
| run_and_track_test 9 "test_multimodal.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" |  | ||||||
| run_and_track_test 10 "test_pallas.py" \ |  | ||||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" |  | ||||||
|  |  | ||||||
| # After all tests have been attempted, exit with the overall status. |  | ||||||
| if [ "$overall_script_exit_code" -ne 0 ]; then |  | ||||||
|     echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" |  | ||||||
| else |  | ||||||
|     echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" |  | ||||||
| fi |  | ||||||
| exit "$overall_script_exit_code" |  | ||||||
| ' # IMPORTANT: This is the closing single quote for the bash -c '...' command above. Ensure it is present and correct. |  | ||||||
|  |  | ||||||
| # Capture the exit code of the docker run command |  | ||||||
| DOCKER_RUN_EXIT_CODE=$? |  | ||||||
|  |  | ||||||
| # The trap will run for cleanup. |  | ||||||
| # Exit the main script with the Docker run command's exit code. |  | ||||||
| if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then |  | ||||||
|     echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." |  | ||||||
|     exit "$DOCKER_RUN_EXIT_CODE" |  | ||||||
| else |  | ||||||
|     echo "Docker run command completed successfully." |  | ||||||
|     exit 0 |  | ||||||
| fi |  | ||||||
| # TODO: This test fails because it uses RANDOM_SEED sampling |  | ||||||
| # pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ |  | ||||||
| @ -1,51 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # This script builds the XPU docker image and runs offline inference inside the container. |  | ||||||
| # It serves as a sanity check for compilation and basic model usage. |  | ||||||
| set -ex |  | ||||||
|  |  | ||||||
| image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" |  | ||||||
| container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" |  | ||||||
|  |  | ||||||
| # Try building the docker image |  | ||||||
| docker build -t ${image_name} -f docker/Dockerfile.xpu . |  | ||||||
|  |  | ||||||
| # Setup cleanup |  | ||||||
| remove_docker_container() { |  | ||||||
|   docker rm -f "${container_name}" || true; |  | ||||||
|   docker image rm -f "${image_name}" || true; |  | ||||||
|   docker system prune -f || true; |  | ||||||
| } |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
|  |  | ||||||
| # Run the image and test offline inference/tensor parallel |  | ||||||
| docker run \ |  | ||||||
|     --device /dev/dri:/dev/dri \ |  | ||||||
|     --net=host \ |  | ||||||
|     --ipc=host \ |  | ||||||
|     --privileged \ |  | ||||||
|     -v /dev/dri/by-path:/dev/dri/by-path \ |  | ||||||
|     --entrypoint="" \ |  | ||||||
|     -e "HF_TOKEN=${HF_TOKEN}" \ |  | ||||||
|     -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \ |  | ||||||
|     --name "${container_name}" \ |  | ||||||
|     "${image_name}" \ |  | ||||||
|     bash -c ' |  | ||||||
|     set -e |  | ||||||
|     echo $ZE_AFFINITY_MASK |  | ||||||
|     pip install tblib==3.1.0 |  | ||||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager |  | ||||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE |  | ||||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray |  | ||||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp |  | ||||||
|     VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager |  | ||||||
|     cd tests |  | ||||||
|     pytest -v -s v1/core |  | ||||||
|     pytest -v -s v1/engine |  | ||||||
|     pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py |  | ||||||
|     pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py |  | ||||||
|     pytest -v -s v1/structured_output |  | ||||||
|     pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py |  | ||||||
|     pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py |  | ||||||
|     pytest -v -s v1/test_serial_utils.py |  | ||||||
| ' |  | ||||||
| @ -1,18 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| # Usage: ./rerun_test.sh path/to/test.py::test_name |  | ||||||
|  |  | ||||||
| # Check if argument is given |  | ||||||
| if [ $# -lt 1 ]; then |  | ||||||
|     echo "Usage: $0 path/to/test.py::test_name" |  | ||||||
|     echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]" |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| TEST=$1 |  | ||||||
| COUNT=1 |  | ||||||
|  |  | ||||||
| while pytest -sv "$TEST"; do |  | ||||||
|     COUNT=$((COUNT + 1)) |  | ||||||
|     echo "RUN NUMBER ${COUNT}" |  | ||||||
| done |  | ||||||
| @ -1,108 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| set -euox pipefail |  | ||||||
|  |  | ||||||
| if [[ $# -lt 4 ]]; then |  | ||||||
|     echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| WORKING_DIR=$1 |  | ||||||
| NUM_NODES=$2 |  | ||||||
| NUM_GPUS=$3 |  | ||||||
| DOCKER_IMAGE=$4 |  | ||||||
|  |  | ||||||
| shift 4 |  | ||||||
| COMMANDS=("$@") |  | ||||||
| if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then |  | ||||||
|     echo "The number of commands must be equal to the number of nodes." |  | ||||||
|     echo "Number of nodes: $NUM_NODES" |  | ||||||
|     echo "Number of commands: ${#COMMANDS[@]}" |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| echo "List of commands" |  | ||||||
| for command in "${COMMANDS[@]}"; do |  | ||||||
|     echo "$command" |  | ||||||
| done |  | ||||||
|  |  | ||||||
| start_network() { |  | ||||||
|     docker network create --subnet=192.168.10.0/24 docker-net |  | ||||||
| } |  | ||||||
|  |  | ||||||
| start_nodes() { |  | ||||||
|     for node in $(seq 0 $(($NUM_NODES-1))); do |  | ||||||
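|         # Build the per-node GPU list, e.g. node 1 with NUM_GPUS=2 yields "device=2,3". |  | ||||||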
|         GPU_DEVICES='"device=' |  | ||||||
|         for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do |  | ||||||
|             DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) |  | ||||||
|             GPU_DEVICES+=$(($DEVICE_NUM)) |  | ||||||
|             if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then |  | ||||||
|                 GPU_DEVICES+=',' |  | ||||||
|             fi |  | ||||||
|         done |  | ||||||
|         GPU_DEVICES+='"' |  | ||||||
|  |  | ||||||
|         # start the container in detached mode |  | ||||||
|         # things to note: |  | ||||||
|         # 1. --shm-size=10.24gb is required. don't use --ipc=host |  | ||||||
|         # 2. pass HF_TOKEN to the container |  | ||||||
|         # 3. map the huggingface cache directory to the container |  | ||||||
|         # 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: |  | ||||||
|         #    starting from 192.168.10.11) |  | ||||||
|         docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ |  | ||||||
|             -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ |  | ||||||
|             --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ |  | ||||||
|             /bin/bash -c "tail -f /dev/null" |  | ||||||
|  |  | ||||||
|         # organize containers into a ray cluster |  | ||||||
|         if [ "$node" -eq 0 ]; then |  | ||||||
|             # start the ray head node |  | ||||||
|             docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block" |  | ||||||
|             # wait for the head node to be ready |  | ||||||
|             sleep 10 |  | ||||||
|         else |  | ||||||
|             # start the ray worker nodes, and connect them to the head node |  | ||||||
|             docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block" |  | ||||||
|         fi |  | ||||||
|     done |  | ||||||
|  |  | ||||||
|     # wait for the cluster to be ready |  | ||||||
|     sleep 10 |  | ||||||
|  |  | ||||||
|     # print the cluster status |  | ||||||
|     docker exec node0 /bin/bash -c "ray status" |  | ||||||
| } |  | ||||||
|  |  | ||||||
| run_nodes() { |  | ||||||
|     # important: iterate in reverse order to start the head node last |  | ||||||
|     # we start the worker nodes first, in detached mode, and then start the head node |  | ||||||
|     # in the foreground, so that the output of the head node is visible in the buildkite logs |  | ||||||
|     for node in $(seq $(($NUM_NODES - 1)) -1 0); do |  | ||||||
|         GPU_DEVICES='"device=' |  | ||||||
|         for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do |  | ||||||
|             DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) |  | ||||||
|             GPU_DEVICES+=$(($DEVICE_NUM)) |  | ||||||
|             if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then |  | ||||||
|                 GPU_DEVICES+=',' |  | ||||||
|             fi |  | ||||||
|         done |  | ||||||
|         GPU_DEVICES+='"' |  | ||||||
|         echo "Running node$node with GPU devices: $GPU_DEVICES" |  | ||||||
|         if [ "$node" -ne 0 ]; then |  | ||||||
|             docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" |  | ||||||
|         else |  | ||||||
|             docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" |  | ||||||
|         fi |  | ||||||
|     done |  | ||||||
| } |  | ||||||
| cleanup() { |  | ||||||
|     for node in $(seq 0 $(($NUM_NODES-1))); do |  | ||||||
|         docker stop "node$node" |  | ||||||
|     done |  | ||||||
|     docker network rm docker-net |  | ||||||
| } |  | ||||||
| trap cleanup EXIT |  | ||||||
| start_network |  | ||||||
| start_nodes |  | ||||||
| run_nodes |  | ||||||
|  |  | ||||||
| @ -1,59 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
| # SPDX-License-Identifier: Apache-2.0 |  | ||||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project |  | ||||||
|  |  | ||||||
| # Setup script for Prime-RL integration tests |  | ||||||
| # This script prepares the environment for running Prime-RL tests with nightly vLLM |  | ||||||
|  |  | ||||||
| set -euo pipefail |  | ||||||
|  |  | ||||||
| SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" |  | ||||||
| REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" |  | ||||||
| PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git" |  | ||||||
| PRIME_RL_DIR="${REPO_ROOT}/prime-rl" |  | ||||||
|  |  | ||||||
| echo "Setting up Prime-RL integration test environment..." |  | ||||||
|  |  | ||||||
| # Clean up any existing Prime-RL directory |  | ||||||
| if [ -d "${PRIME_RL_DIR}" ]; then |  | ||||||
|     echo "Removing existing Prime-RL directory..." |  | ||||||
|     rm -rf "${PRIME_RL_DIR}" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Install UV if not available |  | ||||||
| if ! command -v uv &> /dev/null; then |  | ||||||
|     echo "Installing UV package manager..." |  | ||||||
|     curl -LsSf https://astral.sh/uv/install.sh | sh |  | ||||||
|     source $HOME/.local/bin/env |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Clone Prime-RL repository at specific branch for reproducible tests |  | ||||||
| PRIME_RL_BRANCH="integ-vllm-main" |  | ||||||
| echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..." |  | ||||||
| git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}" |  | ||||||
| cd "${PRIME_RL_DIR}" |  | ||||||
|  |  | ||||||
| echo "Setting up UV project environment..." |  | ||||||
| export UV_PROJECT_ENVIRONMENT=/usr/local |  | ||||||
| ln -s /usr/bin/python3 /usr/local/bin/python |  | ||||||
|  |  | ||||||
| # Remove vllm pin from pyproject.toml |  | ||||||
| echo "Removing vllm pin from pyproject.toml..." |  | ||||||
| sed -i '/vllm==/d' pyproject.toml |  | ||||||
|  |  | ||||||
| # Sync Prime-RL dependencies |  | ||||||
| echo "Installing Prime-RL dependencies..." |  | ||||||
| uv sync --inexact && uv sync --inexact --all-extras |  | ||||||
|  |  | ||||||
| # Verify installation |  | ||||||
| echo "Verifying installations..." |  | ||||||
| uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" |  | ||||||
| uv run python -c "import prime_rl; print('Prime-RL imported successfully')" |  | ||||||
|  |  | ||||||
| echo "Prime-RL integration test environment setup complete!" |  | ||||||
|  |  | ||||||
| echo "Running Prime-RL integration tests..." |  | ||||||
| export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY |  | ||||||
| uv run pytest -vs tests/integration/test_rl.py -m gpu |  | ||||||
|  |  | ||||||
| echo "Prime-RL integration tests completed!" |  | ||||||
| @ -1,62 +0,0 @@ | |||||||
| #!/usr/bin/env bash |  | ||||||
| set -euxo pipefail |  | ||||||
|  |  | ||||||
| # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] |  | ||||||
| THRESHOLD=${1:-0.25} |  | ||||||
| NUM_Q=${2:-1319} |  | ||||||
| PORT=${3:-8010} |  | ||||||
| OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} |  | ||||||
| mkdir -p "${OUT_DIR}" |  | ||||||
|  |  | ||||||
| wait_for_server() { |  | ||||||
|   local port=$1 |  | ||||||
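|   # The port is spliced into the single-quoted timeout script below; the loop polls |  | ||||||
|   # the /health endpoint once per second for up to 600 seconds. |  | ||||||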
|   timeout 600 bash -c ' |  | ||||||
|     until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do |  | ||||||
|       sleep 1 |  | ||||||
|     done' |  | ||||||
| } |  | ||||||
|  |  | ||||||
| MODEL="deepseek-ai/DeepSeek-V2-lite" |  | ||||||
| BACKENDS=("deepep_high_throughput" "deepep_low_latency") |  | ||||||
|  |  | ||||||
| cleanup() { |  | ||||||
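|   # Stop the server gracefully: send SIGTERM, wait up to 10s (20 x 0.5s), then SIGKILL. |  | ||||||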
|   if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then |  | ||||||
|     kill "${SERVER_PID}" 2>/dev/null || true |  | ||||||
|     for _ in {1..20}; do |  | ||||||
|       kill -0 "${SERVER_PID}" 2>/dev/null || break |  | ||||||
|       sleep 0.5 |  | ||||||
|     done |  | ||||||
|     kill -9 "${SERVER_PID}" 2>/dev/null || true |  | ||||||
|   fi |  | ||||||
| } |  | ||||||
| trap cleanup EXIT |  | ||||||
|  |  | ||||||
| for BACK in "${BACKENDS[@]}"; do |  | ||||||
|   VLLM_DEEP_GEMM_WARMUP=skip \ |  | ||||||
|   VLLM_ALL2ALL_BACKEND=$BACK \ |  | ||||||
|   vllm serve "$MODEL" \ |  | ||||||
|     --enforce-eager \ |  | ||||||
|     --tensor-parallel-size 2 \ |  | ||||||
|     --data-parallel-size 2 \ |  | ||||||
|     --enable-expert-parallel \ |  | ||||||
|     --enable-eplb \ |  | ||||||
|     --trust-remote-code \ |  | ||||||
|     --max-model-len 2048 \ |  | ||||||
|     --port $PORT & |  | ||||||
|   SERVER_PID=$! |  | ||||||
|   wait_for_server $PORT |  | ||||||
|  |  | ||||||
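|   # Build a filesystem-safe tag from the model name (slashes, colons and spaces become underscores). |  | ||||||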
|   TAG=$(echo "$MODEL" | tr '/: \\n' '_____') |  | ||||||
|   OUT="${OUT_DIR}/${TAG}_${BACK}.json" |  | ||||||
|   python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} |  | ||||||
|   python3 - <<PY |  | ||||||
| import json; acc=json.load(open('${OUT}'))['accuracy'] |  | ||||||
| print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") |  | ||||||
| assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" |  | ||||||
| PY |  | ||||||
|  |  | ||||||
|   cleanup |  | ||||||
|   SERVER_PID= |  | ||||||
|   sleep 1 |  | ||||||
|   PORT=$((PORT+1)) |  | ||||||
| done |  | ||||||
| @ -1,61 +0,0 @@ | |||||||
| #!/usr/bin/env bash |  | ||||||
| set -euxo pipefail |  | ||||||
|  |  | ||||||
| # args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] |  | ||||||
| THRESHOLD=${1:-0.8} |  | ||||||
| NUM_Q=${2:-1319} |  | ||||||
| PORT=${3:-8020} |  | ||||||
| OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} |  | ||||||
| mkdir -p "${OUT_DIR}" |  | ||||||
|  |  | ||||||
| wait_for_server() { |  | ||||||
|   local port=$1 |  | ||||||
|   timeout 600 bash -c ' |  | ||||||
|     until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do |  | ||||||
|       sleep 1 |  | ||||||
|     done' |  | ||||||
| } |  | ||||||
|  |  | ||||||
| MODEL="QWen/Qwen3-30B-A3B-FP8" |  | ||||||
| BACKENDS=("deepep_high_throughput" "deepep_low_latency") |  | ||||||
|  |  | ||||||
| cleanup() { |  | ||||||
|   if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then |  | ||||||
|     kill "${SERVER_PID}" 2>/dev/null || true |  | ||||||
|     for _ in {1..20}; do |  | ||||||
|       kill -0 "${SERVER_PID}" 2>/dev/null || break |  | ||||||
|       sleep 0.5 |  | ||||||
|     done |  | ||||||
|     kill -9 "${SERVER_PID}" 2>/dev/null || true |  | ||||||
|   fi |  | ||||||
| } |  | ||||||
| trap cleanup EXIT |  | ||||||
|  |  | ||||||
| for BACK in "${BACKENDS[@]}"; do |  | ||||||
|   VLLM_DEEP_GEMM_WARMUP=skip \ |  | ||||||
|   VLLM_ALL2ALL_BACKEND=$BACK \ |  | ||||||
|   vllm serve "$MODEL" \ |  | ||||||
|     --enforce-eager \ |  | ||||||
|     --tensor-parallel-size 2 \ |  | ||||||
|     --data-parallel-size 2 \ |  | ||||||
|     --enable-expert-parallel \ |  | ||||||
|     --trust-remote-code \ |  | ||||||
|     --max-model-len 2048 \ |  | ||||||
|     --port $PORT & |  | ||||||
|   SERVER_PID=$! |  | ||||||
|   wait_for_server $PORT |  | ||||||
|  |  | ||||||
|   TAG=$(echo "$MODEL" | tr '/: \\n' '_____') |  | ||||||
|   OUT="${OUT_DIR}/${TAG}_${BACK}.json" |  | ||||||
|   python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} |  | ||||||
|   python3 - <<PY |  | ||||||
| import json; acc=json.load(open('${OUT}'))['accuracy'] |  | ||||||
| print(f"${MODEL} ${BACK}: accuracy {acc:.3f}") |  | ||||||
| assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" |  | ||||||
| PY |  | ||||||
|  |  | ||||||
|   cleanup |  | ||||||
|   SERVER_PID= |  | ||||||
|   sleep 1 |  | ||||||
|   PORT=$((PORT+1)) |  | ||||||
| done |  | ||||||
| @ -1,24 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| set -euo pipefail |  | ||||||
|  |  | ||||||
| docker_root=$(docker info -f '{{.DockerRootDir}}') |  | ||||||
| if [ -z "$docker_root" ]; then |  | ||||||
|   echo "Failed to determine Docker root directory." |  | ||||||
|   exit 1 |  | ||||||
| fi |  | ||||||
| echo "Docker root directory: $docker_root" |  | ||||||
| # Check disk usage of the filesystem where Docker's root directory is located |  | ||||||
| disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') |  | ||||||
| # Define the threshold |  | ||||||
| threshold=70 |  | ||||||
| if [ "$disk_usage" -gt "$threshold" ]; then |  | ||||||
|   echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." |  | ||||||
|   # Remove dangling images (those that are not tagged and not used by any container) |  | ||||||
|   docker image prune -f |  | ||||||
|   # Remove unused volumes / force the system prune for old images as well. |  | ||||||
|   docker volume prune -f && docker system prune --force --filter "until=24h" --all |  | ||||||
|   echo "Docker images and volumes cleanup completed." |  | ||||||
| else |  | ||||||
|   echo "Disk usage is below $threshold%. No cleanup needed." |  | ||||||
| fi |  | ||||||
| @ -1,14 +0,0 @@ | |||||||
| # Environment config |  | ||||||
| TEST_NAME=llama8b |  | ||||||
| CONTAINER_NAME=tpu-test |  | ||||||
|  |  | ||||||
| # vllm config |  | ||||||
| MODEL=meta-llama/Llama-3.1-8B-Instruct |  | ||||||
| MAX_NUM_SEQS=256 |  | ||||||
| MAX_NUM_BATCHED_TOKENS=1024 |  | ||||||
| TENSOR_PARALLEL_SIZE=1 |  | ||||||
| MAX_MODEL_LEN=2048 |  | ||||||
| DOWNLOAD_DIR=/mnt/disks/persist |  | ||||||
| EXPECTED_THROUGHPUT=8.0 |  | ||||||
| INPUT_LEN=1800 |  | ||||||
| OUTPUT_LEN=128 |  | ||||||
| @ -1,90 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| if [ ! -f "$1" ]; then |  | ||||||
|   echo "Error: The env file '$1' does not exist." |  | ||||||
|   exit 1  # Exit the script with a non-zero status to indicate an error |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| ENV_FILE=$1 |  | ||||||
|  |  | ||||||
| # For testing on local vm, use `set -a` to export all variables |  | ||||||
| source /etc/environment |  | ||||||
| source $ENV_FILE |  | ||||||
|  |  | ||||||
| remove_docker_container() {  |  | ||||||
|     docker rm -f $CONTAINER_NAME || true; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| trap remove_docker_container EXIT |  | ||||||
|  |  | ||||||
| # Remove the container that might not be cleaned up in the previous run. |  | ||||||
| remove_docker_container |  | ||||||
|  |  | ||||||
| LOG_ROOT=$(mktemp -d) |  | ||||||
| # If mktemp fails, set -e will cause the script to exit. |  | ||||||
| echo "Results will be stored in: $LOG_ROOT" |  | ||||||
|  |  | ||||||
| if [ -z "$HF_TOKEN" ]; then |  | ||||||
|   echo "Error: HF_TOKEN is not set or is empty."   |  | ||||||
|   exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Make sure mounted disk or dir exists |  | ||||||
| if [ ! -d "$DOWNLOAD_DIR" ]; then |  | ||||||
|     echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder." |  | ||||||
|     exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| echo "Run model $MODEL" |  | ||||||
| echo |  | ||||||
|  |  | ||||||
| echo "starting docker...$CONTAINER_NAME" |  | ||||||
| echo     |  | ||||||
| docker run \ |  | ||||||
|  -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \ |  | ||||||
|  --env-file $ENV_FILE \ |  | ||||||
|  -e HF_TOKEN="$HF_TOKEN" \ |  | ||||||
|  -e TARGET_COMMIT=$BUILDKITE_COMMIT \ |  | ||||||
|  -e MODEL=$MODEL \ |  | ||||||
|  -e WORKSPACE=/workspace \ |  | ||||||
|  --name $CONTAINER_NAME \ |  | ||||||
|  -d \ |  | ||||||
|  --privileged \ |  | ||||||
|  --network host \ |  | ||||||
|  -v /dev/shm:/dev/shm \ |  | ||||||
|  vllm/vllm-tpu-bm tail -f /dev/null |  | ||||||
|  |  | ||||||
| echo "run script..." |  | ||||||
| echo |  | ||||||
| docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh" |  | ||||||
|  |  | ||||||
| echo "copy result back..." |  | ||||||
| VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt |  | ||||||
| BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt |  | ||||||
| docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"  |  | ||||||
| docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG" |  | ||||||
|  |  | ||||||
| throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') |  | ||||||
| echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput" |  | ||||||
|  |  | ||||||
| if [ "$BUILDKITE" = "true" ]; then |  | ||||||
|   echo "Running inside Buildkite" |  | ||||||
|   buildkite-agent artifact upload "$VLLM_LOG"  |  | ||||||
|   buildkite-agent artifact upload "$BM_LOG" |  | ||||||
| else |  | ||||||
|   echo "Not running inside Buildkite" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # |  | ||||||
| # compare the throughput with EXPECTED_THROUGHPUT |  | ||||||
| # and assert that it meets the expectation |  | ||||||
| # |  | ||||||
| if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then |  | ||||||
|   echo "Failed to get the throughput" |  | ||||||
|   exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then |  | ||||||
|   echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)" |  | ||||||
|   exit 1 |  | ||||||
| fi |  | ||||||
| @ -1,14 +0,0 @@ | |||||||
| # Environment config |  | ||||||
| TEST_NAME=llama8bw8a8 |  | ||||||
| CONTAINER_NAME=tpu-test |  | ||||||
|  |  | ||||||
| # vllm config |  | ||||||
| MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 |  | ||||||
| MAX_NUM_SEQS=128 |  | ||||||
| MAX_NUM_BATCHED_TOKENS=1024 |  | ||||||
| TENSOR_PARALLEL_SIZE=1 |  | ||||||
| MAX_MODEL_LEN=2048 |  | ||||||
| DOWNLOAD_DIR=/mnt/disks/persist |  | ||||||
| EXPECTED_THROUGHPUT=8.7 |  | ||||||
| INPUT_LEN=1800 |  | ||||||
| OUTPUT_LEN=128 |  | ||||||
| @ -1,93 +0,0 @@ | |||||||
| #!/bin/bash |  | ||||||
|  |  | ||||||
| set -euo pipefail |  | ||||||
|  |  | ||||||
| VLLM_LOG="$WORKSPACE/vllm_log.txt" |  | ||||||
| BM_LOG="$WORKSPACE/bm_log.txt" |  | ||||||
|  |  | ||||||
| if [ -n "$TARGET_COMMIT" ]; then |  | ||||||
|   head_hash=$(git rev-parse HEAD) |  | ||||||
|   if [ "$TARGET_COMMIT" != "$head_hash" ]; then |  | ||||||
|     echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash" |  | ||||||
|     exit 1 |  | ||||||
|   fi |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| echo "model: $MODEL" |  | ||||||
| echo |  | ||||||
|  |  | ||||||
| # |  | ||||||
| # create a log folder |  | ||||||
| # |  | ||||||
| mkdir "$WORKSPACE/log" |  | ||||||
|  |  | ||||||
| # TODO: Move to image building. |  | ||||||
| pip install pandas |  | ||||||
| pip install datasets |  | ||||||
|  |  | ||||||
| # |  | ||||||
| # create sonnet_4x |  | ||||||
| # |  | ||||||
| echo "Create sonnet_4x.txt" |  | ||||||
| echo "" > benchmarks/sonnet_4x.txt |  | ||||||
| for _ in {1..4} |  | ||||||
|  do |  | ||||||
|   cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt |  | ||||||
| done |  | ||||||
|  |  | ||||||
| # |  | ||||||
| # start vllm service in backend |  | ||||||
| # |  | ||||||
| echo "lanching vllm..." |  | ||||||
| echo "logging to $VLLM_LOG" |  | ||||||
| echo |  | ||||||
|  |  | ||||||
| vllm serve $MODEL \ |  | ||||||
|  --seed 42 \ |  | ||||||
|  --max-num-seqs $MAX_NUM_SEQS \ |  | ||||||
|  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ |  | ||||||
|  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ |  | ||||||
|  --no-enable-prefix-caching \ |  | ||||||
|  --download_dir $DOWNLOAD_DIR \ |  | ||||||
|  --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 & |  | ||||||
|  |  | ||||||
|  |  | ||||||
| echo "wait for 20 minutes.." |  | ||||||
| echo |  | ||||||
| # sleep 1200 |  | ||||||
| # wait for 10 minutes... |  | ||||||
| for i in {1..120}; do |  | ||||||
|     # TODO: detect other type of errors. |  | ||||||
|     if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then |  | ||||||
|         echo "Detected RuntimeError, exiting." |  | ||||||
|         exit 1 |  | ||||||
|     elif grep -Fq "Application startup complete" "$VLLM_LOG"; then |  | ||||||
|         echo "Application started" |  | ||||||
|         break |  | ||||||
|     else |  | ||||||
|         echo "wait for 10 seconds..." |  | ||||||
|         sleep 10 |  | ||||||
|     fi |  | ||||||
| done |  | ||||||
|  |  | ||||||
| # |  | ||||||
| # run test |  | ||||||
| # |  | ||||||
| echo "run benchmark test..." |  | ||||||
| echo "logging to $BM_LOG" |  | ||||||
| echo |  | ||||||
| vllm bench serve \ |  | ||||||
|     --backend vllm \ |  | ||||||
|     --model $MODEL  \ |  | ||||||
|     --dataset-name sonnet \ |  | ||||||
|     --dataset-path benchmarks/sonnet_4x.txt \ |  | ||||||
|     --sonnet-input-len $INPUT_LEN \ |  | ||||||
|     --sonnet-output-len $OUTPUT_LEN \ |  | ||||||
|     --ignore-eos > "$BM_LOG" |  | ||||||
|  |  | ||||||
| echo "completed..." |  | ||||||
| echo |  | ||||||
|  |  | ||||||
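| # Extract the numeric throughput from the benchmark log (strip everything except digits and dots). |  | ||||||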
| throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') |  | ||||||
| echo "throughput: $throughput" |  | ||||||
| echo |  | ||||||
| @ -1,83 +0,0 @@ | |||||||
| #!/usr/bin/env bash |  | ||||||
|  |  | ||||||
| set -ex |  | ||||||
|  |  | ||||||
| # Assume wheels are in artifacts/dist/*.whl |  | ||||||
| wheel_files=(artifacts/dist/*.whl) |  | ||||||
|  |  | ||||||
| # Check that exactly one wheel is found |  | ||||||
| if [[ ${#wheel_files[@]} -ne 1 ]]; then |  | ||||||
|   echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" |  | ||||||
|   exit 1 |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Get the single wheel file |  | ||||||
| wheel="${wheel_files[0]}" |  | ||||||
|  |  | ||||||
| # Detect architecture and rename 'linux' to appropriate manylinux version |  | ||||||
| arch=$(uname -m) |  | ||||||
| if [[ $arch == "x86_64" ]]; then |  | ||||||
|     manylinux_version="manylinux1" |  | ||||||
| elif [[ $arch == "aarch64" ]]; then |  | ||||||
|     manylinux_version="manylinux2014" |  | ||||||
| else |  | ||||||
|     echo "Warning: Unknown architecture $arch, using manylinux1 as default" |  | ||||||
|     manylinux_version="manylinux1" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Rename 'linux' to the appropriate manylinux version in the wheel filename |  | ||||||
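| # e.g. a hypothetical vllm-1.0.0.dev-cp38-abi3-linux_x86_64.whl becomes vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl |  | ||||||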
| new_wheel="${wheel/linux/$manylinux_version}" |  | ||||||
| mv -- "$wheel" "$new_wheel" |  | ||||||
| wheel="$new_wheel" |  | ||||||
|  |  | ||||||
| # Extract the version from the wheel |  | ||||||
| version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) |  | ||||||
| echo "Version: $version" |  | ||||||
|  |  | ||||||
| normal_wheel="$wheel" # Save the original wheel filename |  | ||||||
|  |  | ||||||
| # If the version contains "dev", rename it to v1.0.0.dev for consistency |  | ||||||
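| # e.g. a dev version whose last dotted component is "cu129" becomes "1.0.0.dev+cu129"; other dev versions become "1.0.0.dev" |  | ||||||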
| if [[ $version == *dev* ]]; then |  | ||||||
|     suffix="${version##*.}" |  | ||||||
|     if [[ $suffix == cu* ]]; then |  | ||||||
|         new_version="1.0.0.dev+${suffix}" |  | ||||||
|     else |  | ||||||
|         new_version="1.0.0.dev" |  | ||||||
|     fi |  | ||||||
|     new_wheel="${wheel/$version/$new_version}" |  | ||||||
|     # use cp to keep both files in the artifacts directory |  | ||||||
|     cp -- "$wheel" "$new_wheel" |  | ||||||
|     wheel="$new_wheel" |  | ||||||
|     version="$new_version" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # Generate the index for the wheel |  | ||||||
| python3 .buildkite/generate_index.py --wheel "$normal_wheel" |  | ||||||
|  |  | ||||||
| # Upload the wheels for this commit |  | ||||||
| aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" |  | ||||||
| aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" |  | ||||||
|  |  | ||||||
| if [[ $normal_wheel == *"cu129"* ]]; then |  | ||||||
|     # only upload index.html for cu129 wheels (default wheels) as it |  | ||||||
|     # is available on both x86 and arm64 |  | ||||||
|     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" |  | ||||||
|     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" |  | ||||||
| else |  | ||||||
|     echo "Skipping index files for non-cu129 wheels" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| # generate index for nightly |  | ||||||
| aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" |  | ||||||
| aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" |  | ||||||
|  |  | ||||||
| if [[ $normal_wheel == *"cu129"* ]]; then |  | ||||||
|     # only upload index.html for cu129 wheels (default wheels) as it |  | ||||||
|     # is available on both x86 and arm64 |  | ||||||
|     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" |  | ||||||
| else |  | ||||||
|     echo "Skipping index files for non-cu129 wheels" |  | ||||||
| fi |  | ||||||
|  |  | ||||||
| aws s3 cp "$wheel" "s3://vllm-wheels/$version/" |  | ||||||
| aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" |  | ||||||
										
											
(File diff suppressed because it is too large.)
(File diff suppressed because it is too large.)
93  .buildkite/test-template.j2  (new file)
							| @ -0,0 +1,93 @@ | |||||||
|  | {% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %} | ||||||
|  | {% set default_num_gpu = 1 %} | ||||||
|  | {% set default_working_dir = "/vllm-workspace/tests" %} | ||||||
|  |  | ||||||
|  | steps: | ||||||
|  |   - label: ":docker: build image" | ||||||
|  |     commands:  | ||||||
|  |       - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." | ||||||
|  |       - "docker push {{ docker_image }}" | ||||||
|  |     env: | ||||||
|  |       DOCKER_BUILDKIT: "1" | ||||||
|  |     retry: | ||||||
|  |       automatic: | ||||||
|  |         - exit_status: -1  # Agent was lost | ||||||
|  |           limit: 5 | ||||||
|  |         - exit_status: -10  # Agent was lost | ||||||
|  |           limit: 5 | ||||||
|  |   - wait | ||||||
|  |  | ||||||
|  |   - group: "AMD Tests" | ||||||
|  |     depends_on: ~ | ||||||
|  |     steps: | ||||||
|  |     {% for step in steps %} | ||||||
|  |     {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} | ||||||
|  |       - label: "AMD: {{ step.label }}" | ||||||
|  |         agents: | ||||||
|  |           queue: amd | ||||||
|  |         command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}" | ||||||
|  |         env: | ||||||
|  |           DOCKER_BUILDKIT: "1" | ||||||
|  |     {% endif %} | ||||||
|  |     {% endfor %} | ||||||
|  |  | ||||||
|  |   - label: "Neuron Test" | ||||||
|  |     depends_on: ~ | ||||||
|  |     agents: | ||||||
|  |       queue: neuron | ||||||
|  |     command: bash .buildkite/run-neuron-test.sh | ||||||
|  |     soft_fail: true | ||||||
|  |  | ||||||
|  |   - label: "Intel Test" | ||||||
|  |     depends_on: ~ | ||||||
|  |     command: bash .buildkite/run-cpu-test.sh | ||||||
|  |  | ||||||
|  |   {% for step in steps %} | ||||||
|  |   - label: "{{ step.label }}" | ||||||
|  |     agents: | ||||||
|  |       queue: kubernetes | ||||||
|  |     soft_fail: {{ step.soft_fail or false }} | ||||||
|  |     {% if step.parallelism %} | ||||||
|  |     parallelism: {{ step.parallelism }} | ||||||
|  |     {% endif %} | ||||||
|  |     retry: | ||||||
|  |       automatic: | ||||||
|  |         - exit_status: -1  # Agent was lost | ||||||
|  |           limit: 5 | ||||||
|  |         - exit_status: -10  # Agent was lost | ||||||
|  |           limit: 5 | ||||||
|  |     plugins: | ||||||
|  |       - kubernetes: | ||||||
|  |           podSpec: | ||||||
|  |             {% if step.num_gpus %} | ||||||
|  |             priorityClassName: gpu-priority-cls-{{ step.num_gpus }} | ||||||
|  |             {% endif %} | ||||||
|  |             volumes: | ||||||
|  |               - name: dshm | ||||||
|  |                 emptyDir: | ||||||
|  |                   medium: Memory | ||||||
|  |             containers: | ||||||
|  |               - image: "{{ docker_image }}" | ||||||
|  |                 command: ["bash"] | ||||||
|  |                 args: | ||||||
|  |                 - '-c' | ||||||
|  |                 - "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'" | ||||||
|  |                 {% if not step.no_gpu %} | ||||||
|  |                 resources: | ||||||
|  |                   requests: | ||||||
|  |                     nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" | ||||||
|  |                   limits: | ||||||
|  |                     nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" | ||||||
|  |                 {% endif %} | ||||||
|  |                 env: | ||||||
|  |                   - name: VLLM_USAGE_SOURCE | ||||||
|  |                     value: ci-test | ||||||
|  |                   - name: HF_TOKEN | ||||||
|  |                     valueFrom: | ||||||
|  |                       secretKeyRef: | ||||||
|  |                         name: hf-token-secret | ||||||
|  |                         key: token | ||||||
|  |                 volumeMounts: | ||||||
|  |                   - mountPath: /dev/shm | ||||||
|  |                     name: dshm | ||||||
|  |   {% endfor %} | ||||||
							
								
								
									
47  .coveragerc  (deleted)
							| @ -1,47 +0,0 @@ | |||||||
| [run] |  | ||||||
| # Track the installed vllm package (this is what actually gets imported during tests) |  | ||||||
| # Use wildcard pattern to match the installed location |  | ||||||
| source = |  | ||||||
|     vllm |  | ||||||
|     */dist-packages/vllm |  | ||||||
|     */site-packages/vllm |  | ||||||
| omit = |  | ||||||
|     */tests/* |  | ||||||
|     */test_* |  | ||||||
|     */__pycache__/* |  | ||||||
|     */build/* |  | ||||||
|     */dist/* |  | ||||||
|     */vllm.egg-info/* |  | ||||||
|     */third_party/* |  | ||||||
|     */examples/* |  | ||||||
|     */benchmarks/* |  | ||||||
|     */docs/* |  | ||||||
|  |  | ||||||
| [paths] |  | ||||||
| # Map all possible vllm locations to a canonical "vllm" path |  | ||||||
| # This ensures coverage.combine properly merges data from different test runs |  | ||||||
| source = |  | ||||||
|     vllm |  | ||||||
|     /vllm-workspace/src/vllm |  | ||||||
|     /vllm-workspace/vllm |  | ||||||
|     */site-packages/vllm |  | ||||||
|     */dist-packages/vllm |  | ||||||
|  |  | ||||||
| [report] |  | ||||||
| exclude_lines = |  | ||||||
|     pragma: no cover |  | ||||||
|     def __repr__ |  | ||||||
|     if self.debug: |  | ||||||
|     if settings.DEBUG |  | ||||||
|     raise AssertionError |  | ||||||
|     raise NotImplementedError |  | ||||||
|     if 0: |  | ||||||
|     if __name__ == .__main__.: |  | ||||||
|     class .*\bProtocol\): |  | ||||||
|     @(abc\.)?abstractmethod |  | ||||||
|  |  | ||||||
| [html] |  | ||||||
| directory = htmlcov |  | ||||||
|  |  | ||||||
| [xml] |  | ||||||
| output = coverage.xml |  | ||||||
| @ -1,33 +1 @@ | |||||||
| /.venv |  | ||||||
| /build |  | ||||||
| dist |  | ||||||
| vllm/*.so | vllm/*.so | ||||||
|  |  | ||||||
| # Byte-compiled / optimized / DLL files |  | ||||||
| __pycache__/ |  | ||||||
| *.py[cod] |  | ||||||
| *$py.class |  | ||||||
|  |  | ||||||
| .mypy_cache |  | ||||||
|  |  | ||||||
| # Distribution / packaging |  | ||||||
| .Python |  | ||||||
| /build/ |  | ||||||
| cmake-build-*/ |  | ||||||
| CMakeUserPresets.json |  | ||||||
| develop-eggs/ |  | ||||||
| /dist/ |  | ||||||
| downloads/ |  | ||||||
| eggs/ |  | ||||||
| .eggs/ |  | ||||||
| lib/ |  | ||||||
| lib64/ |  | ||||||
| parts/ |  | ||||||
| sdist/ |  | ||||||
| var/ |  | ||||||
| wheels/ |  | ||||||
| share/python-wheels/ |  | ||||||
| *.egg-info/ |  | ||||||
| .installed.cfg |  | ||||||
| *.egg |  | ||||||
| MANIFEST |  | ||||||
|  | |||||||
| @ -1,6 +0,0 @@ | |||||||
| # https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github |  | ||||||
| have_fun: false  # Just review the code |  | ||||||
| code_review: |  | ||||||
|   comment_severity_threshold: HIGH  # Reduce quantity of comments |  | ||||||
|   pull_request_opened: |  | ||||||
|     summary: false  # Don't summarize the PR in a separate comment |  | ||||||
| @ -1,4 +0,0 @@ | |||||||
| # Migrate from `yapf` & `isort` to `ruff` |  | ||||||
| d6953beb91da4e9c99be4c0a1304a2d24189535c |  | ||||||
| # Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y` |  | ||||||
| 8fcaaf6a165e661f63fc51be906bc05b0767332f |  | ||||||
							
								
								
									
24  .github/.bc-linter.yml  (vendored, deleted)
							| @ -1,24 +0,0 @@ | |||||||
| # doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md |  | ||||||
| version: 1 |  | ||||||
| paths: |  | ||||||
| # We temporarily disable globally, and will only enable with `annotations.include` |  | ||||||
| # include: |  | ||||||
| #   - "vllm/v1/attetion/*.py" |  | ||||||
| #   - "vllm/v1/core/*.py" |  | ||||||
| exclude: |  | ||||||
|   - "**/*.py" |  | ||||||
|  |  | ||||||
| scan: |  | ||||||
|   functions: true        # check free functions and methods |  | ||||||
|   classes: true          # check classes/dataclasses |  | ||||||
|   public_only: true      # ignore names starting with "_" at any level |  | ||||||
|  |  | ||||||
| annotations: |  | ||||||
|   include:               # decorators that force‑include a symbol |  | ||||||
|     - name: "bc_linter_include"  # matched by simple name or dotted suffix |  | ||||||
|       propagate_to_members: false # for classes, include methods/inner classes |  | ||||||
|   exclude:               # decorators that force‑exclude a symbol |  | ||||||
|     - name: "bc_linter_skip"     # matched by simple name or dotted suffix |  | ||||||
|       propagate_to_members: true  # for classes, exclude methods/inner classes |  | ||||||
|  |  | ||||||
| excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"] |  | ||||||
Some files were not shown because too many files have changed in this diff.