mirror of
				https://github.com/vllm-project/vllm.git
				synced 2025-10-25 18:17:48 +08:00 
			
		
		
		
	Compare commits
	
		
			22 Commits
		
	
	
		
			amd_dev
			...
			v1-blockta
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 7097f31955 | |||
| f840b53063 | |||
| 1ca4298b9b | |||
| ba64a0249f | |||
| 1260e43230 | |||
| a6e5d7b5b7 | |||
| ebfbe1244b | |||
| 6ba31aa5f6 | |||
| 34d6cc2aea | |||
| 27e8eb2e94 | |||
| ca4f9e69a8 | |||
| 52922193cd | |||
| bef68163a0 | |||
| ff5b1033dc | |||
| b938606993 | |||
| 3fdbd8e2f5 | |||
| 0420fb2c7b | |||
| ee965c9c69 | |||
| 0a669eed7b | |||
| 03b1e6fdbd | |||
| 8a4180c8b6 | |||
| 1aaced5830 | 
| @ -1,20 +1,14 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import os | ||||
| import sys | ||||
| import zipfile | ||||
|  | ||||
| # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB | ||||
| # Note that we have 800 MiB quota, please use it wisely. | ||||
| # See https://github.com/pypi/support/issues/6326 . | ||||
| # Please also sync the value with the one in Dockerfile. | ||||
| VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500)) | ||||
| # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB | ||||
| VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250)) | ||||
|  | ||||
|  | ||||
| def print_top_10_largest_files(zip_file): | ||||
|     """Print the top 10 largest files in the given zip file.""" | ||||
|     with zipfile.ZipFile(zip_file, "r") as z: | ||||
|     with zipfile.ZipFile(zip_file, 'r') as z: | ||||
|         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] | ||||
|         file_sizes.sort(key=lambda x: x[1], reverse=True) | ||||
|         for f, size in file_sizes[:10]: | ||||
| @ -29,18 +23,14 @@ def check_wheel_size(directory): | ||||
|                 wheel_path = os.path.join(root, file_name) | ||||
|                 wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024) | ||||
|                 if wheel_size_mb > VLLM_MAX_SIZE_MB: | ||||
|                     print( | ||||
|                         f"Not allowed: Wheel {wheel_path} is larger " | ||||
|                     print(f"Not allowed: Wheel {wheel_path} is larger " | ||||
|                           f"({wheel_size_mb:.2f} MB) than the limit " | ||||
|                         f"({VLLM_MAX_SIZE_MB} MB)." | ||||
|                     ) | ||||
|                           f"({VLLM_MAX_SIZE_MB} MB).") | ||||
|                     print_top_10_largest_files(wheel_path) | ||||
|                     return 1 | ||||
|                 else: | ||||
|                     print( | ||||
|                         f"Wheel {wheel_path} is within the allowed size " | ||||
|                         f"({wheel_size_mb:.2f} MB)." | ||||
|                     ) | ||||
|                     print(f"Wheel {wheel_path} is within the allowed size " | ||||
|                           f"({wheel_size_mb:.2f} MB).") | ||||
|     return 0 | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -1,6 +1,3 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import argparse | ||||
| import os | ||||
|  | ||||
| @ -8,8 +5,7 @@ template = """<!DOCTYPE html> | ||||
| <html> | ||||
|     <body> | ||||
|     <h1>Links for vLLM</h1/> | ||||
|         <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/> | ||||
|         <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/> | ||||
|         <a href="../{wheel_html_escaped}">{wheel}</a><br/> | ||||
|     </body> | ||||
| </html> | ||||
| """ | ||||
| @ -22,25 +18,7 @@ filename = os.path.basename(args.wheel) | ||||
|  | ||||
| with open("index.html", "w") as f: | ||||
|     print(f"Generated index.html for {args.wheel}") | ||||
|     # sync the abi tag with .buildkite/scripts/upload-wheels.sh | ||||
|     if "x86_64" in filename: | ||||
|         x86_wheel = filename | ||||
|         arm_wheel = filename.replace("x86_64", "aarch64").replace( | ||||
|             "manylinux1", "manylinux2014" | ||||
|         ) | ||||
|     elif "aarch64" in filename: | ||||
|         x86_wheel = filename.replace("aarch64", "x86_64").replace( | ||||
|             "manylinux2014", "manylinux1" | ||||
|         ) | ||||
|         arm_wheel = filename | ||||
|     else: | ||||
|         raise ValueError(f"Unsupported wheel: {filename}") | ||||
|     # cloudfront requires escaping the '+' character | ||||
|     f.write( | ||||
|         template.format( | ||||
|             x86_wheel=x86_wheel, | ||||
|             x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"), | ||||
|             arm_wheel=arm_wheel, | ||||
|             arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"), | ||||
|         ) | ||||
|     ) | ||||
|         template.format(wheel=filename, | ||||
|                         wheel_html_escaped=filename.replace("+", "%2B"))) | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2 | ||||
| model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 | ||||
| model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 | ||||
| model_name: "meta-llama/Meta-Llama-3-70B-Instruct" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 | ||||
| model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test" | ||||
| tasks: | ||||
|  | ||||
| @ -1,5 +1,4 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 | ||||
| model_name: "meta-llama/Meta-Llama-3-8B-Instruct" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 | ||||
| model_name: "HandH1998/QQQ-Llama-3-8b-g128" | ||||
| tasks: | ||||
|  | ||||
| @ -1,11 +0,0 @@ | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1 | ||||
| model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.335 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.323 | ||||
| limit: 1319 | ||||
| num_fewshot: 5 | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 | ||||
| model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" | ||||
| tasks: | ||||
|  | ||||
| @ -1,12 +0,0 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8 | ||||
| model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" | ||||
| backend: "vllm-vlm" | ||||
| tasks: | ||||
| - name: "chartqa" | ||||
|   metrics: | ||||
|   - name: "relaxed_accuracy,none" | ||||
|     # TODO(zhewenl): model card is 0.90, but the actual score is 0.80. | ||||
|     value: 0.80 | ||||
| limit: 100 | ||||
| num_fewshot: 0 | ||||
| @ -1,10 +0,0 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5 | ||||
| model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" | ||||
| tasks: | ||||
| - name: "mmlu_pro" | ||||
|   metrics: | ||||
|   - name: "exact_match,custom-extract" | ||||
|     value: 0.80 | ||||
| limit: 250 # will run on 250 * 14 subjects = 3500 samples | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +1,11 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "mgoin/Minitron-4B-Base-FP8" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.231 | ||||
|     value: 0.233 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.22 | ||||
|     value: 0.236 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8 | ||||
| model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4 | ||||
| model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" | ||||
| tasks: | ||||
|  | ||||
| @ -1,5 +1,4 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4 | ||||
| model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|  | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1 | ||||
| model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.30 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.465 | ||||
| limit: 1319 | ||||
| num_fewshot: 5 | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 | ||||
| model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1 | ||||
| model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise" | ||||
| tasks: | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4 | ||||
| model_name: "Qwen/Qwen2-57B-A14B-Instruct" | ||||
| tasks: | ||||
|  | ||||
| @ -1,11 +0,0 @@ | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1 | ||||
| model_name: "Qwen/Qwen2.5-1.5B-Instruct" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.54 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.59 | ||||
| limit: 1319 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size) | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1 | ||||
| model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.47 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.64 | ||||
| limit: 1319 | ||||
| num_fewshot: 5 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1 | ||||
|  | ||||
| model_name: "Qwen/Qwen2.5-VL-7B-Instruct" | ||||
| backend: "vllm-vlm" | ||||
| tasks: | ||||
| - name: "chartqa" | ||||
|   metrics: | ||||
|   - name: "relaxed_accuracy,none" | ||||
|     value: 0.855 | ||||
| limit: 2500 | ||||
| num_fewshot: 0 | ||||
| @ -1,12 +0,0 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2 | ||||
| model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.6353 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.637 | ||||
| limit: null | ||||
| num_fewshot: null  | ||||
| @ -1 +0,0 @@ | ||||
| Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml | ||||
| @ -1 +0,0 @@ | ||||
| Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml | ||||
| @ -1 +0,0 @@ | ||||
| Qwen2.5-VL-7B-Instruct.yaml | ||||
| @ -1,6 +1,10 @@ | ||||
| Qwen2.5-1.5B-Instruct.yaml | ||||
| Meta-Llama-3-8B-Instruct.yaml | ||||
| Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml | ||||
| Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml | ||||
| Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml | ||||
| Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml | ||||
| Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml | ||||
| Qwen1.5-MoE-W4A16-compressed-tensors.yaml | ||||
| Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml | ||||
| Minitron-4B-Base-FP8.yaml | ||||
| Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml | ||||
| Qwen2-1.5B-Instruct-FP8W8.yaml | ||||
| Meta-Llama-3-8B-QQQ.yaml | ||||
|  | ||||
| @ -1,44 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| from pathlib import Path | ||||
|  | ||||
| import pytest | ||||
|  | ||||
|  | ||||
| def pytest_addoption(parser): | ||||
|     parser.addoption( | ||||
|         "--config-list-file", | ||||
|         action="store", | ||||
|         help="Path to the file listing model config YAMLs (one per line)", | ||||
|     ) | ||||
|     parser.addoption( | ||||
|         "--tp-size", | ||||
|         action="store", | ||||
|         default="1", | ||||
|         help="Tensor parallel size to use for evaluation", | ||||
|     ) | ||||
|  | ||||
|  | ||||
| @pytest.fixture(scope="session") | ||||
| def config_list_file(pytestconfig, config_dir): | ||||
|     rel_path = pytestconfig.getoption("--config-list-file") | ||||
|     return config_dir / rel_path | ||||
|  | ||||
|  | ||||
| @pytest.fixture(scope="session") | ||||
| def tp_size(pytestconfig): | ||||
|     return pytestconfig.getoption("--tp-size") | ||||
|  | ||||
|  | ||||
| def pytest_generate_tests(metafunc): | ||||
|     if "config_filename" in metafunc.fixturenames: | ||||
|         rel_path = metafunc.config.getoption("--config-list-file") | ||||
|         config_list_file = Path(rel_path).resolve() | ||||
|         config_dir = config_list_file.parent | ||||
|         with open(config_list_file, encoding="utf-8") as f: | ||||
|             configs = [ | ||||
|                 config_dir / line.strip() | ||||
|                 for line in f | ||||
|                 if line.strip() and not line.startswith("#") | ||||
|             ] | ||||
|         metafunc.parametrize("config_filename", configs) | ||||
| @ -1,44 +0,0 @@ | ||||
| #!/bin/bash | ||||
| # We can use this script to compute baseline accuracy on chartqa for vllm. | ||||
| # | ||||
| # Make sure you have lm-eval-harness installed: | ||||
| #   pip install lm-eval==0.4.9 | ||||
|  | ||||
| usage() { | ||||
|     echo`` | ||||
|     echo "Runs lm eval harness on ChartQA using multimodal vllm." | ||||
|     echo "This pathway is intended to be used to create baselines for " | ||||
|     echo "our correctness tests in vllm's CI." | ||||
|     echo | ||||
|     echo "usage: ${0} <options>" | ||||
|     echo | ||||
|     echo "  -m    - huggingface stub or local directory of the model" | ||||
|     echo "  -l    - limit number of samples to run" | ||||
|     echo "  -t    - tensor parallel size to run at" | ||||
|     echo | ||||
| } | ||||
|  | ||||
| while getopts "m:l:t:" OPT; do | ||||
|   case ${OPT} in | ||||
|     m )  | ||||
|         MODEL="$OPTARG" | ||||
|         ;; | ||||
|     l )  | ||||
|         LIMIT="$OPTARG" | ||||
|         ;; | ||||
|     t )  | ||||
|         TP_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     \? )  | ||||
|         usage | ||||
|         exit 1 | ||||
|         ;; | ||||
|   esac | ||||
| done | ||||
|  | ||||
| lm_eval --model vllm-vlm \ | ||||
|   --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \ | ||||
|   --tasks chartqa \ | ||||
|   --batch_size auto \ | ||||
|   --apply_chat_template \ | ||||
|   --limit $LIMIT | ||||
							
								
								
									
										2
									
								
								.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							
							
						
						
									
										2
									
								
								.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
									
									
									
									
									
										
										
										Executable file → Normal file
									
								
							| @ -2,7 +2,7 @@ | ||||
| # We can use this script to compute baseline accuracy on GSM for transformers. | ||||
| # | ||||
| # Make sure you have lm-eval-harness installed: | ||||
| #   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] | ||||
| #   pip install lm-eval==0.4.4 | ||||
|  | ||||
| usage() { | ||||
|     echo`` | ||||
|  | ||||
| @ -3,7 +3,7 @@ | ||||
| # We use this for fp8, which HF does not support. | ||||
| # | ||||
| # Make sure you have lm-eval-harness installed: | ||||
| #   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] | ||||
| #   pip install lm-eval==0.4.4 | ||||
|  | ||||
| usage() { | ||||
|     echo`` | ||||
| @ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do | ||||
| done | ||||
|  | ||||
| lm_eval --model vllm \ | ||||
|   --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ | ||||
|   --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \ | ||||
|   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ | ||||
|   --batch_size "$BATCH_SIZE" | ||||
|  | ||||
| @ -1,50 +0,0 @@ | ||||
| #!/bin/bash | ||||
| # We can use this script to compute baseline accuracy on MMLUPRO for vllm. | ||||
| # We use this for fp8, which HF does not support. | ||||
| # | ||||
| # Make sure you have lm-eval-harness installed: | ||||
| #   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] | ||||
|  | ||||
| usage() { | ||||
|     echo`` | ||||
|     echo "Runs lm eval harness on MMLU Pro using huggingface transformers." | ||||
|     echo "This pathway is intended to be used to create baselines for " | ||||
|     echo "our automated nm-test-accuracy workflow" | ||||
|     echo | ||||
|     echo "usage: ${0} <options>" | ||||
|     echo | ||||
|     echo "  -m    - huggingface stub or local directory of the model" | ||||
|     echo "  -l    - limit number of samples to run" | ||||
|     echo "  -f    - number of fewshot samples to use" | ||||
|     echo "  -t    - tensor parallel size to run at" | ||||
|     echo | ||||
| } | ||||
|  | ||||
| while getopts "m:b:l:f:t:" OPT; do | ||||
|   case ${OPT} in | ||||
|     m ) | ||||
|         MODEL="$OPTARG" | ||||
|         ;; | ||||
|     b ) | ||||
|         BATCH_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     l ) | ||||
|         LIMIT="$OPTARG" | ||||
|         ;; | ||||
|     f ) | ||||
|         FEWSHOT="$OPTARG" | ||||
|         ;; | ||||
|     t ) | ||||
|         TP_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     \? ) | ||||
|         usage | ||||
|         exit 1 | ||||
|         ;; | ||||
|   esac | ||||
| done | ||||
|  | ||||
| lm_eval --model vllm \ | ||||
|   --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ | ||||
|   --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ | ||||
|   --batch_size auto | ||||
							
								
								
									
										59
									
								
								.buildkite/lm-eval-harness/run-tests.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								.buildkite/lm-eval-harness/run-tests.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,59 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| usage() { | ||||
|     echo`` | ||||
|     echo "Runs lm eval harness on GSM8k using vllm and compares to " | ||||
|     echo "precomputed baseline (measured by HF transformers.)" | ||||
|     echo | ||||
|     echo "usage: ${0} <options>" | ||||
|     echo | ||||
|     echo "  -c    - path to the test data config (e.g. configs/small-models.txt)" | ||||
|     echo "  -t    - tensor parallel size" | ||||
|     echo | ||||
| } | ||||
|  | ||||
| SUCCESS=0 | ||||
|  | ||||
| while getopts "c:t:" OPT; do | ||||
|   case ${OPT} in | ||||
|     c )  | ||||
|         CONFIG="$OPTARG" | ||||
|         ;; | ||||
|     t ) | ||||
|         TP_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     \? ) | ||||
|         usage | ||||
|         exit 1 | ||||
|         ;; | ||||
|   esac | ||||
| done | ||||
|  | ||||
| # Parse list of configs. | ||||
| IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" | ||||
|  | ||||
| for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" | ||||
| do | ||||
|     LOCAL_SUCCESS=0 | ||||
|      | ||||
|     echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" | ||||
|  | ||||
|     export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} | ||||
|     export LM_EVAL_TP_SIZE=$TP_SIZE | ||||
|     pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$? | ||||
|  | ||||
|     if [[ $LOCAL_SUCCESS == 0 ]]; then | ||||
|         echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" | ||||
|     else | ||||
|         echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" | ||||
|     fi | ||||
|  | ||||
|     SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) | ||||
|  | ||||
| done | ||||
|  | ||||
| if [ "${SUCCESS}" -eq "0" ]; then | ||||
|     exit 0 | ||||
| else | ||||
|     exit 1 | ||||
| fi | ||||
| @ -1,63 +1,63 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| """ | ||||
| LM eval harness on model to compare vs HF baseline computed offline. | ||||
| Configs are found in configs/$MODEL.yaml | ||||
|  | ||||
| pytest -s -v test_lm_eval_correctness.py \ | ||||
|     --config-list-file=configs/models-small.txt \ | ||||
|     --tp-size=1 | ||||
| * export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml | ||||
| * export LM_EVAL_TP_SIZE=4  | ||||
| * pytest -s test_lm_eval_correctness.py | ||||
| """ | ||||
|  | ||||
| import os | ||||
| from pathlib import Path | ||||
|  | ||||
| import lm_eval | ||||
| import numpy as np | ||||
| import numpy | ||||
| import yaml | ||||
|  | ||||
| RTOL = 0.08 | ||||
| RTOL = 0.05 | ||||
| TEST_DATA_FILE = os.environ.get( | ||||
|     "LM_EVAL_TEST_DATA_FILE", | ||||
|     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") | ||||
|  | ||||
| TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) | ||||
|  | ||||
|  | ||||
| def launch_lm_eval(eval_config, tp_size): | ||||
|     trust_remote_code = eval_config.get("trust_remote_code", False) | ||||
|     max_model_len = eval_config.get("max_model_len", 4096) | ||||
|     batch_size = eval_config.get("batch_size", "auto") | ||||
|     backend = eval_config.get("backend", "vllm") | ||||
|     model_args = ( | ||||
|         f"pretrained={eval_config['model_name']}," | ||||
|         f"tensor_parallel_size={tp_size}," | ||||
|         f"enforce_eager=true," | ||||
|         f"add_bos_token=true," | ||||
|         f"trust_remote_code={trust_remote_code}," | ||||
|         f"max_model_len={max_model_len}," | ||||
|     ) | ||||
| def launch_lm_eval(eval_config): | ||||
|     trust_remote_code = eval_config.get('trust_remote_code', False) | ||||
|  | ||||
|     model_args = f"pretrained={eval_config['model_name']}," \ | ||||
|                  f"tensor_parallel_size={TP_SIZE}," \ | ||||
|                  f"add_bos_token=true," \ | ||||
|                  f"trust_remote_code={trust_remote_code}" | ||||
|  | ||||
|     results = lm_eval.simple_evaluate( | ||||
|         model=backend, | ||||
|         model="vllm", | ||||
|         model_args=model_args, | ||||
|         tasks=[task["name"] for task in eval_config["tasks"]], | ||||
|         num_fewshot=eval_config["num_fewshot"], | ||||
|         limit=eval_config["limit"], | ||||
|         # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help | ||||
|         # text models. however, this is regressing measured strict-match for | ||||
|         # existing text models in CI, so only apply it for mm. | ||||
|         apply_chat_template=backend == "vllm-vlm", | ||||
|         batch_size=batch_size, | ||||
|     ) | ||||
|         batch_size="auto") | ||||
|  | ||||
|     return results | ||||
|  | ||||
|  | ||||
| def test_lm_eval_correctness_param(config_filename, tp_size): | ||||
|     eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8")) | ||||
| def test_lm_eval_correctness(): | ||||
|     eval_config = yaml.safe_load( | ||||
|         Path(TEST_DATA_FILE).read_text(encoding="utf-8")) | ||||
|  | ||||
|     results = launch_lm_eval(eval_config, tp_size) | ||||
|     # Launch eval requests. | ||||
|     results = launch_lm_eval(eval_config) | ||||
|  | ||||
|     # Confirm scores match ground truth. | ||||
|     success = True | ||||
|     for task in eval_config["tasks"]: | ||||
|         for metric in task["metrics"]: | ||||
|             ground_truth = metric["value"] | ||||
|             measured_value = results["results"][task["name"]][metric["name"]] | ||||
|             print( | ||||
|                 f"{task['name']} | {metric['name']}: " | ||||
|                 f"ground_truth={ground_truth} | measured={measured_value}" | ||||
|             ) | ||||
|             success = success and np.isclose(ground_truth, measured_value, rtol=RTOL) | ||||
|             print(f'{task["name"]} | {metric["name"]}: ' | ||||
|                   f'ground_truth={ground_truth} | measured={measured_value}') | ||||
|             success = success and numpy.isclose( | ||||
|                 ground_truth, measured_value, rtol=RTOL) | ||||
|  | ||||
|     # Assert at the end, print all scores even on failure for debugging. | ||||
|     assert success | ||||
|  | ||||
| @ -1,22 +1,25 @@ | ||||
| # vLLM benchmark suite | ||||
|  | ||||
|  | ||||
| ## Introduction | ||||
|  | ||||
| This directory contains two sets of benchmarks for vllm. | ||||
|  | ||||
| - Performance benchmark: benchmark vllm's performance under various workloads, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance | ||||
| - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm. | ||||
|  | ||||
| See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. | ||||
|  | ||||
| See  [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results. | ||||
|  | ||||
|  | ||||
| ## Performance benchmark quick overview | ||||
|  | ||||
| **Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models. | ||||
| **Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!), with different models. | ||||
|  | ||||
| **Benchmarking Duration**: about 1hr. | ||||
|  | ||||
| **For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run. | ||||
|  | ||||
|  | ||||
| ## Nightly benchmark quick overview | ||||
|  | ||||
| **Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.  | ||||
| @ -25,38 +28,27 @@ See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName | ||||
|  | ||||
| **Benchmarking Duration**: about 3.5hrs. | ||||
|  | ||||
|  | ||||
|  | ||||
| ## Trigger the benchmark | ||||
|  | ||||
| Performance benchmark will be triggered when: | ||||
|  | ||||
| - A PR being merged into vllm. | ||||
| - Every commit for those PRs with `perf-benchmarks` label AND `ready` label. | ||||
|  | ||||
| Manually Trigger the benchmark | ||||
|  | ||||
| ```bash | ||||
| bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
| ``` | ||||
|  | ||||
| Runtime environment variables: | ||||
|  | ||||
| - `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0. | ||||
| - `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file). | ||||
| - `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file). | ||||
| - `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file). | ||||
| - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string. | ||||
| - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string. | ||||
|  | ||||
| Nightly benchmark will be triggered when: | ||||
|  | ||||
| - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label. | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| ## Performance benchmark details | ||||
|  | ||||
|  | ||||
| See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases. | ||||
| > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead. | ||||
| > | ||||
| ### Latency test | ||||
|  | ||||
|  | ||||
| #### Latency test | ||||
|  | ||||
| Here is an example of one test inside `latency-tests.json`: | ||||
|  | ||||
| @ -76,25 +68,23 @@ Here is an example of one test inside `latency-tests.json`: | ||||
| ``` | ||||
|  | ||||
| In this example: | ||||
|  | ||||
| -  The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. | ||||
| - The `parameters` attribute control the command line arguments to be used for `vllm bench latency`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `vllm bench latency`. For example, the corresponding command line arguments for `vllm bench latency` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` | ||||
| -  The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15` | ||||
|  | ||||
| Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. | ||||
|  | ||||
| WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file. | ||||
|  | ||||
| ### Throughput test | ||||
|  | ||||
| The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `vllm bench throughput`. | ||||
| #### Throughput test | ||||
| The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`. | ||||
|  | ||||
| The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot. | ||||
|  | ||||
| ### Serving test | ||||
| #### Serving test | ||||
| We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: | ||||
|  | ||||
| We test the throughput by using `vllm bench serve` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: | ||||
|  | ||||
| ```json | ||||
| ``` | ||||
| [ | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp1_sharegpt", | ||||
| @ -104,6 +94,7 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co | ||||
|             "tensor_parallel_size": 1, | ||||
|             "swap_space": 16, | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
| @ -118,60 +109,45 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co | ||||
| ``` | ||||
|  | ||||
| Inside this example: | ||||
|  | ||||
| - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. | ||||
| - The `server-parameters` includes the command line arguments for vLLM server. | ||||
| - The `client-parameters` includes the command line arguments for `vllm bench serve`. | ||||
| - The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `vllm bench serve` | ||||
| - The `client-parameters` includes the command line arguments for `benchmark_serving.py`. | ||||
| - The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py` | ||||
|  | ||||
| The number of this test is less stable compared to the delay and latency benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change on this number (e.g. 5% change) still vary the output greatly. | ||||
|  | ||||
| WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. | ||||
|  | ||||
| ### Visualizing the results | ||||
|  | ||||
| The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results. | ||||
| #### Visualizing the results | ||||
| The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results. | ||||
| You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. | ||||
| If you do not see the table, please wait till the benchmark finish running. | ||||
| The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file. | ||||
| The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking. | ||||
|  | ||||
| The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`. | ||||
| When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`. | ||||
| `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.   | ||||
| If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead. | ||||
|  | ||||
| Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps. | ||||
| `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json` | ||||
|  | ||||
| |   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        | | ||||
| |----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------| | ||||
| | 0  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982                             | 156.526018                             | 1.097396 | | ||||
| | 1  | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334                             | 294.018783                             | 1.216863 | | ||||
|  | ||||
| A comparison diagram will be generated below the table. | ||||
| Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3 | ||||
| <img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" /> | ||||
|  | ||||
| ## Nightly test details | ||||
|  | ||||
| See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines. | ||||
|  | ||||
| ### Workflow | ||||
|  | ||||
| #### Workflow | ||||
|  | ||||
| - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.  | ||||
| - Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container. | ||||
| - The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`. | ||||
| - At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. | ||||
| - Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container. | ||||
| - The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark. | ||||
| - At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite. | ||||
|  | ||||
| ### Nightly tests | ||||
| #### Nightly tests | ||||
|  | ||||
| In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark. | ||||
|  | ||||
| ### Docker containers | ||||
| #### Docker containers | ||||
|  | ||||
| The docker containers for benchmarking are specified in `nightly-pipeline.yaml`. | ||||
|  | ||||
| WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`. | ||||
| WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`. | ||||
|  | ||||
| WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git). | ||||
|  | ||||
|  | ||||
| @ -1,6 +1,5 @@ | ||||
| steps: | ||||
|   - label: "Wait for container to be ready" | ||||
|     key: wait-for-container-image | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     plugins: | ||||
| @ -10,18 +9,13 @@ steps: | ||||
|           - image: badouralix/curl-jq | ||||
|             command: | ||||
|             - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh | ||||
|   - label: "Cleanup H100" | ||||
|     agents: | ||||
|       queue: H100 | ||||
|     depends_on: ~ | ||||
|     command: docker system prune -a --volumes --force | ||||
|  | ||||
|   - wait | ||||
|  | ||||
|   - label: "A100" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch == "main" | ||||
|     plugins: | ||||
|     - kubernetes: | ||||
|         podSpec: | ||||
| @ -55,8 +49,6 @@ steps: | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: H200 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch == "main" | ||||
|     plugins: | ||||
|     - docker#v5.12.0: | ||||
|         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT | ||||
| @ -81,8 +73,7 @@ steps: | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: H100 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch == "main" | ||||
|     depends_on: ~ | ||||
|     plugins: | ||||
|     - docker#v5.12.0: | ||||
|         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT | ||||
| @ -98,87 +89,3 @@ steps: | ||||
|         environment: | ||||
|         - VLLM_USAGE_SOURCE | ||||
|         - HF_TOKEN | ||||
|  | ||||
|   # Premerge benchmark | ||||
|   - label: "A100" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: A100 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch != "main" | ||||
|     plugins: | ||||
|     - kubernetes: | ||||
|         podSpec: | ||||
|           priorityClassName: perf-benchmark | ||||
|           containers: | ||||
|           - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT | ||||
|             command: | ||||
|             - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
|             resources: | ||||
|               limits: | ||||
|                 nvidia.com/gpu: 8 | ||||
|             volumeMounts: | ||||
|             - name: devshm | ||||
|               mountPath: /dev/shm | ||||
|             env: | ||||
|             - name: VLLM_USAGE_SOURCE | ||||
|               value: ci-test | ||||
|             - name: HF_TOKEN | ||||
|               valueFrom: | ||||
|                 secretKeyRef: | ||||
|                   name: hf-token-secret | ||||
|                   key: token | ||||
|           nodeSelector: | ||||
|             nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB | ||||
|           volumes: | ||||
|           - name: devshm | ||||
|             emptyDir: | ||||
|               medium: Memory | ||||
|  | ||||
|   - label: "H200" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: H200 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch != "main" | ||||
|     plugins: | ||||
|     - docker#v5.12.0: | ||||
|         image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT | ||||
|         command: | ||||
|         - bash | ||||
|         - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
|         mount-buildkite-agent: true | ||||
|         propagate-environment: true | ||||
|         ipc: host | ||||
|         gpus: 4,5,6,7 | ||||
|         volumes: | ||||
|           - /data/benchmark-hf-cache:/root/.cache/huggingface | ||||
|         environment: | ||||
|         - VLLM_USAGE_SOURCE | ||||
|         - HF_TOKEN | ||||
|  | ||||
|   #- block: "Run H100 Benchmark" | ||||
|     #key: block-h100 | ||||
|     #depends_on: ~ | ||||
|  | ||||
|   - label: "H100" | ||||
|     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" | ||||
|     agents: | ||||
|       queue: H100 | ||||
|     depends_on: wait-for-container-image | ||||
|     if: build.branch != "main" | ||||
|     plugins: | ||||
|     - docker#v5.12.0: | ||||
|         image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT | ||||
|         command: | ||||
|         - bash | ||||
|         - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh | ||||
|         mount-buildkite-agent: true | ||||
|         propagate-environment: true | ||||
|         ipc: host | ||||
|         gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used | ||||
|         volumes: | ||||
|           - /data/benchmark-hf-cache:/root/.cache/huggingface | ||||
|         environment: | ||||
|         - VLLM_USAGE_SOURCE | ||||
|         - HF_TOKEN | ||||
|  | ||||
| @ -1,4 +1,3 @@ | ||||
| # Nightly benchmark annotation | ||||
|  | ||||
| ## Description | ||||
|  | ||||
| @ -10,19 +9,20 @@ This file contains the downloading link for benchmarking results. | ||||
|  | ||||
| Please download the visualization scripts in the post | ||||
|  | ||||
|  | ||||
| ## Results reproduction | ||||
|  | ||||
| - Find the docker we use in `benchmarking pipeline` | ||||
| - Deploy the docker, and inside the docker: | ||||
|   - Download `nightly-benchmarks.zip`.  | ||||
|     - In the same folder, run the following code: | ||||
|  | ||||
|     ```bash | ||||
|     export HF_TOKEN=<your HF token> | ||||
|     apt update | ||||
|     apt install -y git | ||||
|     unzip nightly-benchmarks.zip | ||||
|     VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh | ||||
|     ``` | ||||
|   - In the same folder, run the following code | ||||
| ``` | ||||
| export HF_TOKEN=<your HF token> | ||||
| apt update | ||||
| apt install -y git | ||||
| unzip nightly-benchmarks.zip | ||||
| VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh | ||||
| ``` | ||||
|  | ||||
| And the results will be inside `./benchmarks/results`. | ||||
|  | ||||
|  | ||||
| @ -2,13 +2,13 @@ | ||||
| # Nightly benchmark | ||||
|  | ||||
| This benchmark aims to: | ||||
|  | ||||
| - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload. | ||||
| - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions. | ||||
|  | ||||
| Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end. | ||||
|  | ||||
| Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176) | ||||
| Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176) | ||||
|  | ||||
|  | ||||
| ## Setup | ||||
|  | ||||
| @ -17,7 +17,7 @@ Latest reproduction guide: [github issue link](https://github.com/vllm-project/v | ||||
|   - SGLang: `lmsysorg/sglang:v0.3.2-cu121` | ||||
|   - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12` | ||||
|   - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3` | ||||
|         - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.* | ||||
|     - *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.* | ||||
|   - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark. | ||||
| - Hardware | ||||
|   - 8x Nvidia A100 GPUs | ||||
| @ -33,7 +33,7 @@ Latest reproduction guide: [github issue link](https://github.com/vllm-project/v | ||||
|     - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed. | ||||
|   - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). | ||||
|  | ||||
| ## Known issues | ||||
| # Known issues | ||||
|  | ||||
| - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105). | ||||
| - TGI does not support `ignore-eos` flag. | ||||
| @ -1,44 +1,41 @@ | ||||
| # Performance benchmarks descriptions | ||||
|  | ||||
| ## Latency tests | ||||
|  | ||||
| - Input length: 32 tokens. | ||||
| - Output length: 128 tokens. | ||||
| - Batch size: fixed (8). | ||||
| - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. | ||||
| - CPU Models: llama-3.1 8B. | ||||
| - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. | ||||
| - Evaluation metrics: end-to-end latency (mean, median, p99). | ||||
|  | ||||
|  | ||||
| {latency_tests_markdown_table} | ||||
|  | ||||
|  | ||||
| ## Throughput tests | ||||
|  | ||||
| - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). | ||||
| - Output length: the corresponding output length of these 200 prompts. | ||||
| - Batch size: dynamically determined by vllm to achieve maximum throughput. | ||||
| - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. | ||||
| - CPU Models: llama-3.1 8B. | ||||
| - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. | ||||
| - Evaluation metrics: throughput. | ||||
|  | ||||
|  | ||||
| {throughput_tests_markdown_table} | ||||
|  | ||||
|  | ||||
| ## Serving tests | ||||
|  | ||||
| - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). | ||||
| - Output length: the corresponding output length of these 200 prompts. | ||||
| - Batch size: dynamically determined by vllm and the arrival pattern of the requests. | ||||
| - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). | ||||
| - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. | ||||
| - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2 | ||||
| - CPU Models: llama-3.1 8B. | ||||
| - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B. | ||||
| - We also added a speculative decoding test for llama-3 70B, under QPS 2 | ||||
| - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). | ||||
| - For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts. | ||||
|  | ||||
|  | ||||
| {serving_tests_markdown_table} | ||||
|  | ||||
| ## Platform Information | ||||
|  | ||||
| {platform_markdown_table} | ||||
|  | ||||
| ## json version of the benchmarking tables | ||||
|  | ||||
| @ -57,9 +54,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) | ||||
| ``` | ||||
|  | ||||
| The json string for all benchmarking tables: | ||||
|  | ||||
| ```json | ||||
| {benchmarking_results_in_json_string} | ||||
| ``` | ||||
|  | ||||
| You can also check the raw experiment data in the Artifact tab of the Buildkite page. | ||||
|  | ||||
|  | ||||
| @ -1,307 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| import argparse | ||||
| import json | ||||
| import os | ||||
| from importlib import util | ||||
|  | ||||
| import pandas as pd | ||||
|  | ||||
| plotly_found = util.find_spec("plotly.express") is not None | ||||
|  | ||||
|  | ||||
def compare_data_columns(
    files, name_column, data_column, info_cols, drop_column, debug=False
):
    """
    Build a side-by-side comparison table of one metric across result files.

    Rows are aligned by key columns derived from ``info_cols`` instead of by
    row order:

    - The key columns are the entries of ``info_cols`` present in ALL files;
      if there are none, the entries present in the first file are used.
    - Each file is indexed by those keys and duplicate keys are aggregated
      (mean for the metric, first value for keys and test names).
    - The per-file series are concatenated along axis=1 so that equal keys
      share a row; the index is then dropped, the keys staying available as
      ordinary columns taken from the first file.
    - From the second file on, a ``Ratio 1 vs N`` column (file N / file 1)
      follows that file's metric column.

    Args:
        files: Paths of the JSON result files (``orient="records"``). The
            first file is the baseline for the ratio columns.
        name_column: Column holding the test name; only read when ``debug``
            is true.
        data_column: Metric column to compare.
        info_cols: Ordered candidate key columns. Those present in the result
            are also moved to the front, in this order.
        drop_column: Rows whose value in this column is NaN are dropped
            (only when the column exists in the file).
        debug: When true, add a ``<file_label>_name`` column per file with
            the aligned test names.

    Returns:
        A ``(concat_df, raw_data_cols)`` tuple: the comparison DataFrame and
        the list of per-file metric column labels, in input order.

    Raises:
        ValueError: If ``files`` is empty, a file cannot be read, or no
            usable key column is found.
        KeyError: If ``data_column`` is missing from one of the files.
    """
    print("\ncompare_data_column:", data_column)

    if not files:
        raise ValueError("No input files were given to compare.")

    # 1) Parse every file exactly once, with the same error context for all
    #    of them; the frames are reused for key discovery and for alignment.
    loaded = []
    for file in files:
        try:
            loaded.append(pd.read_json(file, orient="records"))
        except Exception as err:
            raise ValueError(f"Failed to read {file}") from err

    # 2) Choose a canonical key list from info_cols that exists in ALL files.
    cols_per_file = [set(df.columns) for df in loaded]
    key_cols = [c for c in info_cols if all(c in cols for cols in cols_per_file)]
    if not key_cols:
        # soft fallback: use any info_cols present in the first file
        key_cols = [c for c in info_cols if c in cols_per_file[0]]
    if not key_cols:
        raise ValueError(
            "No common key columns found from info_cols across the input files."
        )

    # Key columns that are coerced to numbers before being used as index.
    numeric_keys = (
        "Input Len",
        "Output Len",
        "TP Size",
        "PP Size",
        "# of max concurrency.",
        "qps",
    )

    frames = []
    raw_data_cols = []
    baseline = None  # metric series of the first file

    # 3) Build one aligned series per file (plus optional name/ratio columns).
    for position, (file, df) in enumerate(zip(files, loaded), start=1):
        # Keep only rows that actually have the compared metric.
        if drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)

        # Stabilize numeric key columns (harmless if missing).
        for c in numeric_keys:
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")

        # Ensure all key columns exist (only needed after the soft fallback).
        for c in key_cols:
            if c not in df.columns:
                df[c] = pd.NA

        # drop=False keeps the keys as columns so they survive the final
        # index drop.
        df_idx = df.set_index(key_cols, drop=False)
        if data_column not in df_idx.columns:
            raise KeyError(f"Column {data_column!r} not found in {file}")

        # Label the column after the file's directory, or its basename when
        # the path has no directory part.
        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)

        # Metric series for this file, aggregated to one row per key.
        s = df_idx[data_column]
        if not s.index.is_unique:
            s = s.groupby(level=key_cols, dropna=False).mean()
        s.name = file_label

        # Add the key block once (from the first file) so keys come first.
        if baseline is None:
            meta = df_idx[key_cols]
            if not meta.index.is_unique:
                meta = meta.groupby(level=key_cols, dropna=False).first()
            frames.append(meta)

        # Debug: aligned test-name column per file.
        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
            name_s.name = f"{file_label}_name"
            frames.append(name_s)

        frames.append(s)
        raw_data_cols.append(file_label)

        if baseline is None:
            baseline = s
        else:
            # Ratio of file N to file 1 for every file after the first.
            ratio = s / baseline
            ratio = ratio.mask(baseline == 0)  # avoid inf when baseline is 0
            ratio.name = f"Ratio 1 vs {position}"
            frames.append(ratio)

    # 4) Concat on columns with the aligned index, then drop the index: the
    #    keys are already present as columns. A single reset avoids creating
    #    and deleting a helper "index" column, which would also delete a real
    #    column of that name.
    concat_df = pd.concat(frames, axis=1).reset_index(drop=True)

    # Ensure key/info columns appear first (in info_cols order).
    front = [c for c in info_cols if c in concat_df.columns]
    rest = [c for c in concat_df.columns if c not in front]
    concat_df = concat_df[front + rest]

    print(raw_data_cols)
    return concat_df, raw_data_cols
|  | ||||
|  | ||||
def split_json_by_tp_pp(
    input_file: str = "benchmark_results.json", output_root: str = "."
) -> list[str]:
    """
    Split a benchmark JSON into separate folders by (TP Size, PP Size).

    The input may be a list of records, or a dict holding such a list under
    one of the keys "results", "serving_results", "benchmarks" or "data".
    When a test-name column is present, only rows whose name contains
    "serving" (case-insensitive) are kept. Missing or non-numeric TP/PP
    values default to 1.

    Args:
        input_file: Path of the JSON file to split.
        output_root: Directory under which the per-group folders are created.

    Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
    Returns: list of file paths written.
    """
    with open(input_file, encoding="utf-8") as f:
        data = json.load(f)

    # If the JSON is a dict with a list under common keys, use that list
    if isinstance(data, dict):
        for key in ("results", "serving_results", "benchmarks", "data"):
            if isinstance(data.get(key), list):
                data = data[key]
                break

    df = pd.DataFrame(data)

    # Keep only "serving" tests
    name_col = next(
        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
    )
    if name_col:
        df = df[
            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
        ].copy()

    # Handle alias column names. Aliases are folded in one at a time: a blanket
    # rename would produce duplicate "TP Size"/"PP Size" labels whenever an
    # alias and its canonical name (or two aliases) are both present, and the
    # numeric conversion below cannot handle duplicated column labels.
    alias_map = {
        "tp_size": "TP Size",
        "tensor_parallel_size": "TP Size",
        "pp_size": "PP Size",
        "pipeline_parallel_size": "PP Size",
    }
    for alias, canonical in alias_map.items():
        if alias not in df.columns:
            continue
        if canonical in df.columns:
            # Existing canonical values win; the alias only fills the gaps.
            df[canonical] = df[canonical].combine_first(df[alias])
            df = df.drop(columns=[alias])
        else:
            df = df.rename(columns={alias: canonical})

    # Ensure TP/PP columns exist (default to 1 if missing) and are plain ints
    # with no NaN, so they can be used as group keys and in folder names.
    for col in ("TP Size", "PP Size"):
        if col not in df.columns:
            df[col] = 1
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(1).astype(int)

    # Split into separate folders
    saved_paths: list[str] = []
    for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
        folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
        os.makedirs(folder_name, exist_ok=True)
        filepath = os.path.join(folder_name, "benchmark_results.json")
        group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
        print(f"Saved: {filepath}")
        saved_paths.append(filepath)

    return saved_paths
|  | ||||
|  | ||||
if __name__ == "__main__":
    # CLI entry point: compare selected metrics across benchmark result files
    # and write the per-group HTML tables (plus optional Plotly line charts)
    # to perf_comparison.html in the current directory.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f",
        "--file",
        action="append",
        type=str,
        # Without this, omitting -f leaves args.file as None and the script
        # dies with a TypeError on len(None) instead of a usage message.
        required=True,
        help="input file name",
    )
    parser.add_argument(
        "--debug", action="store_true", help="show all information for debugging"
    )
    parser.add_argument(
        "--plot",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="plot perf diagrams or not --no-plot --plot",
    )
    parser.add_argument(
        "-x",
        "--xaxis",
        type=str,
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
    args = parser.parse_args()

    drop_column = "P99"
    name_column = "Test name"
    default_x_column = "# of max concurrency."
    info_cols = [
        "Model",
        "Dataset Name",
        "Input Len",
        "Output Len",
        "TP Size",
        "PP Size",
        "# of max concurrency.",
        "qps",
    ]
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]

    if len(args.file) == 1:
        # A single input is split per (TP, PP) combination and the resulting
        # files are compared with each other, so TP/PP stop being info columns.
        files = split_json_by_tp_pp(args.file[0], output_root="splits")
        info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
    else:
        files = args.file
    print("comparing : " + ", ".join(files))
    debug = args.debug
    plot = args.plot
    # For Plot feature, pick the X axis from one of info_cols. The fallback is
    # looked up by name: a fixed position would be out of range once TP/PP
    # have been removed from info_cols above.
    x_axis_col = args.xaxis if args.xaxis in info_cols else default_x_column
    y_axis_index = info_cols.index(x_axis_col)
    # Plotly's HTML output declares charset utf-8, so write the file as UTF-8.
    with open("perf_comparison.html", "w", encoding="utf-8") as text_file:
        for data_col, html_msg in zip(data_cols_to_compare, html_msgs_for_data_cols):
            output_df, raw_data_cols = compare_data_columns(
                files,
                name_column,
                data_col,
                info_cols,
                drop_column,
                debug=debug,
            )

            # For Plot feature, put the X axis column in front of the data columns
            raw_data_cols.insert(0, x_axis_col)

            # Group by every info column except the last two
            filtered_info_cols = info_cols[:-2]
            existing_group_cols = [
                c for c in filtered_info_cols if c in output_df.columns
            ]
            if not existing_group_cols:
                raise ValueError(
                    f"No valid group-by columns  "
                    f"Expected subset: {filtered_info_cols}, "
                    f"but DataFrame has: {list(output_df.columns)}"
                )
            output_df_sorted = output_df.sort_values(by=existing_group_cols)
            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
            for _, group in output_groups:
                html = group.to_html()
                text_file.write(html_msg)
                text_file.write(html)

                if plot and plotly_found:
                    import plotly.express as px

                    df = group[raw_data_cols]
                    df_sorted = df.sort_values(by=x_axis_col)
                    # Melt DataFrame for plotting: one row per (x, configuration)
                    df_melted = df_sorted.melt(
                        id_vars=x_axis_col,
                        var_name="Configuration",
                        value_name=data_col,
                    )
                    title = data_col + " vs " + x_axis_col
                    # Create Plotly line chart
                    fig = px.line(
                        df_melted,
                        x=x_axis_col,
                        y=data_col,
                        color="Configuration",
                        title=title,
                        markers=True,
                    )
                    # Export to HTML
                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
| @ -1,19 +1,12 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import argparse | ||||
| import json | ||||
| import os | ||||
| import shlex | ||||
| from importlib import util | ||||
| from pathlib import Path | ||||
| from typing import Any | ||||
|  | ||||
| import pandas as pd | ||||
| import psutil | ||||
| import regex as re | ||||
| from tabulate import tabulate | ||||
|  | ||||
| results_folder = Path("results/") | ||||
|  | ||||
| # latency results and the keys that will be printed into markdown | ||||
| latency_results = [] | ||||
| latency_column_mapping = { | ||||
| @ -33,39 +26,28 @@ throughput_results = [] | ||||
| throughput_results_column_mapping = { | ||||
|     "test_name": "Test name", | ||||
|     "gpu_type": "GPU", | ||||
|     "num_requests": "# of req.", | ||||
|     "total_num_tokens": "Total # of tokens", | ||||
|     "elapsed_time": "Elapsed time (s)", | ||||
|     # "num_requests": "# of req.", | ||||
|     # "total_num_tokens": "Total # of tokens", | ||||
|     # "elapsed_time": "Elapsed time (s)", | ||||
|     "requests_per_second": "Tput (req/s)", | ||||
|     "tokens_per_second": "Tput (tok/s)", | ||||
|     # "tokens_per_second": "Tput (tok/s)", | ||||
| } | ||||
|  | ||||
| # serving results and the keys that will be printed into markdown | ||||
| serving_results = [] | ||||
| serving_column_mapping = { | ||||
|     "test_name": "Test name", | ||||
|     "model_id": "Model", | ||||
|     "dataset_name": "Dataset Name", | ||||
|     "input_len": "Input Len", | ||||
|     "output_len": "Output Len", | ||||
|     "tp_size": "TP Size", | ||||
|     "pp_size": "PP Size", | ||||
|     "dtype": "dtype", | ||||
|     "gpu_type": "GPU", | ||||
|     "completed": "# of req.", | ||||
|     "qps": "qps", | ||||
|     "max_concurrency": "# of max concurrency.", | ||||
|     # "completed": "# of req.", | ||||
|     "request_throughput": "Tput (req/s)", | ||||
|     "total_token_throughput": "Total Token Tput (tok/s)", | ||||
|     "output_throughput": "Output Tput (tok/s)", | ||||
|     # "total_input_tokens": "Total input tokens", | ||||
|     # "total_output_tokens": "Total output tokens", | ||||
|     # "input_throughput": "Input Tput (tok/s)", | ||||
|     # "output_throughput": "Output Tput (tok/s)", | ||||
|     "mean_ttft_ms": "Mean TTFT (ms)", | ||||
|     "median_ttft_ms": "Median TTFT (ms)", | ||||
|     "p99_ttft_ms": "P99 TTFT (ms)", | ||||
|     "mean_tpot_ms": "Mean TPOT (ms)", | ||||
|     "median_tpot_ms": "Median", | ||||
|     "p99_tpot_ms": "P99", | ||||
|     # "mean_tpot_ms": "Mean TPOT (ms)", | ||||
|     # "median_tpot_ms": "Median", | ||||
|     # "p99_tpot_ms": "P99", | ||||
|     "mean_itl_ms": "Mean ITL (ms)", | ||||
|     "median_itl_ms": "Median ITL (ms)", | ||||
|     "p99_itl_ms": "P99 ITL (ms)", | ||||
| @ -81,194 +63,42 @@ def read_markdown(file): | ||||
|  | ||||
|  | ||||
def results_to_json(latency, throughput, serving):
    """Serialize the three result tables into a single JSON string.

    Each argument must expose ``to_dict()``; the outputs are stored under the
    keys "latency", "throughput" and "serving", in that order.
    """
    sections = (
        ("latency", latency),
        ("throughput", throughput),
        ("serving", serving),
    )
    payload = {label: table.to_dict() for label, table in sections}
    return json.dumps(payload)
|  | ||||
|  | ||||
def get_size_with_unit(bytes, suffix="B"):
    """
    Scale a byte count to a human-readable string with 1024-based units.

    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'

    Args:
        bytes: Size to format (number of bytes).
        suffix: Text appended after the unit prefix.

    Returns:
        The size formatted with two decimals, a unit prefix and the suffix.
        Values too large for the "P" prefix are reported with "E" rather
        than falling through the loop and returning None.
    """
    factor = 1024
    size = bytes
    for unit in ["", "K", "M", "G", "T", "P"]:
        if size < factor:
            return f"{size:.2f}{unit}{suffix}"
        size /= factor
    # Anything left here is at least 1024 "P" units.
    return f"{size:.2f}E{suffix}"
|  | ||||
|  | ||||
| def _coerce(val: str) -> Any: | ||||
|     """Best-effort type coercion from string to Python types.""" | ||||
|     low = val.lower() | ||||
|     if low == "null": | ||||
|         return None | ||||
|     if low == "true": | ||||
|         return True | ||||
|     if low == "false": | ||||
|         return False | ||||
|     # integers | ||||
|     if re.fullmatch(r"[+-]?\d+", val): | ||||
|         try: | ||||
|             return int(val) | ||||
|         except ValueError: | ||||
|             pass | ||||
|     # floats (keep 'inf'/'-inf'/'nan' as strings) | ||||
|     if re.fullmatch(r"[+-]?\d*\.\d+", val): | ||||
|         try: | ||||
|             return float(val) | ||||
|         except ValueError: | ||||
|             pass | ||||
|     return val | ||||
|  | ||||
|  | ||||
def parse_client_command(cmd: str) -> dict[str, Any]:
    """Break a shell command string into its executable, script and flags.

    The first two tokens are taken as the executable and the script. Every
    later token starting with "--" becomes an entry in "args", keyed by the
    flag text including its leading dashes:

    * ``--key=value`` and ``--key value`` store the coerced value;
    * a flag with no value stores ``True``;
    * ``--metadata`` collects its ``k=v`` items (a bare item maps to ``True``)
      into a dict, up to the next ``--`` token.

    Tokens that are neither a ``--`` flag nor consumed as a value are ignored.

    Raises:
        ValueError: if the command has fewer than two tokens.
    """
    tokens = shlex.split(cmd)
    if len(tokens) < 2:
        raise ValueError("client_command must include an executable and a script")

    def add_metadata(store: dict[str, Any], item: str) -> None:
        # "k=v" -> coerced value under k; bare "k" -> True
        name, sep, raw = item.partition("=")
        store[name] = _coerce(raw) if sep else True

    parsed: dict[str, Any] = {}
    total = len(tokens)
    pos = 2
    while pos < total:
        tok = tokens[pos]
        pos += 1  # from here on, tokens[pos] is the token after `tok`

        if not tok.startswith("--"):
            # unexpected positional; skip
            continue

        if "=" in tok:
            # inline form: --key=value
            flag, value = tok.split("=", 1)
            if flag == "--metadata":
                meta: dict[str, Any] = {}
                if value:
                    add_metadata(meta, value)
                parsed[flag] = meta
            else:
                parsed[flag] = _coerce(value)
            continue

        if tok == "--metadata":
            # consume metadata items until the next --flag
            meta = {}
            while pos < total and not tokens[pos].startswith("--"):
                add_metadata(meta, tokens[pos])
                pos += 1
            parsed[tok] = meta
            continue

        if pos < total and not tokens[pos].startswith("--"):
            # the following token is this flag's value
            parsed[tok] = _coerce(tokens[pos])
            pos += 1
        else:
            # lone flag -> True
            parsed[tok] = True

    return {"executable": tokens[0], "script": tokens[1], "args": parsed}
|     return json.dumps({ | ||||
|         'latency': latency.to_dict(), | ||||
|         'throughput': throughput.to_dict(), | ||||
|         'serving': serving.to_dict() | ||||
|     }) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument( | ||||
|         "-r", | ||||
|         "--result", | ||||
|         type=str, | ||||
|         default="results", | ||||
|         help="Folder name for benchmark output results.", | ||||
|     ) | ||||
|     args = parser.parse_args() | ||||
|     results_folder = Path(args.result) | ||||
|     if not results_folder.exists(): | ||||
|         raise FileNotFoundError(f"results folder does not exist: {results_folder}") | ||||
|  | ||||
|     # collect results | ||||
|     for test_file in results_folder.glob("*.json"): | ||||
|  | ||||
|         with open(test_file) as f: | ||||
|             raw_result = json.loads(f.read()) | ||||
|  | ||||
|         if "serving" in str(test_file): | ||||
|             # this result is generated via `vllm bench serve` command | ||||
|             # this result is generated via `benchmark_serving.py` | ||||
|  | ||||
|             # attach the benchmarking command to raw_result | ||||
|             try: | ||||
|             with open(test_file.with_suffix(".commands")) as f: | ||||
|                 command = json.loads(f.read()) | ||||
|             except OSError as e: | ||||
|                 print(e) | ||||
|                 continue | ||||
|             # Parse Server Command Arg | ||||
|             out: dict[str, Any] = { | ||||
|                 "server_command": parse_client_command(command["server_command"]) | ||||
|             } | ||||
|             parse_args = [ | ||||
|                 "--tensor-parallel-size", | ||||
|                 "--pipeline-parallel-size", | ||||
|                 "--dtype", | ||||
|             ] | ||||
|             col_mapping = ["tp_size", "pp_size", "dtype"] | ||||
|             for index, arg in enumerate(parse_args): | ||||
|                 if arg in out["server_command"]["args"]: | ||||
|                     raw_result.update( | ||||
|                         {col_mapping[index]: out["server_command"]["args"][arg]} | ||||
|                     ) | ||||
|  | ||||
|             # Parse Client Command Arg | ||||
|             out: dict[str, Any] = { | ||||
|                 "client_command": parse_client_command(command["client_command"]) | ||||
|             } | ||||
|             parse_args = [ | ||||
|                 "--dataset-name", | ||||
|                 "--random-input-len", | ||||
|                 "--random-output-len", | ||||
|                 "--request-rate", | ||||
|             ] | ||||
|             col_mapping = ["dataset_name", "input_len", "output_len", "qps"] | ||||
|  | ||||
|             for index, arg in enumerate(parse_args): | ||||
|                 if arg in out["client_command"]["args"]: | ||||
|                     raw_result.update( | ||||
|                         {col_mapping[index]: out["client_command"]["args"][arg]} | ||||
|                     ) | ||||
|             # Add Server, Client command | ||||
|             raw_result.update(command) | ||||
|  | ||||
|             # update the test name of this result | ||||
|             raw_result.update({"test_name": test_file.stem}) | ||||
|  | ||||
|             # add the result to raw_result | ||||
|             serving_results.append(raw_result) | ||||
|             continue | ||||
|  | ||||
|         elif "latency" in f.name: | ||||
|             # this result is generated via `vllm bench latency` command | ||||
|             # this result is generated via `benchmark_latency.py` | ||||
|  | ||||
|             # attach the benchmarking command to raw_result | ||||
|             try: | ||||
|             with open(test_file.with_suffix(".commands")) as f: | ||||
|                 command = json.loads(f.read()) | ||||
|             except OSError as e: | ||||
|                 print(e) | ||||
|                 continue | ||||
|  | ||||
|             raw_result.update(command) | ||||
|  | ||||
|             # update the test name of this result | ||||
| @ -278,8 +108,7 @@ if __name__ == "__main__": | ||||
|             for perc in [10, 25, 50, 75, 90, 99]: | ||||
|                 # Multiply 1000 to convert the time unit from s to ms | ||||
|                 raw_result.update( | ||||
|                     {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]} | ||||
|                 ) | ||||
|                     {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}) | ||||
|             raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 | ||||
|  | ||||
|             # add the result to raw_result | ||||
| @ -287,16 +116,11 @@ if __name__ == "__main__": | ||||
|             continue | ||||
|  | ||||
|         elif "throughput" in f.name: | ||||
|             # this result is generated via `vllm bench throughput` command | ||||
|             # this result is generated via `benchmark_throughput.py` | ||||
|  | ||||
|             # attach the benchmarking command to raw_result | ||||
|             try: | ||||
|             with open(test_file.with_suffix(".commands")) as f: | ||||
|                 command = json.loads(f.read()) | ||||
|             except OSError as e: | ||||
|                 print(e) | ||||
|                 continue | ||||
|  | ||||
|             raw_result.update(command) | ||||
|  | ||||
|             # update the test name of this result | ||||
| @ -312,51 +136,26 @@ if __name__ == "__main__": | ||||
|     serving_results = pd.DataFrame.from_dict(serving_results) | ||||
|     throughput_results = pd.DataFrame.from_dict(throughput_results) | ||||
|  | ||||
|     svmem = psutil.virtual_memory() | ||||
|     platform_data = { | ||||
|         "Physical cores": [psutil.cpu_count(logical=False)], | ||||
|         "Total cores": [psutil.cpu_count(logical=True)], | ||||
|         "Total Memory": [get_size_with_unit(svmem.total)], | ||||
|     } | ||||
|  | ||||
|     if util.find_spec("numa") is not None: | ||||
|         from numa import info | ||||
|  | ||||
|         platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()] | ||||
|  | ||||
|     if util.find_spec("cpuinfo") is not None: | ||||
|         from cpuinfo import get_cpu_info | ||||
|  | ||||
|         platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]] | ||||
|  | ||||
|     platform_results = pd.DataFrame.from_dict( | ||||
|         platform_data, orient="index", columns=["Platform Info"] | ||||
|     ) | ||||
|  | ||||
|     raw_results_json = results_to_json( | ||||
|         latency_results, throughput_results, serving_results | ||||
|     ) | ||||
|     raw_results_json = results_to_json(latency_results, throughput_results, | ||||
|                                        serving_results) | ||||
|  | ||||
|     # remapping the key, for visualization purpose | ||||
|     if not latency_results.empty: | ||||
|         latency_results = latency_results[list(latency_column_mapping.keys())].rename( | ||||
|             columns=latency_column_mapping | ||||
|         ) | ||||
|         latency_results = latency_results[list( | ||||
|             latency_column_mapping.keys())].rename( | ||||
|                 columns=latency_column_mapping) | ||||
|     if not serving_results.empty: | ||||
|         valid_columns = [ | ||||
|             col for col in serving_column_mapping if col in serving_results.columns | ||||
|         ] | ||||
|         serving_results = serving_results[valid_columns].rename( | ||||
|             columns=serving_column_mapping | ||||
|         ) | ||||
|         serving_results = serving_results[list( | ||||
|             serving_column_mapping.keys())].rename( | ||||
|                 columns=serving_column_mapping) | ||||
|     if not throughput_results.empty: | ||||
|         throughput_results = throughput_results[ | ||||
|             list(throughput_results_column_mapping.keys()) | ||||
|         ].rename(columns=throughput_results_column_mapping) | ||||
|         throughput_results = throughput_results[list( | ||||
|             throughput_results_column_mapping.keys())].rename( | ||||
|                 columns=throughput_results_column_mapping) | ||||
|  | ||||
|     processed_results_json = results_to_json( | ||||
|         latency_results, throughput_results, serving_results | ||||
|     ) | ||||
|     processed_results_json = results_to_json(latency_results, | ||||
|                                              throughput_results, | ||||
|                                              serving_results) | ||||
|  | ||||
|     for df in [latency_results, serving_results, throughput_results]: | ||||
|         if df.empty: | ||||
| @ -368,45 +167,38 @@ if __name__ == "__main__": | ||||
|         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", | ||||
|         # we want to turn it into "8xGPUTYPE" | ||||
|         df["GPU"] = df["GPU"].apply( | ||||
|             lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}" | ||||
|         ) | ||||
|             lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}") | ||||
|  | ||||
|     # get markdown tables | ||||
|     latency_md_table = tabulate( | ||||
|         latency_results, headers="keys", tablefmt="pipe", showindex=False | ||||
|     ) | ||||
|     serving_md_table = tabulate( | ||||
|         serving_results, headers="keys", tablefmt="pipe", showindex=False | ||||
|     ) | ||||
|     throughput_md_table = tabulate( | ||||
|         throughput_results, headers="keys", tablefmt="pipe", showindex=False | ||||
|     ) | ||||
|     platform_md_table = tabulate( | ||||
|         platform_results, headers="keys", tablefmt="pipe", showindex=True | ||||
|     ) | ||||
|     latency_md_table = tabulate(latency_results, | ||||
|                                 headers='keys', | ||||
|                                 tablefmt='pipe', | ||||
|                                 showindex=False) | ||||
|     serving_md_table = tabulate(serving_results, | ||||
|                                 headers='keys', | ||||
|                                 tablefmt='pipe', | ||||
|                                 showindex=False) | ||||
|     throughput_md_table = tabulate(throughput_results, | ||||
|                                    headers='keys', | ||||
|                                    tablefmt='pipe', | ||||
|                                    showindex=False) | ||||
|  | ||||
|     # document the result | ||||
|     md_file = "benchmark_results.md" | ||||
|     json_file = "benchmark_results.json" | ||||
|     with open(results_folder / md_file, "w") as f: | ||||
|         results = read_markdown( | ||||
|             "../.buildkite/nightly-benchmarks/" | ||||
|             + "performance-benchmarks-descriptions.md" | ||||
|         ) | ||||
|     with open(results_folder / "benchmark_results.md", "w") as f: | ||||
|  | ||||
|         results = read_markdown("../.buildkite/nightly-benchmarks/" + | ||||
|                                 "performance-benchmarks-descriptions.md") | ||||
|         results = results.format( | ||||
|             latency_tests_markdown_table=latency_md_table, | ||||
|             throughput_tests_markdown_table=throughput_md_table, | ||||
|             serving_tests_markdown_table=serving_md_table, | ||||
|             platform_markdown_table=platform_md_table, | ||||
|             benchmarking_results_in_json_string=processed_results_json, | ||||
|         ) | ||||
|             benchmarking_results_in_json_string=processed_results_json) | ||||
|         f.write(results) | ||||
|  | ||||
|     # document benchmarking results in json | ||||
|     with open(results_folder / json_file, "w") as f: | ||||
|         results = ( | ||||
|             latency_results.to_dict(orient="records") | ||||
|             + throughput_results.to_dict(orient="records") | ||||
|             + serving_results.to_dict(orient="records") | ||||
|         ) | ||||
|     with open(results_folder / "benchmark_results.json", "w") as f: | ||||
|  | ||||
|         results = latency_results.to_dict( | ||||
|             orient='records') + throughput_results.to_dict( | ||||
|                 orient='records') + serving_results.to_dict(orient='records') | ||||
|         f.write(json.dumps(results)) | ||||
|  | ||||
| @ -1,6 +1,3 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import argparse | ||||
|  | ||||
| from transformers import AutoTokenizer | ||||
| @ -15,12 +12,15 @@ def main(model, cachedir): | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     parser = argparse.ArgumentParser( | ||||
|         description="Download and save Hugging Face tokenizer" | ||||
|     ) | ||||
|     parser.add_argument("--model", type=str, required=True, help="Name of the model") | ||||
|     parser.add_argument( | ||||
|         "--cachedir", type=str, required=True, help="Directory to save the tokenizer" | ||||
|     ) | ||||
|         description="Download and save Hugging Face tokenizer") | ||||
|     parser.add_argument("--model", | ||||
|                         type=str, | ||||
|                         required=True, | ||||
|                         help="Name of the model") | ||||
|     parser.add_argument("--cachedir", | ||||
|                         type=str, | ||||
|                         required=True, | ||||
|                         help="Directory to save the tokenizer") | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|     main(args.model, args.cachedir) | ||||
|  | ||||
| @ -1,6 +1,3 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import argparse | ||||
| import json | ||||
| from pathlib import Path | ||||
| @ -12,33 +9,33 @@ from tabulate import tabulate | ||||
|  | ||||
| def parse_arguments(): | ||||
|     parser = argparse.ArgumentParser( | ||||
|         description="Parse command line arguments for summary-nightly-results script." | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--results-folder", | ||||
|         description= | ||||
|         'Parse command line arguments for summary-nightly-results script.') | ||||
|     parser.add_argument('--results-folder', | ||||
|                         type=str, | ||||
|                         required=True, | ||||
|         help="The folder where the results are stored.", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--description", type=str, required=True, help="Description of the results." | ||||
|     ) | ||||
|                         help='The folder where the results are stored.') | ||||
|     parser.add_argument('--description', | ||||
|                         type=str, | ||||
|                         required=True, | ||||
|                         help='Description of the results.') | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|     return args | ||||
|  | ||||
|  | ||||
| def get_perf(df, method, model, metric): | ||||
|  | ||||
|     means = [] | ||||
|  | ||||
|     for qps in [2, 4, 8, 16, "inf"]: | ||||
|         target = df["Test name"].str.contains(model) | ||||
|         target = target & df["Engine"].str.contains(method) | ||||
|         target = target & df["Test name"].str.contains("qps_" + str(qps)) | ||||
|         target = df['Test name'].str.contains(model) | ||||
|         target = target & df['Engine'].str.contains(method) | ||||
|         target = target & df['Test name'].str.contains("qps_" + str(qps)) | ||||
|         filtered_df = df[target] | ||||
|  | ||||
|         if filtered_df.empty: | ||||
|             means.append(0.0) | ||||
|             means.append(0.) | ||||
|         else: | ||||
|             means.append(filtered_df[metric].values[0]) | ||||
|  | ||||
| @ -46,6 +43,7 @@ def get_perf(df, method, model, metric): | ||||
|  | ||||
|  | ||||
| def get_perf_w_std(df, method, model, metric): | ||||
|  | ||||
|     if metric in ["TTFT", "ITL"]: | ||||
|         mean = get_perf(df, method, model, "Mean " + metric + " (ms)") | ||||
|         mean = mean.tolist() | ||||
| @ -60,8 +58,7 @@ def get_perf_w_std(df, method, model, metric): | ||||
|     else: | ||||
|         assert metric == "Tput" | ||||
|         mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf( | ||||
|             df, method, model, "Output Tput (tok/s)" | ||||
|         ) | ||||
|             df, method, model, "Output Tput (tok/s)") | ||||
|         mean = mean.tolist() | ||||
|         std = None | ||||
|  | ||||
| @ -81,17 +78,18 @@ def main(args): | ||||
|     # generate markdown table | ||||
|     df = pd.DataFrame.from_dict(results) | ||||
|  | ||||
|     md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False) | ||||
|     md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) | ||||
|  | ||||
|     with open(args.description) as f: | ||||
|         description = f.read() | ||||
|  | ||||
|     description = description.format(nightly_results_benchmarking_table=md_table) | ||||
|     description = description.format( | ||||
|         nightly_results_benchmarking_table=md_table) | ||||
|  | ||||
|     with open("nightly_results.md", "w") as f: | ||||
|         f.write(description) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
| if __name__ == '__main__': | ||||
|     args = parse_arguments() | ||||
|     main(args) | ||||
|  | ||||
| @ -1,6 +1,3 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| from lmdeploy.serve.openai.api_client import APIClient | ||||
|  | ||||
| api_client = APIClient("http://localhost:8000") | ||||
|  | ||||
| @ -181,14 +181,18 @@ launch_vllm_server() { | ||||
|   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then | ||||
|     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." | ||||
|     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') | ||||
|     server_command="vllm serve $model \ | ||||
|     server_command="python3 \ | ||||
|         -m vllm.entrypoints.openai.api_server \ | ||||
|         -tp $tp \ | ||||
|         --model $model \ | ||||
|         --port $port \ | ||||
|         $server_args" | ||||
|   else | ||||
|     echo "Key 'fp8' does not exist in common params." | ||||
|     server_command="vllm serve $model \ | ||||
|     server_command="python3 \ | ||||
|         -m vllm.entrypoints.openai.api_server \ | ||||
|         -tp $tp \ | ||||
|         --model $model \ | ||||
|         --port $port \ | ||||
|         $server_args" | ||||
|   fi | ||||
|  | ||||
| @ -43,7 +43,7 @@ main() { | ||||
|      | ||||
|  | ||||
|  | ||||
|     # The figures should be generated by a separate process outside the CI/CD pipeline | ||||
|     # The figures should be genereated by a separate process outside the CI/CD pipeline | ||||
|  | ||||
|     # # generate figures | ||||
|     # python3 -m pip install tabulate pandas matplotlib | ||||
|  | ||||
| @ -95,14 +95,12 @@ json2args() { | ||||
| } | ||||
|  | ||||
| kill_gpu_processes() { | ||||
|   pkill -f '[p]ython' | ||||
|   pkill -f '[p]ython3' | ||||
|   pkill -f '[t]ritonserver' | ||||
|   pkill -f '[p]t_main_thread' | ||||
|   pkill -f '[t]ext-generation' | ||||
|   pkill -f '[l]mdeploy' | ||||
|   # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 | ||||
|   pkill -f '[V]LLM' | ||||
|   pkill -f python | ||||
|   pkill -f python3 | ||||
|   pkill -f tritonserver | ||||
|   pkill -f pt_main_thread | ||||
|   pkill -f text-generation | ||||
|   pkill -f lmdeploy | ||||
|  | ||||
|   while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do | ||||
|     sleep 1 | ||||
| @ -127,7 +125,7 @@ ensure_installed() { | ||||
| } | ||||
|  | ||||
| run_serving_tests() { | ||||
|   # run serving tests using `vllm bench serve` command | ||||
|   # run serving tests using `benchmark_serving.py` | ||||
|   # $1: a json file specifying serving test cases | ||||
|  | ||||
|   local serving_test_file | ||||
| @ -227,7 +225,7 @@ run_serving_tests() { | ||||
|  | ||||
|       if [[ "$dataset_name" = "sharegpt" ]]; then | ||||
|  | ||||
|         client_command="vllm bench serve \ | ||||
|         client_command="python3 benchmark_serving.py \ | ||||
|           --backend $backend \ | ||||
|           --tokenizer /tokenizer_cache \ | ||||
|           --model $model \ | ||||
| @ -248,7 +246,7 @@ run_serving_tests() { | ||||
|         sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len') | ||||
|         sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len') | ||||
|  | ||||
|         client_command="vllm bench serve \ | ||||
|         client_command="python3 benchmark_serving.py \ | ||||
|           --backend $backend \ | ||||
|           --tokenizer /tokenizer_cache \ | ||||
|           --model $model \ | ||||
| @ -303,104 +301,6 @@ run_serving_tests() { | ||||
|   kill_gpu_processes | ||||
| } | ||||
|  | ||||
| run_genai_perf_tests() { | ||||
|   # run genai-perf tests | ||||
|  | ||||
|   # $1: a json file specifying genai-perf test cases | ||||
|   local genai_perf_test_file | ||||
|   genai_perf_test_file=$1 | ||||
|  | ||||
|   # Iterate over genai-perf tests | ||||
|   jq -c '.[]' "$genai_perf_test_file" | while read -r params; do | ||||
|     # get the test name, and append the GPU type back to it. | ||||
|     test_name=$(echo "$params" | jq -r '.test_name') | ||||
|  | ||||
|     # if TEST_SELECTOR is set, only run the test cases that match the selector | ||||
|     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then | ||||
|       echo "Skip test case $test_name." | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     # prepend the current serving engine to the test name | ||||
|     test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name} | ||||
|  | ||||
|     # get common parameters | ||||
|     common_params=$(echo "$params" | jq -r '.common_parameters') | ||||
|     model=$(echo "$common_params" | jq -r '.model') | ||||
|     tp=$(echo "$common_params" | jq -r '.tp') | ||||
|     dataset_name=$(echo "$common_params" | jq -r '.dataset_name') | ||||
|     dataset_path=$(echo "$common_params" | jq -r '.dataset_path') | ||||
|     port=$(echo "$common_params" | jq -r '.port') | ||||
|     num_prompts=$(echo "$common_params" | jq -r '.num_prompts') | ||||
|     reuse_server=$(echo "$common_params" | jq -r '.reuse_server') | ||||
|  | ||||
|     # get client and server arguments | ||||
|     server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters") | ||||
|     qps_list=$(echo "$params" | jq -r '.qps_list') | ||||
|     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') | ||||
|     echo "Running over qps list $qps_list" | ||||
|  | ||||
|     # check if there is enough GPU to run the test | ||||
|     if [[ $gpu_count -lt $tp ]]; then | ||||
|       echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     if [[ $reuse_server == "true" ]]; then | ||||
|       echo "Reuse previous server for test case $test_name" | ||||
|     else | ||||
|       kill_gpu_processes | ||||
|       bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ | ||||
|         "$server_params" "$common_params" | ||||
|     fi | ||||
|  | ||||
|     if wait_for_server; then | ||||
|       echo "" | ||||
|       echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." | ||||
|     else | ||||
|       echo "" | ||||
|       echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period." | ||||
|       break | ||||
|     fi | ||||
|  | ||||
|     # iterate over different QPS | ||||
|     for qps in $qps_list; do | ||||
|       # remove the surrounding single quote from qps | ||||
|       if [[ "$qps" == *"inf"* ]]; then | ||||
|         echo "qps was $qps" | ||||
|         qps=$num_prompts | ||||
|         echo "now qps is $qps" | ||||
|       fi | ||||
|  | ||||
|       new_test_name=$test_name"_qps_"$qps | ||||
|       backend=$CURRENT_LLM_SERVING_ENGINE | ||||
|  | ||||
|       if [[ "$backend" == *"vllm"* ]]; then | ||||
|         backend="vllm" | ||||
|       fi | ||||
|       #TODO: add output dir. | ||||
|       client_command="genai-perf profile \ | ||||
|         -m $model \ | ||||
|         --service-kind openai \ | ||||
|         --backend "$backend" \ | ||||
|         --endpoint-type chat \ | ||||
|         --streaming \ | ||||
|         --url localhost:$port \ | ||||
|         --request-rate $qps \ | ||||
|         --num-prompts $num_prompts \ | ||||
|       " | ||||
|  | ||||
|     echo "Client command: $client_command" | ||||
|  | ||||
|     eval "$client_command" | ||||
|  | ||||
|     #TODO: process/record outputs | ||||
|     done | ||||
|   done | ||||
|  | ||||
|   kill_gpu_processes | ||||
|  | ||||
| } | ||||
|  | ||||
| prepare_dataset() { | ||||
|  | ||||
| @ -428,17 +328,12 @@ main() { | ||||
|  | ||||
|   pip install -U transformers | ||||
|  | ||||
|   pip install -r requirements/dev.txt | ||||
|   which genai-perf | ||||
|  | ||||
|   # check storage | ||||
|   df -h | ||||
|  | ||||
|   ensure_installed wget | ||||
|   ensure_installed curl | ||||
|   ensure_installed jq | ||||
|   # genai-perf dependency | ||||
|   ensure_installed libb64-0d | ||||
|  | ||||
|   prepare_dataset | ||||
|  | ||||
| @ -450,10 +345,6 @@ main() { | ||||
|   # run the test | ||||
|   run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" | ||||
|  | ||||
|   # run genai-perf tests | ||||
|   run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json" | ||||
|   mv artifacts/ $RESULTS_FOLDER/ | ||||
|  | ||||
|   # upload benchmark results to buildkite | ||||
|   python3 -m pip install tabulate pandas | ||||
|   python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" | ||||
|  | ||||
| @ -10,38 +10,15 @@ set -x | ||||
| set -o pipefail | ||||
|  | ||||
| check_gpus() { | ||||
|   if command -v nvidia-smi; then | ||||
|   # check the number of GPUs and GPU type. | ||||
|   declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) | ||||
|   elif command -v amd-smi; then | ||||
|     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l) | ||||
|   fi | ||||
|  | ||||
|   if [[ $gpu_count -gt 0 ]]; then | ||||
|     echo "GPU found." | ||||
|   else | ||||
|     echo "Need at least 1 GPU to run benchmarking." | ||||
|     exit 1 | ||||
|   fi | ||||
|   if command -v nvidia-smi; then | ||||
|   declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') | ||||
|   elif command -v amd-smi; then | ||||
|     declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}') | ||||
|   fi | ||||
|   echo "GPU type is $gpu_type" | ||||
| } | ||||
|  | ||||
| check_cpus() { | ||||
|   # check the number of CPUs and NUMA Node and GPU type. | ||||
|   declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}') | ||||
|   if [[ $numa_count -gt 0 ]]; then | ||||
|     echo "NUMA found." | ||||
|     echo $numa_count | ||||
|   else | ||||
|     echo "Need at least 1 NUMA to run benchmarking." | ||||
|     exit 1 | ||||
|   fi | ||||
|   declare -g gpu_type="cpu" | ||||
|   echo "GPU type is $gpu_type" | ||||
| } | ||||
|  | ||||
| @ -83,22 +60,6 @@ json2args() { | ||||
|   echo "$args" | ||||
| } | ||||
|  | ||||
| json2envs() { | ||||
|   # transforms the JSON string to environment variables. | ||||
|   # example: | ||||
|   # input: { "VLLM_CPU_KVCACHE_SPACE": 5 } | ||||
|   # output: VLLM_CPU_KVCACHE_SPACE=5 | ||||
|   local json_string=$1 | ||||
|   local args=$( | ||||
|     echo "$json_string" | jq -r ' | ||||
|       to_entries | | ||||
|       map((.key ) + "=" + (.value | tostring)) | | ||||
|       join(" ") | ||||
|     ' | ||||
|   ) | ||||
|   echo "$args" | ||||
| } | ||||
|  | ||||
| wait_for_server() { | ||||
|   # wait for vllm server to start | ||||
|   # return 1 if vllm server crashes | ||||
| @ -126,19 +87,12 @@ kill_gpu_processes() { | ||||
|   ps -aux | ||||
|   lsof -t -i:8000 | xargs -r kill -9 | ||||
|   pgrep python3 | xargs -r kill -9 | ||||
|   # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445 | ||||
|   pgrep VLLM | xargs -r kill -9 | ||||
|  | ||||
|  | ||||
|   # wait until GPU memory usage smaller than 1GB | ||||
|   if command -v nvidia-smi; then | ||||
|   while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do | ||||
|     sleep 1 | ||||
|   done | ||||
|   elif command -v amd-smi; then | ||||
|     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do | ||||
|       sleep 1 | ||||
|     done | ||||
|   fi | ||||
|  | ||||
|   # remove vllm config file | ||||
|   rm -rf ~/.config/vllm | ||||
| @ -165,7 +119,7 @@ upload_to_buildkite() { | ||||
| } | ||||
|  | ||||
| run_latency_tests() { | ||||
|   # run latency tests using `vllm bench latency` command | ||||
|   # run latency tests using `benchmark_latency.py` | ||||
|   # $1: a json file specifying latency test cases | ||||
|  | ||||
|   local latency_test_file | ||||
| @ -189,26 +143,15 @@ run_latency_tests() { | ||||
|     # get arguments | ||||
|     latency_params=$(echo "$params" | jq -r '.parameters') | ||||
|     latency_args=$(json2args "$latency_params") | ||||
|     latency_environment_variables=$(echo "$params" | jq -r '.environment_variables') | ||||
|     latency_envs=$(json2envs "$latency_environment_variables") | ||||
|  | ||||
|     # check if there is enough GPU to run the test | ||||
|     tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') | ||||
|     if [ "$ON_CPU" == "1" ]; then | ||||
|       pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size') | ||||
|       world_size=$(($tp*$pp)) | ||||
|       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then | ||||
|         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." | ||||
|         continue | ||||
|       fi | ||||
|     else | ||||
|     if [[ $gpu_count -lt $tp ]]; then | ||||
|       echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." | ||||
|       continue | ||||
|     fi | ||||
|     fi | ||||
|  | ||||
|     latency_command=" $latency_envs vllm bench latency \ | ||||
|     latency_command="python3 benchmark_latency.py \ | ||||
|       --output-json $RESULTS_FOLDER/${test_name}.json \ | ||||
|       $latency_args" | ||||
|  | ||||
| @ -234,7 +177,7 @@ run_latency_tests() { | ||||
| } | ||||
|  | ||||
| run_throughput_tests() { | ||||
|   # run throughput tests using `vllm bench throughput` | ||||
|   # run throughput tests using `benchmark_throughput.py` | ||||
|   # $1: a json file specifying throughput test cases | ||||
|  | ||||
|   local throughput_test_file | ||||
| @ -258,26 +201,15 @@ run_throughput_tests() { | ||||
|     # get arguments | ||||
|     throughput_params=$(echo "$params" | jq -r '.parameters') | ||||
|     throughput_args=$(json2args "$throughput_params") | ||||
|     throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables') | ||||
|     throughput_envs=$(json2envs "$throughput_environment_variables") | ||||
|  | ||||
|     # check if there is enough GPU to run the test | ||||
|     tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') | ||||
|     if [ "$ON_CPU" == "1" ]; then | ||||
|       pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size') | ||||
|       world_size=$(($tp*$pp)) | ||||
|       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then | ||||
|         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." | ||||
|         continue | ||||
|       fi | ||||
|     else | ||||
|     if [[ $gpu_count -lt $tp ]]; then | ||||
|       echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." | ||||
|       continue | ||||
|     fi | ||||
|     fi | ||||
|  | ||||
|     throughput_command=" $throughput_envs vllm bench throughput \ | ||||
|     throughput_command="python3 benchmark_throughput.py \ | ||||
|       --output-json $RESULTS_FOLDER/${test_name}.json \ | ||||
|       $throughput_args" | ||||
|  | ||||
| @ -302,7 +234,7 @@ run_throughput_tests() { | ||||
| } | ||||
|  | ||||
| run_serving_tests() { | ||||
|   # run serving tests using `vllm bench serve` command | ||||
|   # run serving tests using `benchmark_serving.py` | ||||
|   # $1: a json file specifying serving test cases | ||||
|  | ||||
|   local serving_test_file | ||||
| @ -325,37 +257,19 @@ run_serving_tests() { | ||||
|  | ||||
|     # get client and server arguments | ||||
|     server_params=$(echo "$params" | jq -r '.server_parameters') | ||||
|     server_envs=$(echo "$params" | jq -r '.server_environment_variables') | ||||
|     client_params=$(echo "$params" | jq -r '.client_parameters') | ||||
|     server_args=$(json2args "$server_params") | ||||
|     server_envs=$(json2envs "$server_envs") | ||||
|     client_args=$(json2args "$client_params") | ||||
|     qps_list=$(echo "$params" | jq -r '.qps_list') | ||||
|     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') | ||||
|     echo "Running over qps list $qps_list" | ||||
|     max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list') | ||||
|     if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then | ||||
|         num_prompts=$(echo "$client_params" | jq -r '.num_prompts') | ||||
|         max_concurrency_list="[$num_prompts]" | ||||
|     fi | ||||
|     max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh') | ||||
|     echo "Running over max concurrency list $max_concurrency_list" | ||||
|  | ||||
|     # check if there is enough resources to run the test | ||||
|     # check if there is enough GPU to run the test | ||||
|     tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') | ||||
|     if [ "$ON_CPU" == "1" ]; then | ||||
|       pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size') | ||||
|       world_size=$(($tp*$pp)) | ||||
|       if [[ $numa_count -lt $world_size  && -z "${REMOTE_HOST}" ]]; then | ||||
|         echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name." | ||||
|         continue | ||||
|       fi | ||||
|     else | ||||
|     if [[ $gpu_count -lt $tp ]]; then | ||||
|       echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." | ||||
|       continue | ||||
|     fi | ||||
|     fi | ||||
|  | ||||
|     # check if server model and client model is aligned | ||||
|     server_model=$(echo "$server_params" | jq -r '.model') | ||||
| @ -365,32 +279,23 @@ run_serving_tests() { | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     server_command="$server_envs vllm serve \ | ||||
|     server_command="python3 \ | ||||
|       -m vllm.entrypoints.openai.api_server \ | ||||
|       $server_args" | ||||
|  | ||||
|     # run the server | ||||
|     echo "Running test case $test_name" | ||||
|     echo "Server command: $server_command" | ||||
|     # support remote vllm server | ||||
|     client_remote_args="" | ||||
|     if [[ -z "${REMOTE_HOST}" ]]; then | ||||
|     bash -c "$server_command" & | ||||
|     server_pid=$! | ||||
|  | ||||
|     # wait until the server is alive | ||||
|     if wait_for_server; then | ||||
|       echo "" | ||||
|         echo "vLLM server is up and running." | ||||
|       echo "vllm server is up and running." | ||||
|     else | ||||
|       echo "" | ||||
|         echo "vLLM failed to start within the timeout period." | ||||
|       fi | ||||
|     else | ||||
|       server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT" | ||||
|       if [[ ${REMOTE_PORT} ]]; then | ||||
|         client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT " | ||||
|       else | ||||
|         client_remote_args=" --host=$REMOTE_HOST " | ||||
|       fi | ||||
|       echo "vllm failed to start within the timeout period." | ||||
|     fi | ||||
|  | ||||
|     # iterate over different QPS | ||||
| @ -402,20 +307,14 @@ run_serving_tests() { | ||||
|         echo "now qps is $qps" | ||||
|       fi | ||||
|  | ||||
|       # iterate over different max_concurrency | ||||
|       for max_concurrency in $max_concurrency_list; do | ||||
|         new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency | ||||
|         echo " new test name $new_test_name" | ||||
|         # pass the tensor parallel size to the client so that it can be displayed | ||||
|         # on the benchmark dashboard | ||||
|         client_command="vllm bench serve \ | ||||
|       new_test_name=$test_name"_qps_"$qps | ||||
|  | ||||
|       client_command="python3 benchmark_serving.py \ | ||||
|         --save-result \ | ||||
|         --result-dir $RESULTS_FOLDER \ | ||||
|         --result-filename ${new_test_name}.json \ | ||||
|         --request-rate $qps \ | ||||
|           --max-concurrency $max_concurrency \ | ||||
|           --metadata "tensor_parallel_size=$tp" \ | ||||
|           $client_args $client_remote_args " | ||||
|         $client_args" | ||||
|  | ||||
|       echo "Running test case $test_name with qps $qps" | ||||
|       echo "Client command: $client_command" | ||||
| @ -435,7 +334,6 @@ run_serving_tests() { | ||||
|       echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" | ||||
|  | ||||
|     done | ||||
|     done | ||||
|  | ||||
|     # clean up | ||||
|     kill -9 $server_pid | ||||
| @ -444,14 +342,7 @@ run_serving_tests() { | ||||
| } | ||||
|  | ||||
| main() { | ||||
|   local ARCH | ||||
|   ARCH='' | ||||
|   if [ "$ON_CPU" == "1" ];then | ||||
|      check_cpus | ||||
|      ARCH='-cpu' | ||||
|   else | ||||
|   check_gpus | ||||
|   fi | ||||
|   check_hf_token | ||||
|  | ||||
|   # dependencies | ||||
| @ -459,10 +350,10 @@ main() { | ||||
|   (which jq) || (apt-get update && apt-get -y install jq) | ||||
|   (which lsof) || (apt-get update && apt-get install -y lsof) | ||||
|  | ||||
|   # get the current IP address, required by `vllm bench serve` command | ||||
|   # get the current IP address, required by benchmark_serving.py | ||||
|   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') | ||||
|   # turn of the reporting of the status of each request, to clean up the terminal output | ||||
|   export VLLM_LOGGING_LEVEL="WARNING" | ||||
|   export VLLM_LOG_LEVEL="WARNING" | ||||
|  | ||||
|   # prepare for benchmarking | ||||
|   cd benchmarks || exit 1 | ||||
| @ -472,9 +363,9 @@ main() { | ||||
|   QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ | ||||
|  | ||||
|   # benchmarking | ||||
|   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" | ||||
|   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" | ||||
|   run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}" | ||||
|   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json | ||||
|   run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json | ||||
|   run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json | ||||
|  | ||||
|   # postprocess benchmarking results | ||||
|   pip install tabulate pandas | ||||
|  | ||||
| @ -1,6 +1,3 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import datetime | ||||
| import json | ||||
| import os | ||||
| @ -35,8 +32,10 @@ serving_column_mapping = { | ||||
| } | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|  | ||||
|     # collect results | ||||
|     for test_file in results_folder.glob("*.json"): | ||||
|  | ||||
|         with open(test_file) as f: | ||||
|             raw_result = json.loads(f.read()) | ||||
|  | ||||
| @ -55,16 +54,17 @@ if __name__ == "__main__": | ||||
|     serving_results = pd.DataFrame.from_dict(serving_results) | ||||
|  | ||||
|     if not serving_results.empty: | ||||
|         serving_results = serving_results[list(serving_column_mapping.keys())].rename( | ||||
|             columns=serving_column_mapping | ||||
|         ) | ||||
|         serving_results = serving_results[list( | ||||
|             serving_column_mapping.keys())].rename( | ||||
|                 columns=serving_column_mapping) | ||||
|  | ||||
|     serving_md_table_with_headers = tabulate( | ||||
|         serving_results, headers="keys", tablefmt="pipe", showindex=False | ||||
|     ) | ||||
|     serving_md_table_with_headers = tabulate(serving_results, | ||||
|                                              headers='keys', | ||||
|                                              tablefmt='pipe', | ||||
|                                              showindex=False) | ||||
|     # remove the first line of header | ||||
|     serving_md_table_lines = serving_md_table_with_headers.split("\n") | ||||
|     serving_md_table_without_header = "\n".join(serving_md_table_lines[2:]) | ||||
|     serving_md_table_lines = serving_md_table_with_headers.split('\n') | ||||
|     serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:]) | ||||
|  | ||||
|     prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") | ||||
|     prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE") | ||||
| @ -74,9 +74,10 @@ if __name__ == "__main__": | ||||
|         # document results with header. | ||||
|         # for those who wants to reproduce our benchmark. | ||||
|         f.write(serving_md_table_with_headers) | ||||
|         f.write("\n") | ||||
|         f.write('\n') | ||||
|  | ||||
|     # document benchmarking results in json | ||||
|     with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: | ||||
|         results = serving_results.to_dict(orient="records") | ||||
|  | ||||
|         results = serving_results.to_dict(orient='records') | ||||
|         f.write(json.dumps(results)) | ||||
|  | ||||
| @ -1,10 +1,6 @@ | ||||
| #!/bin/sh | ||||
| TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) | ||||
| if [[ "$BUILDKITE_BRANCH" == "main" ]]; then | ||||
|     URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" | ||||
| else | ||||
|     URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" | ||||
| fi | ||||
| URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" | ||||
|  | ||||
| TIMEOUT_SECONDS=10 | ||||
|  | ||||
|  | ||||
| @ -1,21 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "llama8B_tp1_genai_perf", | ||||
|         "qps_list": [4,8,16,32], | ||||
|         "common_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3-8B-Instruct", | ||||
|             "tp": 1, | ||||
|             "port": 8000, | ||||
|             "num_prompts": 500, | ||||
|             "reuse_server": false | ||||
|         }, | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|         "genai_perf_input_parameters": { | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,30 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "latency_llama8B_tp1", | ||||
|         "environment_variables": { | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
|             "load_format": "dummy", | ||||
|             "num_iters_warmup": 5, | ||||
|             "num_iters": 15 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "latency_llama8B_tp4", | ||||
|         "environment_variables": { | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
|             "load_format": "dummy", | ||||
|             "num_iters_warmup": 5, | ||||
|             "num_iters": 15 | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -35,7 +35,9 @@ | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
| @ -88,7 +90,9 @@ | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
| @ -141,7 +145,9 @@ | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
| @ -191,7 +197,9 @@ | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
| @ -243,7 +251,9 @@ | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
| @ -295,7 +305,9 @@ | ||||
|         },  | ||||
|         "vllm_server_parameters": { | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "gpu_memory_utilization": 0.9, | ||||
|             "num_scheduler_steps": 10, | ||||
|             "max_num_seqs": 512, | ||||
|             "dtype": "bfloat16" | ||||
|         }, | ||||
|  | ||||
| @ -1,610 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp1_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp2_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp4_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp1_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp2_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp4_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp1_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp2_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp4_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp1_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp2_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp4_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp1_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp2_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp4_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp1_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp2_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp4_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,820 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_pp1_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "pipeline_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp2_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_pp3_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 2, | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_pp1_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "pipeline_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp2_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_pp3_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 2, | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_pp1_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "pipeline_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp2_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_pp3_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 2, | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_pp1_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "pipeline_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp2_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_pp3_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int8_tp2pp3_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "tensor_parallel_size": 2, | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_pp1_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "pipeline_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp2_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_pp3_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 2, | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_pp1_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "pipeline_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp2_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_pp3_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", | ||||
|         "qps_list": ["inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
| 	    "quantization": "awq", | ||||
|             "tensor_parallel_size": 2, | ||||
|             "pipeline_parallel_size": 3, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 128, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 1000 | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,168 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp1_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp2_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 2, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp4_sharegpt", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "sharegpt", | ||||
|             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_tp4_random_1024_128", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 1024, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 100 | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "serving_llama8B_pp6_random_1024_128", | ||||
|         "qps_list": [1, 4, 16, "inf"], | ||||
|         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], | ||||
|         "server_environment_variables": { | ||||
|             "VLLM_RPC_TIMEOUT": 100000, | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, | ||||
| 	    "VLLM_CPU_SGL_KERNEL": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "pipeline_parallel_size": 6, | ||||
| 	    "dtype": "bfloat16", | ||||
| 	    "distributed_executor_backend": "mp", | ||||
| 	    "block_size": 128, | ||||
| 	    "trust_remote_code": "", | ||||
| 	    "enable_chunked_prefill": "", | ||||
|             "disable_log_stats": "", | ||||
| 	    "enforce_eager": "", | ||||
| 	    "max_num_batched_tokens": 2048, | ||||
| 	    "max_num_seqs": 256, | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "backend": "vllm", | ||||
|             "dataset_name": "random", | ||||
| 	    "random-input-len": 1024, | ||||
| 	    "random-output-len": 128, | ||||
| 	    "ignore-eos": "", | ||||
|             "num_prompts": 100 | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -7,6 +7,7 @@ | ||||
|             "tensor_parallel_size": 1, | ||||
|             "swap_space": 16, | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
| @ -25,6 +26,7 @@ | ||||
|             "tensor_parallel_size": 4, | ||||
|             "swap_space": 16, | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
| @ -43,6 +45,7 @@ | ||||
|             "tensor_parallel_size": 2, | ||||
|             "swap_space": 16, | ||||
|             "disable_log_stats": "", | ||||
|             "disable_log_requests": "", | ||||
|             "load_format": "dummy" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
| @ -58,13 +61,13 @@ | ||||
|         "qps_list": [2], | ||||
|         "server_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", | ||||
|             "disable_log_requests": "",  | ||||
|             "tensor_parallel_size": 4, | ||||
|             "swap_space": 16,  | ||||
|             "speculative_config": { | ||||
|                 "model": "turboderp/Qwama-0.5B-Instruct", | ||||
|             "speculative_model": "turboderp/Qwama-0.5B-Instruct", | ||||
|             "num_speculative_tokens": 4, | ||||
|                 "draft_tensor_parallel_size": 1 | ||||
|             } | ||||
|             "speculative_draft_tensor_parallel_size": 1, | ||||
|             "use_v2_block_manager": "" | ||||
|         }, | ||||
|         "client_parameters": { | ||||
|             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", | ||||
|  | ||||
| @ -1,32 +0,0 @@ | ||||
| [ | ||||
|     { | ||||
|         "test_name": "throughput_llama8B_tp1", | ||||
|         "environment_variables": { | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 1, | ||||
|             "load_format": "dummy", | ||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200, | ||||
|             "backend": "vllm" | ||||
|         } | ||||
|     }, | ||||
|     { | ||||
|         "test_name": "throughput_llama8B_tp4", | ||||
|         "environment_variables": { | ||||
| 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, | ||||
| 	    "VLLM_CPU_KVCACHE_SPACE": 40 | ||||
|         }, | ||||
|         "parameters": { | ||||
|             "model": "meta-llama/Llama-3.1-8B-Instruct", | ||||
|             "tensor_parallel_size": 4, | ||||
|             "load_format": "dummy", | ||||
|             "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", | ||||
|             "num_prompts": 200, | ||||
|             "backend": "vllm" | ||||
|         } | ||||
|     } | ||||
| ] | ||||
| @ -1,145 +1,61 @@ | ||||
| steps: | ||||
|   # aarch64 + CUDA builds | ||||
|   - label: "Build arm64 wheel - CUDA 12.9" | ||||
|     depends_on: ~ | ||||
|     id: build-wheel-arm64-cuda-12-9 | ||||
|     agents: | ||||
|       queue: arm64_cpu_queue_postmerge | ||||
|     commands: | ||||
|       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: | ||||
|       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   # aarch64 build | ||||
|   - label: "Build arm64 CPU wheel" | ||||
|     depends_on: ~ | ||||
|     id: build-wheel-arm64-cpu | ||||
|     agents: | ||||
|       queue: arm64_cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   # x86 + CUDA builds | ||||
|   - label: "Build wheel - CUDA 12.8" | ||||
|     depends_on: ~ | ||||
|     id: build-wheel-cuda-12-8 | ||||
|   - label: "Build wheel - CUDA 12.1" | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
|       - "bash .buildkite/upload-wheels.sh" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   - label: "Build wheel - CUDA 12.9" | ||||
|     depends_on: ~ | ||||
|     id: build-wheel-cuda-12-9 | ||||
|   # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. | ||||
|   # However, this block can be uncommented to save some compute hours. | ||||
|   # - block: "Build CUDA 11.8 wheel" | ||||
|   #   key: block-build-cu118-wheel | ||||
|  | ||||
|   - label: "Build wheel - CUDA 11.8" | ||||
|     # depends_on: block-build-cu118-wheel | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
|       - "bash .buildkite/upload-wheels.sh" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   - label: "Build wheel - CUDA 13.0" | ||||
|   - block: "Build release image" | ||||
|     depends_on: ~ | ||||
|     id: build-wheel-cuda-13-0 | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|     key: block-release-image-build | ||||
|  | ||||
|   # Build release images (12.9) | ||||
|   - label: "Build release image (x86)" | ||||
|     depends_on: ~ | ||||
|     id: build-release-image-x86 | ||||
|   - label: "Build release image" | ||||
|     depends_on: block-release-image-build | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" | ||||
|       # re-tag to default image tag and push, just in case arm64 build fails | ||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" | ||||
|  | ||||
|   - label: "Build release image (arm64)" | ||||
|     depends_on: ~ | ||||
|     id: build-release-image-arm64 | ||||
|     agents: | ||||
|       queue: arm64_cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" | ||||
|  | ||||
|   # Add job to create multi-arch manifest | ||||
|   - label: "Create multi-arch manifest" | ||||
|     depends_on: | ||||
|       - build-release-image-x86 | ||||
|       - build-release-image-arm64 | ||||
|     id: create-multi-arch-manifest | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend" | ||||
|       - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" | ||||
|  | ||||
|   - label: "Annotate release workflow" | ||||
|     depends_on: | ||||
|       - create-multi-arch-manifest | ||||
|       - build-wheel-cuda-12-8 | ||||
|     id: annotate-release-workflow | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "bash .buildkite/scripts/annotate-release.sh" | ||||
|  | ||||
|   - label: "Build and publish TPU release image" | ||||
|     depends_on: ~ | ||||
|     if: build.env("NIGHTLY") == "1" | ||||
|     agents: | ||||
|       queue: tpu_queue_postmerge | ||||
|     commands: | ||||
|       - "yes | docker system prune -a" | ||||
|       - "git fetch --all" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ." | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ." | ||||
|       - "docker push vllm/vllm-tpu:nightly" | ||||
|       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT" | ||||
|     plugins: | ||||
|       - docker-login#v3.0.0: | ||||
|           username: vllmbot | ||||
|           username: vllm | ||||
|           password-env: DOCKERHUB_TOKEN | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   - input: "Provide Release version here" | ||||
|     id: input-release-version | ||||
|     fields: | ||||
|       - text: "What is the release version?" | ||||
|         key: release-version | ||||
|  | ||||
|   - block: "Build CPU release image" | ||||
|     key: block-cpu-release-image-build | ||||
|     depends_on: ~ | ||||
| @ -150,52 +66,7 @@ steps: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest" | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   - block: "Build arm64 CPU release image" | ||||
|     key: block-arm64-cpu-release-image-build | ||||
|     depends_on: ~ | ||||
|  | ||||
|   - label: "Build and publish arm64 CPU release image" | ||||
|     depends_on: block-arm64-cpu-release-image-build | ||||
|     agents: | ||||
|       queue: arm64_cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest" | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)" | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|  | ||||
|   - label: "Build and publish nightly multi-arch image to DockerHub" | ||||
|     depends_on: | ||||
|       - create-multi-arch-manifest | ||||
|     if: build.env("NIGHTLY") == "1" | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64" | ||||
|       - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64" | ||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64" | ||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64" | ||||
|       - "docker push vllm/vllm-openai:nightly-x86_64" | ||||
|       - "docker push vllm/vllm-openai:nightly-aarch64" | ||||
|       - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" | ||||
|       - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" | ||||
|       - "docker manifest push vllm/vllm-openai:nightly" | ||||
|       - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" | ||||
|       # Clean up old nightly builds (keep only last 14) | ||||
|       - "bash .buildkite/scripts/cleanup-nightly-builds.sh" | ||||
|     plugins: | ||||
|       - docker-login#v3.0.0: | ||||
|           username: vllmbot | ||||
|           password-env: DOCKERHUB_TOKEN | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|       DOCKERHUB_USERNAME: "vllmbot" | ||||
|  | ||||
							
								
								
									
										156
									
								
								.buildkite/run-amd-test.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										156
									
								
								.buildkite/run-amd-test.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,156 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script runs tests inside the corresponding ROCm docker container. | ||||
| set -o pipefail | ||||
|  | ||||
| # Wait until the GPUs report a clean state | ||||
| echo "--- Confirming Clean Initial State" | ||||
| while true; do | ||||
|         sleep 3 | ||||
|         if grep -q clean /opt/amdgpu/etc/gpu_state; then | ||||
|                 echo "GPUs state is \"clean\"" | ||||
|                 break | ||||
|         fi | ||||
| done | ||||
|  | ||||
| echo "--- ROCm info" | ||||
| rocminfo | ||||
|  | ||||
| # cleanup older docker images | ||||
| cleanup_docker() { | ||||
|   # Get Docker's root directory | ||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||
|   if [ -z "$docker_root" ]; then | ||||
|     echo "Failed to determine Docker root directory." | ||||
|     exit 1 | ||||
|   fi | ||||
|   echo "Docker root directory: $docker_root" | ||||
|   # Check disk usage of the filesystem where Docker's root directory is located | ||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||
|   # Define the threshold | ||||
|   threshold=70 | ||||
|   if [ "$disk_usage" -gt "$threshold" ]; then | ||||
|     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||
|     # Remove dangling images (those that are not tagged and not used by any container) | ||||
|     docker image prune -f | ||||
|     # Remove unused volumes / force the system prune for old images as well. | ||||
|     docker volume prune -f && docker system prune --force --filter "until=72h" --all | ||||
|     echo "Docker images and volumes cleanup completed." | ||||
|   else | ||||
|     echo "Disk usage is below $threshold%. No cleanup needed." | ||||
|   fi | ||||
| } | ||||
|  | ||||
| # Call the cleanup docker function | ||||
| cleanup_docker | ||||
|  | ||||
| echo "--- Resetting GPUs" | ||||
|  | ||||
| echo "reset" > /opt/amdgpu/etc/gpu_state | ||||
|  | ||||
| while true; do | ||||
|         sleep 3 | ||||
|         if grep -q clean /opt/amdgpu/etc/gpu_state; then | ||||
|                 echo "GPUs state is \"clean\"" | ||||
|                 break | ||||
|         fi | ||||
| done | ||||
|  | ||||
| echo "--- Pulling container"  | ||||
| image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" | ||||
| container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" | ||||
| docker pull "${image_name}" | ||||
|  | ||||
| remove_docker_container() { | ||||
|    docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| echo "--- Running container" | ||||
|  | ||||
| HF_CACHE="$(realpath ~)/huggingface" | ||||
| mkdir -p "${HF_CACHE}" | ||||
| HF_MOUNT="/root/.cache/huggingface" | ||||
|  | ||||
| commands=$@ | ||||
| echo "Commands:$commands" | ||||
| # Ignore certain kernel tests | ||||
| if [[ $commands == *" kernels "* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/test_attention.py \ | ||||
|   --ignore=kernels/test_attention_selector.py \ | ||||
|   --ignore=kernels/test_blocksparse_attention.py \ | ||||
|   --ignore=kernels/test_causal_conv1d.py \ | ||||
|   --ignore=kernels/test_cutlass.py \ | ||||
|   --ignore=kernels/test_encoder_decoder_attn.py \ | ||||
|   --ignore=kernels/test_flash_attn.py \ | ||||
|   --ignore=kernels/test_flashinfer.py \ | ||||
|   --ignore=kernels/test_int8_quant.py \ | ||||
|   --ignore=kernels/test_machete_gemm.py \ | ||||
|   --ignore=kernels/test_mamba_ssm.py \ | ||||
|   --ignore=kernels/test_marlin_gemm.py \ | ||||
|   --ignore=kernels/test_moe.py \ | ||||
|   --ignore=kernels/test_prefix_prefill.py \ | ||||
|   --ignore=kernels/test_rand.py \ | ||||
|   --ignore=kernels/test_sampler.py" | ||||
| fi | ||||
|  | ||||
| # Ignore certain entrypoints tests | ||||
| if [[ $commands == *" entrypoints/openai "* ]]; then | ||||
|   commands=${commands//" entrypoints/openai "/" entrypoints/openai \ | ||||
|   --ignore=entrypoints/openai/test_accuracy.py \ | ||||
|   --ignore=entrypoints/openai/test_audio.py \ | ||||
|   --ignore=entrypoints/openai/test_encoder_decoder.py \ | ||||
|   --ignore=entrypoints/openai/test_embedding.py \ | ||||
|   --ignore=entrypoints/openai/test_oot_registration.py "} | ||||
| fi | ||||
|  | ||||
| PARALLEL_JOB_COUNT=8 | ||||
| # Check whether the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs. | ||||
| if [[ $commands == *"--shard-id="* ]]; then | ||||
|   # assign job count as the number of shards used    | ||||
|   commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} | ||||
|   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do | ||||
|     # assign shard-id for each shard | ||||
|     commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} | ||||
|     echo "Shard ${GPU} commands:$commands_gpu" | ||||
|     docker run \ | ||||
|         --device /dev/kfd --device /dev/dri \ | ||||
|         --network host \ | ||||
|         --shm-size=16gb \ | ||||
|         --rm \ | ||||
|         -e HIP_VISIBLE_DEVICES="${GPU}" \ | ||||
|         -e HF_TOKEN \ | ||||
|         -v "${HF_CACHE}:${HF_MOUNT}" \ | ||||
|         -e "HF_HOME=${HF_MOUNT}" \ | ||||
|         --name "${container_name}_${GPU}" \ | ||||
|         "${image_name}" \ | ||||
|         /bin/bash -c "${commands_gpu}" \ | ||||
|         |& while read -r line; do echo ">>Shard $GPU: $line"; done & | ||||
|     PIDS+=($!) | ||||
|   done | ||||
|   #wait for all processes to finish and collect exit codes | ||||
|   for pid in "${PIDS[@]}"; do | ||||
|     wait "${pid}" | ||||
|     STATUS+=($?) | ||||
|   done | ||||
|   for st in "${STATUS[@]}"; do | ||||
|     if [[ ${st} -ne 0 ]]; then | ||||
|       echo "One of the processes failed with $st" | ||||
|       exit "${st}" | ||||
|     fi | ||||
|   done | ||||
| else | ||||
|   docker run \ | ||||
|           --device /dev/kfd --device /dev/dri \ | ||||
|           --network host \ | ||||
|           --shm-size=16gb \ | ||||
|           --rm \ | ||||
|           -e HIP_VISIBLE_DEVICES=0 \ | ||||
|           -e HF_TOKEN \ | ||||
|           -v "${HF_CACHE}:${HF_MOUNT}" \ | ||||
|           -e "HF_HOME=${HF_MOUNT}" \ | ||||
|           --name "${container_name}" \ | ||||
|           "${image_name}" \ | ||||
|           /bin/bash -c "${commands}" | ||||
| fi | ||||
| @ -5,26 +5,26 @@ | ||||
| set -ex | ||||
| set -o pipefail | ||||
| 
 | ||||
| # cd 2 levels into the working directory | ||||
| cd "$(dirname "${BASH_SOURCE[0]}")/../.." | ||||
| # cd into parent directory of this file | ||||
| cd "$(dirname "${BASH_SOURCE[0]}")/.." | ||||
| 
 | ||||
| (which wget && which curl) || (apt-get update && apt-get install -y wget curl) | ||||
| 
 | ||||
| # run python-based benchmarks and upload the result to buildkite | ||||
| vllm bench latency --output-json latency_results.json 2>&1 | tee benchmark_latency.txt | ||||
| python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt | ||||
| bench_latency_exit_code=$? | ||||
| 
 | ||||
| vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt | ||||
| python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt | ||||
| bench_throughput_exit_code=$? | ||||
| 
 | ||||
| # run server-based benchmarks and upload the result to buildkite | ||||
| vllm serve meta-llama/Llama-2-7b-chat-hf & | ||||
| python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & | ||||
| server_pid=$! | ||||
| wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json | ||||
| 
 | ||||
| # wait for server to start, timeout after 600 seconds | ||||
| timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 | ||||
| vllm bench serve \ | ||||
| python3 benchmarks/benchmark_serving.py \ | ||||
|     --backend vllm \ | ||||
|     --dataset-name sharegpt \ | ||||
|     --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \ | ||||
							
								
								
									
										14
									
								
								.buildkite/run-cpu-test-ppc64le.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										14
									
								
								.buildkite/run-cpu-test-ppc64le.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,14 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the CPU docker image and runs the offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Try building the docker image | ||||
| docker build -t cpu-test -f Dockerfile.ppc64le . | ||||
|  | ||||
							
								
								
									
										85
									
								
								.buildkite/run-cpu-test.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										85
									
								
								.buildkite/run-cpu-test.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,85 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the CPU docker image and runs the offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # allow binding to different cores | ||||
| CORE_RANGE=${CORE_RANGE:-48-95} | ||||
| NUMA_NODE=${NUMA_NODE:-1} | ||||
|  | ||||
| # Try building the docker image | ||||
| numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu . | ||||
| numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image, setting --shm-size=4g for tensor parallel. | ||||
| docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \ | ||||
|  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test | ||||
| docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ | ||||
|  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2 | ||||
|  | ||||
| function cpu_tests() { | ||||
|   set -e | ||||
|   export NUMA_NODE=$2 | ||||
|  | ||||
|   # offline inference | ||||
|   docker exec cpu-test-avx2-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     python3 examples/offline_inference.py" | ||||
|  | ||||
|   # Run basic model test | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pip install pytest pytest-asyncio \ | ||||
|       decord einops librosa peft Pillow sentence-transformers soundfile \ | ||||
|       transformers_stream_generator matplotlib datamodel_code_generator | ||||
|     pip install torchvision --index-url https://download.pytorch.org/whl/cpu | ||||
|     pytest -v -s tests/models/decoder_only/language -m cpu_model | ||||
|     pytest -v -s tests/models/embedding/language -m cpu_model | ||||
|     pytest -v -s tests/models/encoder_decoder/language -m cpu_model | ||||
|     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model | ||||
|     pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" | ||||
|  | ||||
|   # Run compressed-tensor test | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pytest -s -v \ | ||||
|     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ | ||||
|     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" | ||||
|  | ||||
|   # Run AWQ test | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pytest -s -v \ | ||||
|     tests/quantization/test_ipex_quant.py" | ||||
|  | ||||
|   # Run chunked-prefill and prefix-cache test | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pytest -s -v -k cpu_model \ | ||||
|     tests/basic_correctness/test_chunked_prefill.py"   | ||||
|  | ||||
|   # online inference | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     export VLLM_CPU_KVCACHE_SPACE=10  | ||||
|     export VLLM_CPU_OMP_THREADS_BIND=$1 | ||||
|     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &  | ||||
|     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 | ||||
|     python3 benchmarks/benchmark_serving.py \ | ||||
|       --backend vllm \ | ||||
|       --dataset-name random \ | ||||
|       --model facebook/opt-125m \ | ||||
|       --num-prompts 20 \ | ||||
|       --endpoint /v1/completions \ | ||||
|       --tokenizer facebook/opt-125m" | ||||
| } | ||||
|  | ||||
| # All of the CPU tests are expected to finish in less than 25 minutes. | ||||
| export -f cpu_tests | ||||
| timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" | ||||
							
								
								
									
										28
									
								
								.buildkite/run-gh200-test.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								.buildkite/run-gh200-test.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,28 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the GH200 docker image and runs the offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile | ||||
| python3 use_existing_torch.py | ||||
|  | ||||
| # Try building the docker image | ||||
| DOCKER_BUILDKIT=1 docker build . \ | ||||
|   --target vllm-openai \ | ||||
|   --platform "linux/arm64" \ | ||||
|   -t gh200-test \ | ||||
|   --build-arg max_jobs=66 \ | ||||
|   --build-arg nvcc_threads=2 \ | ||||
|   --build-arg torch_cuda_arch_list="9.0+PTX" \ | ||||
|   --build-arg vllm_fa_cmake_gpu_arches="90-real" | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f gh200-test || true; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image and test offline inference | ||||
| docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' | ||||
|     python3 examples/offline_inference.py | ||||
| ' | ||||
							
								
								
									
										16
									
								
								.buildkite/run-hpu-test.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								.buildkite/run-hpu-test.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,16 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the HPU docker image and runs the offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Try building the docker image | ||||
| docker build -t hpu-test-env -f Dockerfile.hpu . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f hpu-test || true; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image and launch offline inference | ||||
| docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py | ||||
| @ -3,7 +3,7 @@ | ||||
| set -euox pipefail | ||||
| 
 | ||||
| if [[ $# -lt 4 ]]; then | ||||
|     echo "Usage: .buildkite/scripts/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" | ||||
|     echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN" | ||||
|     exit 1 | ||||
| fi | ||||
| 
 | ||||
							
								
								
									
										53
									
								
								.buildkite/run-neuron-test.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								.buildkite/run-neuron-test.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,53 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the Neuron docker image and runs the API server inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -e | ||||
|  | ||||
| # Try building the docker image | ||||
| aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com | ||||
|  | ||||
| # prune old image and containers to save disk space, and only once a day | ||||
| # by using a timestamp file in tmp. | ||||
| if [ -f /tmp/neuron-docker-build-timestamp ]; then | ||||
|     last_build=$(cat /tmp/neuron-docker-build-timestamp) | ||||
|     current_time=$(date +%s) | ||||
|     if [ $((current_time - last_build)) -gt 86400 ]; then | ||||
|         docker system prune -f | ||||
|         echo "$current_time" > /tmp/neuron-docker-build-timestamp | ||||
|     fi | ||||
| else | ||||
|     date "+%s" > /tmp/neuron-docker-build-timestamp | ||||
| fi | ||||
|  | ||||
| docker build -t neuron -f Dockerfile.neuron . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f neuron || true; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image | ||||
| docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \ | ||||
|        --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 & | ||||
|  | ||||
| # Wait for the server to start. | ||||
| # Polls the /health endpoint once per second until it answers with HTTP 200. | ||||
| # Returns 0 as soon as the server is healthy, and 1 if it is still not healthy | ||||
| # after the timeout, so that `set -e` aborts the script at the real cause | ||||
| # instead of sending the test prompt to a server that never came up. | ||||
| wait_for_server_to_start() { | ||||
|     local timeout=300 | ||||
|     local counter=0 | ||||
|  | ||||
|     while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do | ||||
|         sleep 1 | ||||
|         counter=$((counter + 1)) | ||||
|         if [ "$counter" -ge "$timeout" ]; then | ||||
|             echo "Timeout after $timeout seconds" | ||||
|             # Report the failure to the caller rather than falling through. | ||||
|             return 1 | ||||
|         fi | ||||
|     done | ||||
| } | ||||
| wait_for_server_to_start | ||||
|  | ||||
| # Test a simple prompt | ||||
| curl -X POST -H "Content-Type: application/json" \ | ||||
|     localhost:8000/generate \ | ||||
|     -d '{"prompt": "San Francisco is a"}' | ||||
							
								
								
									
										16
									
								
								.buildkite/run-openvino-test.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										16
									
								
								.buildkite/run-openvino-test.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,16 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the OpenVINO docker image and runs the offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Try building the docker image | ||||
| docker build -t openvino-test -f Dockerfile.openvino . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f openvino-test || true; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image and launch offline inference | ||||
| docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py | ||||
							
								
								
									
										17
									
								
								.buildkite/run-tpu-test.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										17
									
								
								.buildkite/run-tpu-test.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,17 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -e | ||||
|  | ||||
| # Build the docker image. | ||||
| docker build -f Dockerfile.tpu -t vllm-tpu . | ||||
|  | ||||
| # Set up cleanup. | ||||
| remove_docker_container() { docker rm -f tpu-test || true; } | ||||
| trap remove_docker_container EXIT | ||||
| # Remove the container that might not be cleaned up in the previous run. | ||||
| remove_docker_container | ||||
|  | ||||
| # For HF_TOKEN. | ||||
| source /etc/environment | ||||
| # Run a simple end-to-end example. | ||||
| docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" | ||||
							
								
								
									
										19
									
								
								.buildkite/run-xpu-test.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								.buildkite/run-xpu-test.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,19 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the XPU docker image and runs the offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Try building the docker image | ||||
| docker build -t xpu-test -f Dockerfile.xpu . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { docker rm -f xpu-test || true; } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image and test offline inference/tensor parallel | ||||
| docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' | ||||
|     python3 examples/offline_inference.py | ||||
|     python3 examples/offline_inference_cli.py -tp 2 | ||||
| ' | ||||
| @ -1,46 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| # Get release version and strip leading 'v' if present | ||||
| RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//') | ||||
|  | ||||
| if [ -z "$RELEASE_VERSION" ]; then | ||||
|   echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid." | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF | ||||
| To download the wheel: | ||||
| \`\`\` | ||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl . | ||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl . | ||||
|  | ||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl . | ||||
| aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl . | ||||
| \`\`\` | ||||
|  | ||||
| To download and upload the image: | ||||
|  | ||||
| \`\`\` | ||||
| docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 | ||||
| docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 | ||||
|  | ||||
| docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64 | ||||
| docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64 | ||||
| docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 | ||||
| docker push vllm/vllm-openai:latest-x86_64 | ||||
| docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 | ||||
|  | ||||
| docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64 | ||||
| docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64 | ||||
| docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 | ||||
| docker push vllm/vllm-openai:latest-aarch64 | ||||
| docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 | ||||
|  | ||||
| docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend | ||||
| docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend | ||||
| docker manifest push vllm/vllm-openai:latest | ||||
| docker manifest push vllm/vllm-openai:v${RELEASE_VERSION} | ||||
| \`\`\` | ||||
| EOF  | ||||
| @ -1,17 +0,0 @@ | ||||
| #!/bin/bash | ||||
| # Usage: ./ci_clean_log.sh ci.log | ||||
| # This script strips timestamps and color codes from CI log files. | ||||
|  | ||||
| # Check if argument is given | ||||
| if [ $# -lt 1 ]; then | ||||
|     echo "Usage: $0 ci.log" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| INPUT_FILE="$1" | ||||
|  | ||||
| # Strip timestamps | ||||
| sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE" | ||||
|  | ||||
| # Strip colorization | ||||
| sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE" | ||||
| @ -1,120 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| # Clean up old nightly builds from DockerHub, keeping only the last 14 builds | ||||
| # This script uses DockerHub API to list and delete old tags with "nightly-" prefix | ||||
|  | ||||
| # DockerHub API endpoint for vllm/vllm-openai repository | ||||
| REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" | ||||
|  | ||||
| # Get DockerHub credentials from environment | ||||
| if [ -z "$DOCKERHUB_TOKEN" ]; then | ||||
|     echo "Error: DOCKERHUB_TOKEN environment variable is not set" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| if [ -z "$DOCKERHUB_USERNAME" ]; then | ||||
|     echo "Error: DOCKERHUB_USERNAME environment variable is not set" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| # Get DockerHub bearer token | ||||
| echo "Getting DockerHub bearer token..." | ||||
| set +x | ||||
| BEARER_TOKEN=$(curl -s -X POST \ | ||||
|     -H "Content-Type: application/json" \ | ||||
|     -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \ | ||||
|     "https://hub.docker.com/v2/users/login" | jq -r '.token') | ||||
| set -x | ||||
|  | ||||
| if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then | ||||
|     echo "Error: Failed to get DockerHub bearer token" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| # Function to get all "nightly-" tags from DockerHub. | ||||
| # Walks the paginated tag listing at $REPO_API_URL (100 entries per page), | ||||
| # authenticating with $BEARER_TOKEN, and prints the matching tag names on | ||||
| # stdout, one per line, sorted by last_updated timestamp (newest first). | ||||
| get_all_tags() { | ||||
|     local page=1 | ||||
|     local all_tags="" | ||||
|     # Declared separately from their assignments so that `local` does not | ||||
|     # mask the exit status of the command substitutions below. | ||||
|     local response result_count tags | ||||
|  | ||||
|     while true; do | ||||
|         # Disable tracing so the bearer token is not echoed into the CI log. | ||||
|         set +x | ||||
|         response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \ | ||||
|             "$REPO_API_URL?page=$page&page_size=100") | ||||
|         set -x | ||||
|  | ||||
|         # Stop only when the page carries no results at all (end of the listing | ||||
|         # or an error payload). A page that merely has no nightly tags must not | ||||
|         # end the pagination, otherwise nightly tags on later pages are missed. | ||||
|         result_count=$(echo "$response" | jq -r '.results | length' 2>/dev/null) || result_count=0 | ||||
|         if [ "${result_count:-0}" -eq 0 ]; then | ||||
|             break | ||||
|         fi | ||||
|  | ||||
|         # Get both last_updated timestamp and tag name, separated by | | ||||
|         tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"') | ||||
|  | ||||
|         if [ -n "$tags" ]; then | ||||
|             all_tags="$all_tags$tags"$'\n' | ||||
|         fi | ||||
|         page=$((page + 1)) | ||||
|     done | ||||
|  | ||||
|     # Sort by timestamp (newest first) and extract just the tag names | ||||
|     echo "$all_tags" | sort -r | cut -d'|' -f2 | ||||
| } | ||||
|  | ||||
| delete_tag() { | ||||
|     local tag_name="$1" | ||||
|     echo "Deleting tag: $tag_name" | ||||
|      | ||||
|     local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" | ||||
|     set +x | ||||
|     local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url") | ||||
|     set -x | ||||
|      | ||||
|     if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then | ||||
|         echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')" | ||||
|     else | ||||
|         echo "Successfully deleted tag: $tag_name" | ||||
|     fi | ||||
| } | ||||
|  | ||||
| # Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first) | ||||
| echo "Fetching all tags from DockerHub..." | ||||
| all_tags=$(get_all_tags) | ||||
|  | ||||
| if [ -z "$all_tags" ]; then | ||||
|     echo "No tags found to clean up" | ||||
|     exit 0 | ||||
| fi | ||||
|  | ||||
| # Count total tags | ||||
| total_tags=$(echo "$all_tags" | wc -l) | ||||
| echo "Found $total_tags tags" | ||||
|  | ||||
| # Keep only the last 14 builds (including the current one) | ||||
| tags_to_keep=14 | ||||
| tags_to_delete=$((total_tags - tags_to_keep)) | ||||
|  | ||||
| if [ $tags_to_delete -le 0 ]; then | ||||
|     echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)" | ||||
|     exit 0 | ||||
| fi | ||||
|  | ||||
| echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep" | ||||
|  | ||||
| # Get tags to delete (skip the first $tags_to_keep tags) | ||||
| tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1))) | ||||
|  | ||||
| if [ -z "$tags_to_delete_list" ]; then | ||||
|     echo "No tags to delete" | ||||
|     exit 0 | ||||
| fi | ||||
|  | ||||
| # Delete old tags | ||||
| echo "Deleting old tags..." | ||||
| while IFS= read -r tag; do | ||||
|     if [ -n "$tag" ]; then | ||||
|         delete_tag "$tag" | ||||
|         # Add a small delay to avoid rate limiting | ||||
|         sleep 1 | ||||
|     fi | ||||
| done <<< "$tags_to_delete_list" | ||||
|  | ||||
| echo "Cleanup completed successfully" | ||||
| @ -1,231 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script runs tests inside the corresponding ROCm docker container. | ||||
| set -o pipefail | ||||
|  | ||||
| # Export Python path | ||||
| export PYTHONPATH=".." | ||||
|  | ||||
| # Wait for the GPUs to report a clean state before continuing | ||||
| echo "--- Confirming Clean Initial State" | ||||
| while true; do | ||||
|         sleep 3 | ||||
|         if grep -q clean /opt/amdgpu/etc/gpu_state; then | ||||
|                 echo "GPUs state is \"clean\"" | ||||
|                 break | ||||
|         fi | ||||
| done | ||||
|  | ||||
| echo "--- ROCm info" | ||||
| rocminfo | ||||
|  | ||||
| # cleanup older docker images | ||||
| cleanup_docker() { | ||||
|   # Get Docker's root directory | ||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||
|   if [ -z "$docker_root" ]; then | ||||
|     echo "Failed to determine Docker root directory." | ||||
|     exit 1 | ||||
|   fi | ||||
|   echo "Docker root directory: $docker_root" | ||||
|   # Check disk usage of the filesystem where Docker's root directory is located | ||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||
|   # Define the threshold | ||||
|   threshold=70 | ||||
|   if [ "$disk_usage" -gt "$threshold" ]; then | ||||
|     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||
|     # Remove dangling images (those that are not tagged and not used by any container) | ||||
|     docker image prune -f | ||||
|     # Remove unused volumes / force the system prune for old images as well. | ||||
|     docker volume prune -f && docker system prune --force --filter "until=72h" --all | ||||
|     echo "Docker images and volumes cleanup completed." | ||||
|   else | ||||
|     echo "Disk usage is below $threshold%. No cleanup needed." | ||||
|   fi | ||||
| } | ||||
|  | ||||
| # Call the cleanup docker function | ||||
| cleanup_docker | ||||
|  | ||||
| echo "--- Resetting GPUs" | ||||
|  | ||||
| echo "reset" > /opt/amdgpu/etc/gpu_state | ||||
|  | ||||
| while true; do | ||||
|         sleep 3 | ||||
|         if grep -q clean /opt/amdgpu/etc/gpu_state; then | ||||
|                 echo "GPUs state is \"clean\"" | ||||
|                 break | ||||
|         fi | ||||
| done | ||||
|  | ||||
| echo "--- Pulling container"  | ||||
| image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" | ||||
| container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" | ||||
| docker pull "${image_name}" | ||||
|  | ||||
| remove_docker_container() { | ||||
|    docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| echo "--- Running container" | ||||
|  | ||||
| HF_CACHE="$(realpath ~)/huggingface" | ||||
| mkdir -p "${HF_CACHE}" | ||||
| HF_MOUNT="/root/.cache/huggingface" | ||||
|  | ||||
| commands=$@ | ||||
| echo "Commands:$commands" | ||||
|  | ||||
| if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then | ||||
|   commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"} | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then | ||||
|   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then | ||||
|   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *"pytest -v -s lora"* ]]; then | ||||
|   commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"} | ||||
| fi | ||||
|  | ||||
| #ignore certain kernels tests | ||||
| if [[ $commands == *" kernels/core"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/core/test_fused_quant_layernorm.py \ | ||||
|   --ignore=kernels/core/test_permute_cols.py" | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *" kernels/attention"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/attention/test_attention_selector.py \ | ||||
|   --ignore=kernels/attention/test_encoder_decoder_attn.py \ | ||||
|   --ignore=kernels/attention/test_flash_attn.py \ | ||||
|   --ignore=kernels/attention/test_flashinfer.py \ | ||||
|   --ignore=kernels/attention/test_prefix_prefill.py \ | ||||
|   --ignore=kernels/attention/test_cascade_flash_attn.py \ | ||||
|   --ignore=kernels/attention/test_mha_attn.py \ | ||||
|   --ignore=kernels/attention/test_lightning_attn.py \ | ||||
|   --ignore=kernels/attention/test_attention.py" | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *" kernels/quantization"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/quantization/test_int8_quant.py \ | ||||
|   --ignore=kernels/quantization/test_machete_mm.py \ | ||||
|   --ignore=kernels/quantization/test_block_fp8.py \ | ||||
|   --ignore=kernels/quantization/test_block_int8.py \ | ||||
|   --ignore=kernels/quantization/test_marlin_gemm.py \ | ||||
|   --ignore=kernels/quantization/test_cutlass_scaled_mm.py \ | ||||
|   --ignore=kernels/quantization/test_int8_kernel.py" | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *" kernels/mamba"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/mamba/test_mamba_mixer2.py \ | ||||
|   --ignore=kernels/mamba/test_causal_conv1d.py \ | ||||
|   --ignore=kernels/mamba/test_mamba_ssm_ssd.py" | ||||
| fi | ||||
|  | ||||
| if [[ $commands == *" kernels/moe"* ]]; then | ||||
|   commands="${commands} \ | ||||
|   --ignore=kernels/moe/test_moe.py \ | ||||
|   --ignore=kernels/moe/test_cutlass_moe.py \ | ||||
|   --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py" | ||||
| fi | ||||
|  | ||||
| #ignore certain Entrypoints/openai tests | ||||
| if [[ $commands == *" entrypoints/openai "* ]]; then | ||||
|   commands=${commands//" entrypoints/openai "/" entrypoints/openai \ | ||||
|   --ignore=entrypoints/openai/test_audio.py \ | ||||
|   --ignore=entrypoints/openai/test_shutdown.py \ | ||||
|   --ignore=entrypoints/openai/test_completion.py \ | ||||
|   --ignore=entrypoints/openai/test_sleep.py \ | ||||
|   --ignore=entrypoints/openai/test_models.py \ | ||||
|   --ignore=entrypoints/openai/test_lora_adapters.py \ | ||||
|   --ignore=entrypoints/openai/test_return_tokens_as_ids.py \ | ||||
|   --ignore=entrypoints/openai/test_root_path.py \ | ||||
|   --ignore=entrypoints/openai/test_tokenization.py \ | ||||
|   --ignore=entrypoints/openai/test_prompt_validation.py "} | ||||
| fi | ||||
|  | ||||
| #ignore certain Entrypoints/llm tests | ||||
| if [[ $commands == *" entrypoints/llm "* ]]; then | ||||
|   commands=${commands//" entrypoints/llm "/" entrypoints/llm \ | ||||
|   --ignore=entrypoints/llm/test_chat.py \ | ||||
|   --ignore=entrypoints/llm/test_accuracy.py \ | ||||
|   --ignore=entrypoints/llm/test_init.py \ | ||||
|   --ignore=entrypoints/llm/test_prompt_validation.py "} | ||||
| fi | ||||
|  | ||||
| # --ignore=entrypoints/openai/test_encoder_decoder.py \ | ||||
| # --ignore=entrypoints/openai/test_embedding.py \ | ||||
| # --ignore=entrypoints/openai/test_oot_registration.py | ||||
| # --ignore=entrypoints/openai/test_accuracy.py \ | ||||
| # --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13 | ||||
|  | ||||
|  | ||||
| PARALLEL_JOB_COUNT=8 | ||||
| MYPYTHONPATH=".." | ||||
|  | ||||
| # Check whether the command contains the shard flag; if so, run all shards in parallel because the host has 8 GPUs. | ||||
| if [[ $commands == *"--shard-id="* ]]; then | ||||
|   # assign job count as the number of shards used    | ||||
|   commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} | ||||
|   for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do | ||||
|     # assign shard-id for each shard | ||||
|     commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} | ||||
|     echo "Shard ${GPU} commands:$commands_gpu" | ||||
|     echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" | ||||
|     docker run \ | ||||
|         --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ | ||||
|         --network=host \ | ||||
|         --shm-size=16gb \ | ||||
|         --rm \ | ||||
|         -e HIP_VISIBLE_DEVICES="${GPU}" \ | ||||
|         -e HF_TOKEN \ | ||||
|         -e AWS_ACCESS_KEY_ID \ | ||||
|         -e AWS_SECRET_ACCESS_KEY \ | ||||
|         -v "${HF_CACHE}:${HF_MOUNT}" \ | ||||
|         -e "HF_HOME=${HF_MOUNT}" \ | ||||
|         -e "PYTHONPATH=${MYPYTHONPATH}" \ | ||||
|         --name "${container_name}_${GPU}" \ | ||||
|         "${image_name}" \ | ||||
|         /bin/bash -c "${commands_gpu}" \ | ||||
|         |& while read -r line; do echo ">>Shard $GPU: $line"; done & | ||||
|     PIDS+=($!) | ||||
|   done | ||||
|   #wait for all processes to finish and collect exit codes | ||||
|   for pid in "${PIDS[@]}"; do | ||||
|     wait "${pid}" | ||||
|     STATUS+=($?) | ||||
|   done | ||||
|   for st in "${STATUS[@]}"; do | ||||
|     if [[ ${st} -ne 0 ]]; then | ||||
|       echo "One of the processes failed with $st" | ||||
|       exit "${st}" | ||||
|     fi | ||||
|   done | ||||
| else | ||||
|   echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES" | ||||
|   docker run \ | ||||
|           --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ | ||||
|           --network=host \ | ||||
|           --shm-size=16gb \ | ||||
|           --rm \ | ||||
|           -e HIP_VISIBLE_DEVICES=0 \ | ||||
|           -e HF_TOKEN \ | ||||
|           -e AWS_ACCESS_KEY_ID \ | ||||
|           -e AWS_SECRET_ACCESS_KEY \ | ||||
|           -v "${HF_CACHE}:${HF_MOUNT}" \ | ||||
|           -e "HF_HOME=${HF_MOUNT}" \ | ||||
|           -e "PYTHONPATH=${MYPYTHONPATH}" \ | ||||
|           --name "${container_name}" \ | ||||
|           "${image_name}" \ | ||||
|           /bin/bash -c "${commands}" | ||||
| fi | ||||
| @ -1,52 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the CPU docker image and runs the offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|   if [[ -n "$container_id" ]]; then | ||||
|       podman stop --all -t0 | ||||
|       podman rm -f "$container_id" || true | ||||
|   fi | ||||
|   podman system prune -f | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Try building the docker image | ||||
| podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le . | ||||
|  | ||||
| # Run the image | ||||
| container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc) | ||||
|  | ||||
| function cpu_tests() { | ||||
|   # Runs the ppc64le CPU smoke tests inside the container identified by $container_id. | ||||
|   # stdout of each `podman exec` is appended to a log file under $HOME. | ||||
|  | ||||
|   # Offline inference with facebook/opt-125m; output is appended to $HOME/test_basic.log | ||||
|   podman exec -it "$container_id" bash -c " | ||||
|     set -xve | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log | ||||
|  | ||||
|   # Basic model tests: install the test dependencies, then run the selected pytest cases; | ||||
|   # output is appended to $HOME/test_rest.log | ||||
|   podman exec -it "$container_id" bash -c " | ||||
|     set -evx | ||||
|     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib | ||||
|     pip install sentence-transformers datamodel_code_generator | ||||
|  | ||||
|     # Note: disable Bart until supports V1 | ||||
|     # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model | ||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] | ||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] | ||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] | ||||
|     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] | ||||
|     # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. | ||||
|     # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log | ||||
| } | ||||
|  | ||||
| # All CPU tests are expected to finish in under 40 minutes; the 120-minute timeout below is only a hard upper bound. | ||||
|  | ||||
| export container_id | ||||
| export -f cpu_tests | ||||
| timeout 120m bash -c cpu_tests | ||||
|  | ||||
| @ -1,13 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the s390x CPU docker image. | ||||
| # It serves as a sanity check for compilation. | ||||
| set -ex | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|   # Force-remove the cpu-test container (a missing container is not an error), | ||||
|   # then prune unused Docker data. | ||||
|   docker rm -f cpu-test || true | ||||
|   docker system prune -f | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Try building the docker image | ||||
| docker build -t cpu-test -f docker/Dockerfile.s390x . | ||||
| @ -1,119 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script build the CPU docker image and run the offline inference inside the container. | ||||
| # It serves a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # allow to bind to different cores | ||||
| CORE_RANGE=${CORE_RANGE:-48-95} | ||||
| # used for TP/PP E2E test | ||||
| OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} | ||||
| NUMA_NODE=${NUMA_NODE:-1} | ||||
|  | ||||
| export CMAKE_BUILD_PARALLEL_LEVEL=32 | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|     # errexit stays enabled, but a missing container must never fail the cleanup. | ||||
|     set -e | ||||
|     # Both test containers share the per-NUMA-node name prefix. | ||||
|     local base="cpu-test-${NUMA_NODE}" | ||||
|     docker rm -f "${base}" "${base}-avx2" || true | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Try building the docker image | ||||
| numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu . | ||||
| numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . | ||||
|  | ||||
| # Run the image, setting --shm-size=4g for tensor parallel. | ||||
| docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" | ||||
| docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 | ||||
|  | ||||
| function cpu_tests() { | ||||
|   # Runs the x86 CPU test suite inside the cpu-test-<NUMA_NODE> containers. | ||||
|   # Arguments: $1 - core range (passed by the caller but not read here) | ||||
|   #            $2 - NUMA node, used to select the container names | ||||
|   # Each step runs in its own `docker exec` shell with `set -e`. | ||||
|   set -e | ||||
|   export NUMA_NODE=$2 | ||||
|  | ||||
|   # list packages | ||||
|   docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " | ||||
|     set -e | ||||
|     pip list" | ||||
|  | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pip list" | ||||
|  | ||||
|   # offline inference (run in the AVX2-only container) | ||||
|   docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c " | ||||
|     set -e | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" | ||||
|  | ||||
|   # Run kernel tests | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pytest -x -v -s tests/kernels/test_onednn.py" | ||||
|  | ||||
|   # Run basic model test | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     # Note: disable until supports V1 | ||||
|     # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model | ||||
|     # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model | ||||
|  | ||||
|     pytest -x -v -s tests/models/language/generation -m cpu_model | ||||
|     VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model | ||||
|  | ||||
|     pytest -x -v -s tests/models/language/pooling -m cpu_model | ||||
|     pytest -x -v -s tests/models/multimodal/generation \ | ||||
|                 --ignore=tests/models/multimodal/generation/test_pixtral.py \ | ||||
|                 -m cpu_model" | ||||
|  | ||||
|   # Run compressed-tensor test | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pytest -x -s -v \ | ||||
|     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" | ||||
|  | ||||
|   # Note: disable it until supports V1 | ||||
|   # Run AWQ test | ||||
|   # docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|   #   set -e | ||||
|   #   VLLM_USE_V1=0 pytest -x -s -v \ | ||||
|   #   tests/quantization/test_ipex_quant.py" | ||||
|  | ||||
|   # Run multi-lora tests | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pytest -x -s -v \ | ||||
|     tests/lora/test_qwen2vl.py" | ||||
|  | ||||
|   # online serving: tp+pp | ||||
|   # The payload is single-quoted, so $E2E_OMP_THREADS and $server_pid are expanded | ||||
|   # by the shell inside the container, not by this script. | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c ' | ||||
|     set -e | ||||
|     VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & | ||||
|     server_pid=$! | ||||
|     timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 | ||||
|     vllm bench serve \ | ||||
|       --backend vllm \ | ||||
|       --dataset-name random \ | ||||
|       --model meta-llama/Llama-3.2-3B-Instruct \ | ||||
|       --num-prompts 20 \ | ||||
|       --endpoint /v1/completions | ||||
|     kill -s SIGTERM $server_pid &' | ||||
|  | ||||
|   # online serving: tp+dp | ||||
|   # NOTE(review): if the benchmark above fails, `set -e` ends that shell before its | ||||
|   # `kill`, so the earlier server may still be listening on port 8000 - confirm. | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c ' | ||||
|     set -e | ||||
|     VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 & | ||||
|     server_pid=$! | ||||
|     timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 | ||||
|     vllm bench serve \ | ||||
|       --backend vllm \ | ||||
|       --dataset-name random \ | ||||
|       --model meta-llama/Llama-3.2-3B-Instruct \ | ||||
|       --num-prompts 20 \ | ||||
|       --endpoint /v1/completions | ||||
|     kill -s SIGTERM $server_pid &' | ||||
| } | ||||
|  | ||||
| # All CPU tests are expected to finish in under 40 minutes; the 2-hour timeout below is only a hard upper bound. | ||||
| export -f cpu_tests | ||||
| timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" | ||||
| @ -1,29 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the GH200 docker image and runs offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile | ||||
| python3 use_existing_torch.py | ||||
|  | ||||
| # Try building the docker image | ||||
| DOCKER_BUILDKIT=1 docker build . \ | ||||
|   --file docker/Dockerfile \ | ||||
|   --target vllm-openai \ | ||||
|   --platform "linux/arm64" \ | ||||
|   -t gh200-test \ | ||||
|   --build-arg max_jobs=66 \ | ||||
|   --build-arg nvcc_threads=2 \ | ||||
|   --build-arg RUN_WHEEL_CHECK=false \ | ||||
|   --build-arg torch_cuda_arch_list="9.0+PTX" | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|   # Force-remove the gh200-test container; a missing container is not an error. | ||||
|   docker rm -f gh200-test || true | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
| remove_docker_container | ||||
|  | ||||
| # Run the image and test offline inference | ||||
| docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c ' | ||||
|     python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B | ||||
| ' | ||||
| @ -1,56 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the HPU (Gaudi) plugin test image and runs the vllm-gaudi upstream CI tests inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -exuo pipefail | ||||
|  | ||||
| # Try building the docker image | ||||
| cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - . | ||||
| FROM gaudi-base-image:latest | ||||
|  | ||||
| COPY ./ /workspace/vllm | ||||
|  | ||||
| WORKDIR /workspace/vllm | ||||
|  | ||||
| ENV no_proxy=localhost,127.0.0.1 | ||||
| ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true | ||||
|  | ||||
| RUN VLLM_TARGET_DEVICE=empty pip install . | ||||
| RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git | ||||
|  | ||||
| # install development dependencies (for testing) | ||||
| RUN python3 -m pip install -e tests/vllm_test_utils | ||||
|  | ||||
| WORKDIR /workspace/ | ||||
|  | ||||
| RUN git clone https://github.com/vllm-project/vllm-gaudi.git | ||||
|  | ||||
| RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks | ||||
|  | ||||
| EOF | ||||
|  | ||||
| # Setup cleanup | ||||
| # certain versions of HPU software stack have a bug that can | ||||
| # override the exit code of the script, so we need to use | ||||
| # separate remove_docker_containers and remove_docker_containers_and_exit | ||||
| # functions, while other platforms only need one remove_docker_container | ||||
| # function. | ||||
| EXITCODE=1 | ||||
| remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; } | ||||
| trap 'remove_docker_containers; exit $EXITCODE;' EXIT | ||||
| remove_docker_containers | ||||
|  | ||||
| echo "Running HPU plugin v1 test" | ||||
| # The script runs under `set -e`: a bare failing `docker run` would abort the | ||||
| # script on the spot, so the EXIT trap would report the placeholder exit code | ||||
| # (1) and the failure message would never be printed. Running the command as | ||||
| # an `if` condition keeps the script alive and lets us record the real status. | ||||
| if docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \ | ||||
|   -e HABANA_VISIBLE_DEVICES=all \ | ||||
|   hpu-plugin-v1-test-env \ | ||||
|   /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"; then | ||||
|   EXITCODE=0 | ||||
|   echo "Test with basic model passed" | ||||
| else | ||||
|   # $? here is still the exit status of the `docker run` condition. | ||||
|   EXITCODE=$? | ||||
|   echo "Test with basic model FAILED with exit code: $EXITCODE" >&2 | ||||
| fi | ||||
|  | ||||
| # The trap will handle the container removal and final exit. | ||||
| @ -1,191 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the Ascend NPU docker image and runs the vLLM platform-interface tests inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Base ubuntu image with basic ascend development libraries and python installed | ||||
| VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git" | ||||
| CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg" | ||||
| TEST_RUN_CONFIG_FILE="vllm_test.cfg" | ||||
| VLLM_ASCEND_TMP_DIR= | ||||
| # Get the test run configuration file from the vllm-ascend repository. | ||||
| # Shallow-clones ${VLLM_ASCEND_REPO} into a temporary directory, copies | ||||
| # ${CONFIG_FILE_REMOTE_PATH} from it to ${TEST_RUN_CONFIG_FILE} in the current | ||||
| # directory, and removes the temporary directory again. | ||||
| # Exits with status 1 if the file is not present in the clone. | ||||
| fetch_vllm_test_cfg() { | ||||
|     VLLM_ASCEND_TMP_DIR=$(mktemp -d) | ||||
|     # Ensure that the temporary directory is cleaned up if the script exits while the configuration file is being retrieved | ||||
|     cleanup() { | ||||
|         rm -rf "${VLLM_ASCEND_TMP_DIR}" | ||||
|     } | ||||
|     trap cleanup EXIT | ||||
|  | ||||
|     # --depth 1 fetches only the latest commit; GIT_TRACE=1 and -v make the clone verbose in the log | ||||
|     GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}" | ||||
|     if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then | ||||
|         echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2 | ||||
|         exit 1 | ||||
|     fi | ||||
|  | ||||
|     # If the file already exists locally, just overwrite it | ||||
|     cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}" | ||||
|     echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}" | ||||
|  | ||||
|     # The EXIT trap is replaced later in the script, so the temporary directory is | ||||
|     # deleted manually here and the EXIT trap installed above is reset to the default. | ||||
|     rm -rf "${VLLM_ASCEND_TMP_DIR}" | ||||
|     trap - EXIT | ||||
| } | ||||
|  | ||||
| # Loads the test run configuration from the local file ${TEST_RUN_CONFIG_FILE} | ||||
| # into the current script environment by sourcing it. | ||||
| # This function does not download anything; it exits with status 1 when the file is missing. | ||||
| get_config() { | ||||
|     if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then | ||||
|         echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2 | ||||
|         exit 1 | ||||
|     fi | ||||
|     # Sourcing executes the file in this shell, so the variables it assigns stay visible to the rest of the script. | ||||
|     source "${TEST_RUN_CONFIG_FILE}" | ||||
|     # NOTE(review): BASE_IMAGE_NAME is presumably defined by the sourced file - confirm against vllm_test.cfg. | ||||
|     echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" | ||||
|     return 0 | ||||
| } | ||||
|  | ||||
| # get test running configuration. | ||||
| fetch_vllm_test_cfg | ||||
| get_config | ||||
| # Check if the function call was successful. If not, exit the script. | ||||
| if [ $? -ne 0 ]; then | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}" | ||||
| container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" | ||||
|  | ||||
| # BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards | ||||
| agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}') | ||||
| echo "agent_idx: ${agent_idx}" | ||||
| builder_name="cachebuilder${agent_idx}" | ||||
| builder_cache_dir="/mnt/docker-cache${agent_idx}" | ||||
| mkdir -p ${builder_cache_dir} | ||||
|  | ||||
| # Try building the docker image. | ||||
| # The Dockerfile is generated inline and piped to `docker build -f -`. | ||||
| # The heredoc delimiter is unquoted, so ${...} references and backticks are | ||||
| # expanded by this script before the build; a backslash-escaped \$ is passed | ||||
| # through to the Dockerfile as a literal $. | ||||
| cat <<EOF | DOCKER_BUILDKIT=1 docker build \ | ||||
|     --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \ | ||||
|     --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \ | ||||
|                            --cache-to type=local,dest=${builder_cache_dir},mode=max \ | ||||
|     --progress=plain --load -t ${image_name} -f - . | ||||
| FROM ${BASE_IMAGE_NAME} | ||||
|  | ||||
| # Define environments | ||||
| ENV DEBIAN_FRONTEND=noninteractive | ||||
|  | ||||
| RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \ | ||||
|     pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \ | ||||
|     apt-get update -y && \ | ||||
|     apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ | ||||
|     rm -rf /var/cache/apt/* && \ | ||||
|     rm -rf /var/lib/apt/lists/* | ||||
|  | ||||
| # Install for pytest to make the docker build cache layer always valid | ||||
| # The requirement is quoted: an unquoted >= is parsed by the shell as an output | ||||
| # redirection, which drops the version constraint and writes a stray file. | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     pip install "pytest>=6.0" modelscope | ||||
|  | ||||
| WORKDIR /workspace/vllm | ||||
|  | ||||
| # Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid. | ||||
| COPY requirements/common.txt /workspace/vllm/requirements/common.txt | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     pip install -r requirements/common.txt | ||||
|  | ||||
| COPY . . | ||||
|  | ||||
| # Install vLLM | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ | ||||
|     python3 -m pip uninstall -y triton | ||||
|  | ||||
| # Install vllm-ascend | ||||
| WORKDIR /workspace | ||||
| ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git | ||||
| ARG VLLM_ASCEND_TAG=main | ||||
| RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \ | ||||
|     git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend | ||||
|  | ||||
| # Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid. | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     pip install -r /workspace/vllm-ascend/requirements.txt | ||||
|  | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ | ||||
|     source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ | ||||
|     source /usr/local/Ascend/nnal/atb/set_env.sh && \ | ||||
|     export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ | ||||
|     python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ | ||||
|  | ||||
| ENV VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
| ENV VLLM_USE_MODELSCOPE=True | ||||
|  | ||||
| WORKDIR /workspace/vllm-ascend | ||||
|  | ||||
| CMD ["/bin/bash"] | ||||
|  | ||||
| EOF | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|   # Best-effort teardown: every step tolerates failure so the later steps | ||||
|   # still run (container first, then its image, then unused Docker data). | ||||
|   docker rm -f "${container_name}" || : | ||||
|   docker image rm -f "${image_name}" || : | ||||
|   docker system prune -f || : | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Generate corresponding --device args based on BUILDKITE_AGENT_NAME | ||||
| # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1. | ||||
| #   e.g. atlas-a2-001-1-2cards means this is the 1st agent on atlas-a2-001 host, and it has 2 NPU cards. | ||||
| #   prints --device /dev/davinci0 --device /dev/davinci1 | ||||
| # Arguments: $1 - the agent name to parse | ||||
| # Outputs:   the space-separated --device arguments on stdout (no trailing newline) | ||||
| # Returns:   1, with a message on stderr, when the name does not match the | ||||
| #            expected format or agent_idx is smaller than 1 | ||||
| parse_and_gen_devices() { | ||||
|     local input="$1" | ||||
|     local index cards_num | ||||
|     if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then | ||||
|         # Force base 10: bash arithmetic treats a leading 0 as octal, so a | ||||
|         # zero-padded field such as "08" would otherwise be a syntax error. | ||||
|         index=$((10#${BASH_REMATCH[1]})) | ||||
|         cards_num=$((10#${BASH_REMATCH[2]})) | ||||
|     else | ||||
|         echo "parse error" >&2 | ||||
|         return 1 | ||||
|     fi | ||||
|  | ||||
|     # agent_idx is 1-based; 0 would yield negative device numbers. | ||||
|     if (( index < 1 )); then | ||||
|         echo "parse error: agent_idx must start from 1" >&2 | ||||
|         return 1 | ||||
|     fi | ||||
|  | ||||
|     local devices="" | ||||
|     local i dev_idx | ||||
|     # A C-style for loop replaces the bare ((i++)) statement, whose exit status | ||||
|     # is non-zero when i is 0 and would abort the function under errexit. | ||||
|     for (( i = 0; i < cards_num; i++ )); do | ||||
|         # Each agent owns a contiguous slice of cards_num devices. | ||||
|         dev_idx=$(( (index - 1) * cards_num + i )) | ||||
|         devices="$devices --device /dev/davinci${dev_idx}" | ||||
|     done | ||||
|  | ||||
|     # trim leading space | ||||
|     devices="${devices#"${devices%%[![:space:]]*}"}" | ||||
|     # Output devices: assigned to the caller variable | ||||
|     printf '%s' "$devices" | ||||
| } | ||||
|  | ||||
| devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1 | ||||
|  | ||||
| # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware. | ||||
| # This test checks whether the OOT platform interface is functioning properly in conjunction with | ||||
| # the hardware plugin vllm-ascend. | ||||
| model_cache_dir=/mnt/modelscope${agent_idx} | ||||
| mkdir -p ${model_cache_dir} | ||||
| docker run \ | ||||
|     ${devices} \ | ||||
|     --device /dev/davinci_manager \ | ||||
|     --device /dev/devmm_svm \ | ||||
|     --device /dev/hisi_hdc \ | ||||
|     -v /usr/local/dcmi:/usr/local/dcmi \ | ||||
|     -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ | ||||
|     -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ | ||||
|     -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ | ||||
|     -v /etc/ascend_install.info:/etc/ascend_install.info \ | ||||
|     -v ${model_cache_dir}:/root/.cache/modelscope \ | ||||
|     --entrypoint="" \ | ||||
|     --name "${container_name}" \ | ||||
|     "${image_name}" \ | ||||
|     bash -c ' | ||||
|     set -e | ||||
|     pytest -v -s tests/e2e/vllm_interface/ | ||||
| ' | ||||
| @ -1,166 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -xu | ||||
|  | ||||
|  | ||||
| remove_docker_container() { | ||||
|     # Force-remove the tpu-test container; a missing container is not an error. | ||||
|     docker rm -f tpu-test || : | ||||
| } | ||||
|  | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Remove the container that might not be cleaned up in the previous run. | ||||
| remove_docker_container | ||||
|  | ||||
| # Build the docker image. | ||||
| docker build -f docker/Dockerfile.tpu -t vllm-tpu . | ||||
|  | ||||
| # Set up cleanup. | ||||
| cleanup_docker() { | ||||
|   # Prunes Docker data when the filesystem holding Docker's root directory | ||||
|   # is more than 70% full; exits the script if that directory is unknown. | ||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||
|   [ -n "$docker_root" ] || { | ||||
|     echo "Failed to determine Docker root directory." | ||||
|     exit 1 | ||||
|   } | ||||
|   echo "Docker root directory: $docker_root" | ||||
|  | ||||
|   # Fifth column of the last df row, with the percent sign removed. | ||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||
|   threshold=70 | ||||
|  | ||||
|   # Guard clause: nothing to do unless usage is above the threshold. | ||||
|   if ! [ "$disk_usage" -gt "$threshold" ]; then | ||||
|     echo "Disk usage is below $threshold%. No cleanup needed." | ||||
|     return | ||||
|   fi | ||||
|  | ||||
|   echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||
|   # Dangling images first (untagged and not used by any container). | ||||
|   docker image prune -f | ||||
|   # Then unused volumes and, only if that succeeds, everything unused for 72h or more. | ||||
|   docker volume prune -f && docker system prune --force --filter "until=72h" --all | ||||
|   echo "Docker images and volumes cleanup completed." | ||||
| } | ||||
| cleanup_docker | ||||
|  | ||||
| # For HF_TOKEN. | ||||
| source /etc/environment | ||||
|  | ||||
| docker run --privileged --net host --shm-size=16G -it \ | ||||
|     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ | ||||
|     vllm-tpu /bin/bash -c ' | ||||
| set -e # Exit immediately if a command exits with a non-zero status. | ||||
| set -u # Treat unset variables as an error. | ||||
|  | ||||
| echo "--- Starting script inside Docker container ---" | ||||
|  | ||||
| # Create results directory | ||||
| RESULTS_DIR=$(mktemp -d) | ||||
| # If mktemp fails, set -e will cause the script to exit. | ||||
| echo "Results will be stored in: $RESULTS_DIR" | ||||
|  | ||||
| # Install dependencies | ||||
| echo "--- Installing Python dependencies ---" | ||||
| python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ | ||||
|     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ | ||||
|     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ | ||||
|     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 | ||||
| echo "--- Python dependencies installed ---" | ||||
|  | ||||
| export VLLM_XLA_CHECK_RECOMPILATION=1 | ||||
| export VLLM_XLA_CACHE_PATH= | ||||
|  | ||||
| echo "--- Hardware Information ---" | ||||
| # tpu-info | ||||
| echo "--- Starting Tests ---" | ||||
| set +e | ||||
| overall_script_exit_code=0 | ||||
|  | ||||
| # --- Test Definitions --- | ||||
| # If a test fails, this function will print logs and will not cause the main script to exit. | ||||
| run_test() { | ||||
|     local test_num=$1 | ||||
|     local test_name=$2 | ||||
|     local test_command=$3 | ||||
|     local log_file="$RESULTS_DIR/test_${test_num}.log" | ||||
|     local actual_exit_code | ||||
|  | ||||
|     echo "--- TEST_$test_num: Running $test_name ---" | ||||
|      | ||||
|     # Execute the test command. | ||||
|     eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) | ||||
|     actual_exit_code=$? | ||||
|  | ||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log | ||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log | ||||
|  | ||||
|     if [ "$actual_exit_code" -ne 0 ]; then | ||||
|         echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 | ||||
|         echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 | ||||
|         if [ -f "$log_file" ]; then | ||||
|             cat "$log_file" >&2 | ||||
|         else | ||||
|             echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 | ||||
|         fi | ||||
|         echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 | ||||
|         return "$actual_exit_code" # Return the failure code | ||||
|     else | ||||
|         echo "TEST_$test_num ($test_name) PASSED." | ||||
|         return 0 # Return success | ||||
|     fi | ||||
| } | ||||
|  | ||||
| # Helper function to call run_test and update the overall script exit code | ||||
| run_and_track_test() { | ||||
|     local test_num_arg="$1" | ||||
|     local test_name_arg="$2" | ||||
|     local test_command_arg="$3" | ||||
|  | ||||
|     # Run the test | ||||
|     run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" | ||||
|     local test_specific_exit_code=$? | ||||
|  | ||||
|     # If the test failed, set the overall script exit code to 1 | ||||
|     if [ "$test_specific_exit_code" -ne 0 ]; then | ||||
|         # No need for extra echo here, run_test already logged the failure. | ||||
|         overall_script_exit_code=1 | ||||
|     fi | ||||
| } | ||||
|  | ||||
| # --- Actual Test Execution --- | ||||
| run_and_track_test 1 "test_struct_output_generate.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" | ||||
| run_and_track_test 2 "test_moe_pallas.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" | ||||
| run_and_track_test 3 "test_lora.py" \ | ||||
|     "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" | ||||
| run_and_track_test 4 "test_tpu_qkv_linear.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py" | ||||
| run_and_track_test 5 "test_spmd_model_weight_loading.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py" | ||||
| run_and_track_test 6 "test_kv_cache_update_kernel.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py" | ||||
| run_and_track_test 7 "test_tpu_int8.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py" | ||||
|  | ||||
| # After all tests have been attempted, exit with the overall status. | ||||
| if [ "$overall_script_exit_code" -ne 0 ]; then | ||||
|     echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" | ||||
| else | ||||
|     echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" | ||||
| fi | ||||
| exit "$overall_script_exit_code" | ||||
| ' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct. | ||||
|  | ||||
| # Capture the exit code of the docker run command | ||||
| DOCKER_RUN_EXIT_CODE=$? | ||||
|  | ||||
| # The trap will run for cleanup. | ||||
| # Exit the main script with the Docker run command's exit code. | ||||
| if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then | ||||
|     echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." | ||||
|     exit "$DOCKER_RUN_EXIT_CODE" | ||||
| else | ||||
|     echo "Docker run command completed successfully." | ||||
|     exit 0 | ||||
| fi | ||||
| # TODO: This test fails because it uses RANDOM_SEED sampling | ||||
| # pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ | ||||
| @ -1,174 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -xu | ||||
|  | ||||
|  | ||||
| remove_docker_container() { | ||||
|     # Drop the tpu-test container if present; failure to remove it is ignored. | ||||
|     docker rm -f tpu-test || : | ||||
| } | ||||
|  | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Remove the container that might not be cleaned up in the previous run. | ||||
| remove_docker_container | ||||
|  | ||||
| # Build the docker image. | ||||
| docker build -f docker/Dockerfile.tpu -t vllm-tpu . | ||||
|  | ||||
| # Set up cleanup. | ||||
| cleanup_docker() { | ||||
|   # Frees Docker disk space when the filesystem that holds Docker's root | ||||
|   # directory is more than 70% full. Exits the script with status 1 when the | ||||
|   # root directory cannot be determined. | ||||
|   # Get Docker's root directory | ||||
|   docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||
|   if [ -z "$docker_root" ]; then | ||||
|     echo "Failed to determine Docker root directory." | ||||
|     exit 1 | ||||
|   fi | ||||
|   echo "Docker root directory: $docker_root" | ||||
|   # Check disk usage of the filesystem where Docker's root directory is located: | ||||
|   # take the fifth column of the last df row and strip the percent sign | ||||
|   disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||
|   # Define the threshold (percent) | ||||
|   threshold=70 | ||||
|   if [ "$disk_usage" -gt "$threshold" ]; then | ||||
|     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||
|     # Remove dangling images (those that are not tagged and not used by any container) | ||||
|     docker image prune -f | ||||
|     # Remove unused volumes / force the system prune for old images as well. | ||||
|     # The system prune only runs when the volume prune succeeded. | ||||
|     docker volume prune -f && docker system prune --force --filter "until=72h" --all | ||||
|     echo "Docker images and volumes cleanup completed." | ||||
|   else | ||||
|     echo "Disk usage is below $threshold%. No cleanup needed." | ||||
|   fi | ||||
| } | ||||
| cleanup_docker | ||||
|  | ||||
| # For HF_TOKEN. | ||||
| source /etc/environment | ||||
|  | ||||
| docker run --privileged --net host --shm-size=16G -it \ | ||||
|     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \ | ||||
|     vllm-tpu /bin/bash -c ' | ||||
| set -e # Exit immediately if a command exits with a non-zero status. | ||||
| set -u # Treat unset variables as an error. | ||||
|  | ||||
| echo "--- Starting script inside Docker container ---" | ||||
|  | ||||
| # Create results directory | ||||
| RESULTS_DIR=$(mktemp -d) | ||||
| # If mktemp fails, set -e will cause the script to exit. | ||||
| echo "Results will be stored in: $RESULTS_DIR" | ||||
|  | ||||
| # Install dependencies | ||||
| echo "--- Installing Python dependencies ---" | ||||
| python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ | ||||
|     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ | ||||
|     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ | ||||
|     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 | ||||
| echo "--- Python dependencies installed ---" | ||||
|  | ||||
| export VLLM_XLA_CHECK_RECOMPILATION=1 | ||||
| export VLLM_XLA_CACHE_PATH= | ||||
|  | ||||
| echo "--- Hardware Information ---" | ||||
| # tpu-info | ||||
| echo "--- Starting Tests ---" | ||||
| set +e | ||||
| overall_script_exit_code=0 | ||||
|  | ||||
| # --- Test Definitions --- | ||||
| # If a test fails, this function will print logs and will not cause the main script to exit. | ||||
| run_test() { | ||||
|     local test_num=$1 | ||||
|     local test_name=$2 | ||||
|     local test_command=$3 | ||||
|     local log_file="$RESULTS_DIR/test_${test_num}.log" | ||||
|     local actual_exit_code | ||||
|  | ||||
|     echo "--- TEST_$test_num: Running $test_name ---" | ||||
|      | ||||
|     # Execute the test command. | ||||
|     eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2) | ||||
|     actual_exit_code=$? | ||||
|  | ||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log | ||||
|     echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log | ||||
|  | ||||
|     if [ "$actual_exit_code" -ne 0 ]; then | ||||
|         echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2 | ||||
|         echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2 | ||||
|         if [ -f "$log_file" ]; then | ||||
|             cat "$log_file" >&2 | ||||
|         else | ||||
|             echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2 | ||||
|         fi | ||||
|         echo "--- End of log for TEST_$test_num ($test_name) ---" >&2 | ||||
|         return "$actual_exit_code" # Return the failure code | ||||
|     else | ||||
|         echo "TEST_$test_num ($test_name) PASSED." | ||||
|         return 0 # Return success | ||||
|     fi | ||||
| } | ||||
|  | ||||
| # Helper function to call run_test and update the overall script exit code | ||||
| run_and_track_test() { | ||||
|     local test_num_arg="$1" | ||||
|     local test_name_arg="$2" | ||||
|     local test_command_arg="$3" | ||||
|  | ||||
|     # Run the test | ||||
|     run_test "$test_num_arg" "$test_name_arg" "$test_command_arg" | ||||
|     local test_specific_exit_code=$? | ||||
|  | ||||
|     # If the test failed, set the overall script exit code to 1 | ||||
|     if [ "$test_specific_exit_code" -ne 0 ]; then | ||||
|         # No need for extra echo here, run_test already logged the failure. | ||||
|         overall_script_exit_code=1 | ||||
|     fi | ||||
| } | ||||
|  | ||||
| # --- Actual Test Execution --- | ||||
| run_and_track_test 0 "test_perf.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py" | ||||
| run_and_track_test 1 "test_compilation.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py" | ||||
| run_and_track_test 2 "test_basic.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" | ||||
| run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" | ||||
| run_and_track_test 4 "test_quantization_accuracy.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" | ||||
| run_and_track_test 5 "examples/offline_inference/tpu.py" \ | ||||
|     "python3 /workspace/vllm/examples/offline_inference/tpu.py" | ||||
| run_and_track_test 6 "test_tpu_model_runner.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" | ||||
| run_and_track_test 7 "test_sampler.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" | ||||
| run_and_track_test 8 "test_topk_topp_sampler.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py" | ||||
| run_and_track_test 9 "test_multimodal.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" | ||||
| run_and_track_test 10 "test_pallas.py" \ | ||||
|     "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" | ||||
|  | ||||
| # After all tests have been attempted, exit with the overall status. | ||||
| if [ "$overall_script_exit_code" -ne 0 ]; then | ||||
|     echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" | ||||
| else | ||||
|     echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" | ||||
| fi | ||||
| exit "$overall_script_exit_code" | ||||
| ' # IMPORTANT: This is the closing single quote for the bash -c '...' command. Ensure it is present and correct. | ||||
|  | ||||
| # Capture the exit code of the docker run command | ||||
| DOCKER_RUN_EXIT_CODE=$? | ||||
|  | ||||
| # The trap will run for cleanup. | ||||
| # Exit the main script with the Docker run command's exit code. | ||||
| if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then | ||||
|     echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." | ||||
|     exit "$DOCKER_RUN_EXIT_CODE" | ||||
| else | ||||
|     echo "Docker run command completed successfully." | ||||
|     exit 0 | ||||
| fi | ||||
| # TODO: This test fails because it uses RANDOM_SEED sampling | ||||
| # pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ | ||||
| @ -1,48 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the XPU docker image and runs the offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}" | ||||
| container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" | ||||
|  | ||||
| # Try building the docker image | ||||
| docker build -t ${image_name} -f docker/Dockerfile.xpu . | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|   docker rm -f "${container_name}" || true; | ||||
|   docker image rm -f "${image_name}" || true; | ||||
|   docker system prune -f || true; | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Run the image and test offline inference/tensor parallel | ||||
| docker run \ | ||||
|     --device /dev/dri \ | ||||
|     -v /dev/dri/by-path:/dev/dri/by-path \ | ||||
|     --entrypoint="" \ | ||||
|     -e "HF_TOKEN=${HF_TOKEN}" \ | ||||
|     -e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \ | ||||
|     --name "${container_name}" \ | ||||
|     "${image_name}" \ | ||||
|     bash -c ' | ||||
|     set -e | ||||
|     echo $ZE_AFFINITY_MASK | ||||
|     pip install tblib==3.1.0 | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp | ||||
|     VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager | ||||
|     cd tests | ||||
|     pytest -v -s v1/core | ||||
|     pytest -v -s v1/engine | ||||
|     pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py | ||||
|     pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py | ||||
|     pytest -v -s v1/structured_output | ||||
|     pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py | ||||
|     pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py | ||||
|     pytest -v -s v1/test_serial_utils.py | ||||
| ' | ||||
| @ -1,18 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # Usage: ./rerun_test.sh path/to/test.py::test_name | ||||
|  | ||||
| # Check if argument is given | ||||
| if [ $# -lt 1 ]; then | ||||
|     echo "Usage: $0 path/to/test.py::test_name" | ||||
|     echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| TEST=$1 | ||||
| COUNT=1 | ||||
|  | ||||
| while pytest -sv "$TEST"; do | ||||
|     COUNT=$((COUNT + 1)) | ||||
|     echo "RUN NUMBER ${COUNT}" | ||||
| done | ||||
| @ -1,59 +0,0 @@ | ||||
| #!/bin/bash | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| # Setup script for Prime-RL integration tests | ||||
| # This script prepares the environment for running Prime-RL tests with nightly vLLM | ||||
|  | ||||
| set -euo pipefail | ||||
|  | ||||
| SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | ||||
| REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" | ||||
| PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git" | ||||
| PRIME_RL_DIR="${REPO_ROOT}/prime-rl" | ||||
|  | ||||
| echo "Setting up Prime-RL integration test environment..." | ||||
|  | ||||
| # Clean up any existing Prime-RL directory | ||||
| if [ -d "${PRIME_RL_DIR}" ]; then | ||||
|     echo "Removing existing Prime-RL directory..." | ||||
|     rm -rf "${PRIME_RL_DIR}" | ||||
| fi | ||||
|  | ||||
| # Install UV if not available | ||||
| if ! command -v uv &> /dev/null; then | ||||
|     echo "Installing UV package manager..." | ||||
|     curl -LsSf https://astral.sh/uv/install.sh | sh | ||||
|     source $HOME/.local/bin/env | ||||
| fi | ||||
|  | ||||
| # Clone Prime-RL repository at specific branch for reproducible tests | ||||
| PRIME_RL_BRANCH="integ-vllm-main" | ||||
| echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..." | ||||
| git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}" | ||||
| cd "${PRIME_RL_DIR}" | ||||
|  | ||||
| echo "Setting up UV project environment..." | ||||
| export UV_PROJECT_ENVIRONMENT=/usr/local | ||||
| ln -s /usr/bin/python3 /usr/local/bin/python | ||||
|  | ||||
| # Remove vllm pin from pyproject.toml | ||||
| echo "Removing vllm pin from pyproject.toml..." | ||||
| sed -i '/vllm==/d' pyproject.toml | ||||
|  | ||||
| # Sync Prime-RL dependencies | ||||
| echo "Installing Prime-RL dependencies..." | ||||
| uv sync --inexact && uv sync --inexact --all-extras | ||||
|  | ||||
| # Verify installation | ||||
| echo "Verifying installations..." | ||||
| uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')" | ||||
| uv run python -c "import prime_rl; print('Prime-RL imported successfully')" | ||||
|  | ||||
| echo "Prime-RL integration test environment setup complete!" | ||||
|  | ||||
| echo "Running Prime-RL integration tests..." | ||||
| export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY | ||||
| uv run pytest -vs tests/integration/test_rl.py -m gpu | ||||
|  | ||||
| echo "Prime-RL integration tests completed!" | ||||
| @ -1,24 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -euo pipefail | ||||
|  | ||||
| docker_root=$(docker info -f '{{.DockerRootDir}}') | ||||
| if [ -z "$docker_root" ]; then | ||||
|   echo "Failed to determine Docker root directory." | ||||
|   exit 1 | ||||
| fi | ||||
| echo "Docker root directory: $docker_root" | ||||
| # Check disk usage of the filesystem where Docker's root directory is located | ||||
| disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//') | ||||
| # Define the threshold | ||||
| threshold=70 | ||||
| if [ "$disk_usage" -gt "$threshold" ]; then | ||||
|   echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." | ||||
|   # Remove dangling images (those that are not tagged and not used by any container) | ||||
|   docker image prune -f | ||||
|   # Remove unused volumes / force the system prune for old images as well. | ||||
|   docker volume prune -f && docker system prune --force --filter "until=24h" --all | ||||
|   echo "Docker images and volumes cleanup completed." | ||||
| else | ||||
|   echo "Disk usage is below $threshold%. No cleanup needed." | ||||
| fi | ||||
| @ -1,14 +0,0 @@ | ||||
| # Environment config | ||||
| TEST_NAME=llama8b | ||||
| CONTAINER_NAME=tpu-test | ||||
|  | ||||
| # vllm config | ||||
| MODEL=meta-llama/Llama-3.1-8B-Instruct | ||||
| MAX_NUM_SEQS=256 | ||||
| MAX_NUM_BATCHED_TOKENS=1024 | ||||
| TENSOR_PARALLEL_SIZE=1 | ||||
| MAX_MODEL_LEN=2048 | ||||
| DOWNLOAD_DIR=/mnt/disks/persist | ||||
| EXPECTED_THROUGHPUT=8.0 | ||||
| INPUT_LEN=1800 | ||||
| OUTPUT_LEN=128 | ||||
| @ -1,90 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| if [ ! -f "$1" ]; then | ||||
|   echo "Error: The env file '$1' does not exist." | ||||
|   exit 1  # Exit the script with a non-zero status to indicate an error | ||||
| fi | ||||
|  | ||||
| ENV_FILE=$1 | ||||
|  | ||||
| # For testing on local vm, use `set -a` to export all variables | ||||
| source /etc/environment | ||||
| source $ENV_FILE | ||||
|  | ||||
| remove_docker_container() {  | ||||
|     docker rm -f $CONTAINER_NAME || true; | ||||
| } | ||||
|  | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Remove the container that might not be cleaned up in the previous run. | ||||
| remove_docker_container | ||||
|  | ||||
| LOG_ROOT=$(mktemp -d) | ||||
| # NOTE(review): this script does not enable "set -e" itself, so a mktemp failure would not abort it here - confirm. | ||||
| echo "Results will be stored in: $LOG_ROOT" | ||||
|  | ||||
| if [ -z "$HF_TOKEN" ]; then | ||||
|   echo "Error: HF_TOKEN is not set or is empty."   | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| # Make sure mounted disk or dir exists | ||||
| if [ ! -d "$DOWNLOAD_DIR" ]; then | ||||
|     echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder." | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| echo "Run model $MODEL" | ||||
| echo | ||||
|  | ||||
| echo "starting docker...$CONTAINER_NAME" | ||||
| echo     | ||||
| docker run \ | ||||
|  -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \ | ||||
|  --env-file $ENV_FILE \ | ||||
|  -e HF_TOKEN="$HF_TOKEN" \ | ||||
|  -e TARGET_COMMIT=$BUILDKITE_COMMIT \ | ||||
|  -e MODEL=$MODEL \ | ||||
|  -e WORKSPACE=/workspace \ | ||||
|  --name $CONTAINER_NAME \ | ||||
|  -d \ | ||||
|  --privileged \ | ||||
|  --network host \ | ||||
|  -v /dev/shm:/dev/shm \ | ||||
|  vllm/vllm-tpu-bm tail -f /dev/null | ||||
|  | ||||
| echo "run script..." | ||||
| echo | ||||
| docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh" | ||||
|  | ||||
| echo "copy result back..." | ||||
| VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt | ||||
| BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt | ||||
| docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"  | ||||
| docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG" | ||||
|  | ||||
| throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') | ||||
| echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput" | ||||
|  | ||||
| if [ "$BUILDKITE" = "true" ]; then | ||||
|   echo "Running inside Buildkite" | ||||
|   buildkite-agent artifact upload "$VLLM_LOG"  | ||||
|   buildkite-agent artifact upload "$BM_LOG" | ||||
| else | ||||
|   echo "Not running inside Buildkite" | ||||
| fi | ||||
|  | ||||
| # | ||||
| # compare the throughput with EXPECTED_THROUGHPUT  | ||||
| # and assert meeting the expectation | ||||
| #  | ||||
| if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then | ||||
|   echo "Failed to get the throughput" | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then | ||||
|   echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)" | ||||
|   exit 1 | ||||
| fi | ||||
| @ -1,14 +0,0 @@ | ||||
| # Environment config | ||||
| TEST_NAME=llama8bw8a8 | ||||
| CONTAINER_NAME=tpu-test | ||||
|  | ||||
| # vllm config | ||||
| MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | ||||
| MAX_NUM_SEQS=128 | ||||
| MAX_NUM_BATCHED_TOKENS=1024 | ||||
| TENSOR_PARALLEL_SIZE=1 | ||||
| MAX_MODEL_LEN=2048 | ||||
| DOWNLOAD_DIR=/mnt/disks/persist | ||||
| EXPECTED_THROUGHPUT=8.7 | ||||
| INPUT_LEN=1800 | ||||
| OUTPUT_LEN=128 | ||||
| @ -1,93 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -euo pipefail | ||||
|  | ||||
| VLLM_LOG="$WORKSPACE/vllm_log.txt" | ||||
| BM_LOG="$WORKSPACE/bm_log.txt" | ||||
|  | ||||
| if [ -n "$TARGET_COMMIT" ]; then | ||||
|   head_hash=$(git rev-parse HEAD) | ||||
|   if [ "$TARGET_COMMIT" != "$head_hash" ]; then | ||||
|     echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash" | ||||
|     exit 1 | ||||
|   fi | ||||
| fi | ||||
|  | ||||
| echo "model: $MODEL" | ||||
| echo | ||||
|  | ||||
| # | ||||
| # create a log folder | ||||
| # | ||||
| mkdir "$WORKSPACE/log" | ||||
|  | ||||
| # TODO: Move to image building. | ||||
| pip install pandas | ||||
| pip install datasets | ||||
|  | ||||
| # | ||||
| # create sonnet_4x | ||||
| # | ||||
| echo "Create sonnet_4x.txt" | ||||
| echo "" > benchmarks/sonnet_4x.txt | ||||
| for _ in {1..4} | ||||
|  do | ||||
|   cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt | ||||
| done | ||||
|  | ||||
| # | ||||
| # start vllm service in backend | ||||
| # | ||||
| echo "lanching vllm..." | ||||
| echo "logging to $VLLM_LOG" | ||||
| echo | ||||
|  | ||||
| vllm serve $MODEL \ | ||||
|  --seed 42 \ | ||||
|  --max-num-seqs $MAX_NUM_SEQS \ | ||||
|  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ | ||||
|  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \ | ||||
|  --no-enable-prefix-caching \ | ||||
|  --download_dir $DOWNLOAD_DIR \ | ||||
|  --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 & | ||||
|  | ||||
|  | ||||
| echo "wait for 20 minutes.." | ||||
| echo | ||||
| # sleep 1200 | ||||
| # wait for up to 20 minutes (120 checks, 10 seconds apart)... | ||||
| for i in {1..120}; do | ||||
|     # TODO: detect other type of errors. | ||||
|     if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then | ||||
|         echo "Detected RuntimeError, exiting." | ||||
|         exit 1 | ||||
|     elif grep -Fq "Application startup complete" "$VLLM_LOG"; then | ||||
|         echo "Application started" | ||||
|         break | ||||
|     else | ||||
|         echo "wait for 10 seconds..." | ||||
|         sleep 10 | ||||
|     fi | ||||
| done | ||||
|  | ||||
| # | ||||
| # run test | ||||
| # | ||||
| echo "run benchmark test..." | ||||
| echo "logging to $BM_LOG" | ||||
| echo | ||||
| vllm bench serve \ | ||||
|     --backend vllm \ | ||||
|     --model $MODEL  \ | ||||
|     --dataset-name sonnet \ | ||||
|     --dataset-path benchmarks/sonnet_4x.txt \ | ||||
|     --sonnet-input-len $INPUT_LEN \ | ||||
|     --sonnet-output-len $OUTPUT_LEN \ | ||||
|     --ignore-eos > "$BM_LOG" | ||||
|  | ||||
| echo "completed..." | ||||
| echo | ||||
|  | ||||
| throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g') | ||||
| echo "throughput: $throughput" | ||||
| echo | ||||
| @ -1,83 +0,0 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| # Assume wheels are in artifacts/dist/*.whl | ||||
| wheel_files=(artifacts/dist/*.whl) | ||||
|  | ||||
| # Check that exactly one wheel is found | ||||
| if [[ ${#wheel_files[@]} -ne 1 ]]; then | ||||
|   echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| # Get the single wheel file | ||||
| wheel="${wheel_files[0]}" | ||||
|  | ||||
| # Detect architecture and rename 'linux' to appropriate manylinux version | ||||
| arch=$(uname -m) | ||||
| if [[ $arch == "x86_64" ]]; then | ||||
|     manylinux_version="manylinux1" | ||||
| elif [[ $arch == "aarch64" ]]; then | ||||
|     manylinux_version="manylinux2014" | ||||
| else | ||||
|     echo "Warning: Unknown architecture $arch, using manylinux1 as default" | ||||
|     manylinux_version="manylinux1" | ||||
| fi | ||||
|  | ||||
| # Rename 'linux' to the appropriate manylinux version in the wheel filename | ||||
| new_wheel="${wheel/linux/$manylinux_version}" | ||||
| mv -- "$wheel" "$new_wheel" | ||||
| wheel="$new_wheel" | ||||
|  | ||||
| # Extract the version from the wheel | ||||
| version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) | ||||
| echo "Version: $version" | ||||
|  | ||||
| normal_wheel="$wheel" # Save the original wheel filename | ||||
|  | ||||
| # If the version contains "dev", rename it to 1.0.0.dev (keeping a "+cu*" suffix when present) for consistency | ||||
| if [[ $version == *dev* ]]; then | ||||
|     suffix="${version##*.}" | ||||
|     if [[ $suffix == cu* ]]; then | ||||
|         new_version="1.0.0.dev+${suffix}" | ||||
|     else | ||||
|         new_version="1.0.0.dev" | ||||
|     fi | ||||
|     new_wheel="${wheel/$version/$new_version}" | ||||
|     # use cp to keep both files in the artifacts directory | ||||
|     cp -- "$wheel" "$new_wheel" | ||||
|     wheel="$new_wheel" | ||||
|     version="$new_version" | ||||
| fi | ||||
|  | ||||
| # Generate the index for the wheel (presumably the index.html uploaded below) | ||||
| python3 .buildkite/generate_index.py --wheel "$normal_wheel" | ||||
|  | ||||
| # Upload the wheels for this commit to S3 | ||||
| aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" | ||||
| aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" | ||||
|  | ||||
| if [[ $normal_wheel == *"cu129"* ]]; then | ||||
|     # only upload index.html for cu129 wheels (default wheels) as it | ||||
|     # is available on both x86 and arm64 | ||||
|     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html" | ||||
|     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html" | ||||
| else | ||||
|     echo "Skipping index files for non-cu129 wheels" | ||||
| fi | ||||
|  | ||||
| # Upload the wheels for nightly to S3 | ||||
| aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" | ||||
| aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/" | ||||
|  | ||||
| if [[ $normal_wheel == *"cu129"* ]]; then | ||||
|     # only upload index.html for cu129 wheels (default wheels) as it | ||||
|     # is available on both x86 and arm64 | ||||
|     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html" | ||||
| else | ||||
|     echo "Skipping index files for non-cu129 wheels" | ||||
| fi | ||||
|  | ||||
| aws s3 cp "$wheel" "s3://vllm-wheels/$version/" | ||||
| aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user
	