mirror of https://github.com/vllm-project/vllm.git, synced 2025-11-04 09:24:33 +08:00

Compare commits: wentao-bat ... woosuk/rm- (1 commit, SHA 6f47333c4e)
@@ -1,14 +0,0 @@
model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
tasks:
  - name: "mmlu_pro"
    metrics:
      - name: "exact_match,custom-extract"
        value: 0.82
limit: 250 # will run on 250 * 14 subjects = 3500 samples
num_fewshot: 5
enforce_eager: false # we use false to speed up the eval process
kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
max_model_len: 40960
apply_chat_template: true
fewshot_as_multiturn: true
gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"

.buildkite/lm-eval-harness/configs/models-large-h100.txt (new file, 1 line)

@@ -0,0 +1 @@
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml

@@ -1 +0,0 @@
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -21,13 +21,10 @@ def launch_lm_eval(eval_config, tp_size):
    max_model_len = eval_config.get("max_model_len", 4096)
    batch_size = eval_config.get("batch_size", "auto")
    backend = eval_config.get("backend", "vllm")
    enforce_eager = eval_config.get("enforce_eager", "true")
    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager={enforce_eager},"
        f"kv_cache_dtype={kv_cache_dtype},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len},"

@@ -40,13 +37,8 @@ def launch_lm_eval(eval_config, tp_size):
        limit=eval_config["limit"],
        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
        # text models. however, this is regressing measured strict-match for
        # existing text models in CI, so only apply it for mm, or explicitly set
        apply_chat_template=eval_config.get(
            "apply_chat_template", backend == "vllm-vlm"
        ),
        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
        gen_kwargs=eval_config.get("gen_kwargs"),
        # existing text models in CI, so only apply it for mm.
        apply_chat_template=backend == "vllm-vlm",
        batch_size=batch_size,
    )
    return results
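For context on how a config file like the deleted Qwen3 YAML above is consumed, here is a minimal sketch of the mapping from YAML keys to lm-eval-harness arguments. This is illustrative only, not the CI script itself: it assumes the `lm_eval` and `pyyaml` packages, and the helper name `run_from_config` is hypothetical.

```python
# Sketch only: load an eval config YAML (like the Qwen3 one above) and turn it
# into a vLLM-backed lm-eval-harness run, mirroring what launch_lm_eval does.
import yaml
import lm_eval


def run_from_config(path: str, tp_size: int = 8):
    with open(path) as f:
        cfg = yaml.safe_load(f)

    # YAML fields become comma-separated model_args for the "vllm" model type.
    model_args = (
        f"pretrained={cfg['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager={cfg.get('enforce_eager', True)},"
        f"kv_cache_dtype={cfg.get('kv_cache_dtype', 'auto')},"
        f"max_model_len={cfg.get('max_model_len', 4096)}"
    )
    return lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[t["name"] for t in cfg["tasks"]],
        num_fewshot=cfg.get("num_fewshot"),
        limit=cfg.get("limit"),
        apply_chat_template=cfg.get("apply_chat_template", False),
        fewshot_as_multiturn=cfg.get("fewshot_as_multiturn", False),
        gen_kwargs=cfg.get("gen_kwargs"),
    )
```

The expected metric values in the YAML (e.g. `exact_match,custom-extract` = 0.82) are then compared against the returned results.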
@@ -2,23 +2,40 @@

## Introduction

This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under the PyTorch CI HUD.
This directory contains two sets of benchmarks for vllm.

- Performance benchmark: benchmark vllm's performance under various workloads, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance.
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.

See the [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and the [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for the latest nightly benchmark results.

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fixed-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators, with different models.
**Benchmarking Coverage**: latency, throughput and fixed-qps serving on A100 (support for the FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.

**Benchmarking Duration**: about 1hr.

**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.

## Nightly benchmark quick overview

**Benchmarking Coverage**: fixed-qps serving on A100 (support for the FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.

**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.

**Benchmarking Duration**: about 3.5hrs.

## Trigger the benchmark

The benchmark needs to be triggered manually:
Performance benchmark will be triggered when:

- A PR is merged into vllm.
- Every commit for those PRs with the `perf-benchmarks` label AND the `ready` label.

Manually trigger the benchmark:

```bash
bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```

Runtime environment variables:

@@ -30,11 +47,14 @@ Runtime environment variables:
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is an empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is an empty string.
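As an illustration of these runtime environment variables, a hypothetical local invocation against an already-running remote vLLM server might look like the sketch below. The host, port and script path are example values only; use whichever of the two script paths shown above exists in your checkout.

```bash
# Example only: point the benchmark at a remote vLLM service instead of
# launching one locally, then run the standard performance benchmark script.
REMOTE_HOST=10.0.0.5 REMOTE_PORT=8000 \
  bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```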

Nightly benchmark will be triggered when:

- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
>
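For illustration, a single entry in one of these test-case files typically looks like the sketch below. The field names and values are an assumption based on the description above, not copied from the repository; check the actual JSON files in `tests/` for the authoritative schema.

```json
[
  {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
      "tensor_parallel_size": 1,
      "load_format": "dummy",
      "num_iters_warmup": 5,
      "num_iters": 15
    }
  }
]
```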
### Latency test

@@ -132,3 +152,26 @@ Here is an example using the script to compare result_a and result_b with Model,
A comparison diagram will be generated below the table.
Here is an example comparing 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3:
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />

## Nightly test details

See [nightly-descriptions.md](nightly-descriptions.md) for a detailed description of the test workload, models and docker containers used to benchmark the other LLM engines.

### Workflow

- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for the different LLM serving engines.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- Finally, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and upload the results to Buildkite.

### Nightly tests

In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for the benchmarking commands, together with the benchmarking test cases. The format is very similar to that of the performance benchmark.

### Docker containers

The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.

WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.

WARNING: updating `trt-llm` to the latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml (new file, 184 lines)

@@ -0,0 +1,184 @@
steps:
  - label: "Wait for container to be ready"
    key: wait-for-container-image
    agents:
      queue: A100
    plugins:
    - kubernetes:
        podSpec:
          containers:
          - image: badouralix/curl-jq
            command:
            - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh

  - label: "Cleanup H100"
    agents:
      queue: H100
    depends_on: ~
    command: docker system prune -a --volumes --force

  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
    - kubernetes:
        podSpec:
          priorityClassName: perf-benchmark
          containers:
          - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
            command:
            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
            resources:
              limits:
                nvidia.com/gpu: 8
            volumeMounts:
            - name: devshm
              mountPath: /dev/shm
            env:
            - name: VLLM_USAGE_SOURCE
              value: ci-test
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          nodeSelector:
            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
          volumes:
          - name: devshm
            emptyDir:
              medium: Memory

  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: 4,5,6,7
        volumes:
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN

  #- block: "Run H100 Benchmark"
    #key: block-h100
    #depends_on: ~

  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch == "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
        volumes:
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN

  # Premerge benchmark
  - label: "A100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: A100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
    - kubernetes:
        podSpec:
          priorityClassName: perf-benchmark
          containers:
          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
            command:
            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
            resources:
              limits:
                nvidia.com/gpu: 8
            volumeMounts:
            - name: devshm
              mountPath: /dev/shm
            env:
            - name: VLLM_USAGE_SOURCE
              value: ci-test
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
          nodeSelector:
            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
          volumes:
          - name: devshm
            emptyDir:
              medium: Memory

  - label: "H200"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H200
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: 4,5,6,7
        volumes:
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN

  #- block: "Run H100 Benchmark"
    #key: block-h100
    #depends_on: ~

  - label: "H100"
    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
    agents:
      queue: H100
    depends_on: wait-for-container-image
    if: build.branch != "main"
    plugins:
    - docker#v5.12.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        command:
        - bash
        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
        mount-buildkite-agent: true
        propagate-environment: true
        ipc: host
        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
        volumes:
          - /data/benchmark-hf-cache:/root/.cache/huggingface
        environment:
        - VLLM_USAGE_SOURCE
        - HF_TOKEN

.buildkite/nightly-benchmarks/nightly-annotation.md (new file, 28 lines)

@@ -0,0 +1,28 @@
# Nightly benchmark annotation

## Description

This file contains the download links for the benchmarking results.

- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
- [benchmarking results](artifact://results.zip)
- [benchmarking code](artifact://nightly-benchmarks.zip)

Please download the visualization scripts in the post.

## Results reproduction

- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
    - Download `nightly-benchmarks.zip`.
    - In the same folder, run the following code:

    ```bash
    export HF_TOKEN=<your HF token>
    apt update
    apt install -y git
    unzip nightly-benchmarks.zip
    VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
    ```

And the results will be inside `./benchmarks/results`.

.buildkite/nightly-benchmarks/nightly-descriptions.md (new file, 39 lines)

@@ -0,0 +1,39 @@
# Nightly benchmark

This benchmark aims to:

- Provide performance clarity: provide clarity on which engine (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance for which workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following the reproduction instructions.

Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.

Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

## Setup

- Docker images:
    - vLLM: `vllm/vllm-openai:v0.6.2`
    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
    - 8x Nvidia A100 GPUs
- Workload:
    - Dataset
        - ShareGPT dataset
        - Prefill-heavy dataset (on average 462 input tokens, 16 output tokens)
        - Decode-heavy dataset (on average 462 input tokens, 256 output tokens)
        - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of the datasets we use.
    - Models: llama-3 8B, llama-3 70B.
        - We do not use llama 3.1 as it is incompatible with trt-llm r24.07 ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
    - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
        - Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed (a short arrival-time sketch follows this list).
    - Evaluation metrics: throughput (higher is better), TTFT (time to the first token, lower is better), ITL (inter-token latency, lower is better).
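A minimal sketch of the Poisson arrival-time generation described above, illustrative only and assuming NumPy; the benchmark's own implementation may differ:

```python
# Sketch: Poisson arrivals at a given average QPS with a fixed seed.
import numpy as np


def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> np.ndarray:
    rng = np.random.default_rng(seed)
    # Inter-arrival gaps of a Poisson process are exponential with mean 1/qps;
    # the cumulative sum gives each request's absolute arrival time.
    # (qps = inf corresponds to all requests arriving at once.)
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)


arrivals = poisson_arrival_times(num_requests=200, qps=4, seed=0)
```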
## Known issues

- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.

.buildkite/nightly-benchmarks/nightly-pipeline.yaml (new file, 196 lines)

@@ -0,0 +1,196 @@
common_pod_spec: &common_pod_spec
  priorityClassName: perf-benchmark
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
  volumes:
    - name: devshm
      emptyDir:
        medium: Memory
    - name: hf-cache
      hostPath:
        path: /root/.cache/huggingface
        type: Directory

common_container_settings: &common_container_settings
  command:
    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
  resources:
    limits:
      nvidia.com/gpu: 8
  volumeMounts:
    - name: devshm
      mountPath: /dev/shm
    - name: hf-cache
      mountPath: /root/.cache/huggingface
  env:
    - name: VLLM_USAGE_SOURCE
      value: ci-test
    - name: HF_HOME
      value: /root/.cache/huggingface
    - name: VLLM_SOURCE_CODE_LOC
      value: /workspace/build/buildkite/vllm/performance-benchmark
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: hf-token-secret
          key: token

steps:
  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."

  - label: "A100 vllm step 10"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: vllm/vllm-openai:v0.6.2
                <<: *common_container_settings

  - label: "A100 sglang benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: lmsysorg/sglang:v0.3.2-cu121
                <<: *common_container_settings

  - label: "A100 lmdeploy benchmark"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: openmmlab/lmdeploy:v0.6.1-cu12
                <<: *common_container_settings

  - label: "A100 trt llama-8B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama8B"

  - label: "A100 trt llama-70B"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
                <<: *common_container_settings
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_HOME
                    value: /root/.cache/huggingface
                  - name: VLLM_SOURCE_CODE_LOC
                    value: /workspace/build/buildkite/vllm/performance-benchmark
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                  - name: TEST_SELECTOR
                    value: "llama70B"

  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
  # - label: "A100 trt benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
  #               <<: *common_container_settings

  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
  # - label: "A100 tgi benchmark"
  #   priority: 100
  #   agents:
  #     queue: A100
  #   plugins:
  #     - kubernetes:
  #         podSpec:
  #           <<: *common_pod_spec
  #           containers:
  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
  #               <<: *common_container_settings

  - wait

  - label: "Collect the results"
    priority: 100
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            <<: *common_pod_spec
            containers:
            - image: vllm/vllm-openai:v0.5.0.post1
              command:
              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
              resources:
                limits:
                  nvidia.com/gpu: 8
              volumeMounts:
              - name: devshm
                mountPath: /dev/shm
              env:
              - name: VLLM_USAGE_SOURCE
                value: ci-test
              - name: VLLM_SOURCE_CODE_LOC
                value: /workspace/build/buildkite/vllm/performance-benchmark
              - name: HF_TOKEN
                valueFrom:
                  secretKeyRef:
                    name: hf-token-secret
                    key: token

  - block: ":rocket: check the results!"
@@ -5,7 +5,7 @@
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
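To make the latency setup concrete, a command along these lines reproduces one such configuration locally. This is a sketch under assumptions: it assumes the standalone `benchmarks/benchmark_latency.py` helper and its `--input-len`/`--output-len`/`--batch-size` flags, and the model name is only an example.

```bash
# Sketch only: fixed batch of 8, 32 input tokens, 128 output tokens,
# matching the latency-test description above.
python3 benchmarks/benchmark_latency.py \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --input-len 32 \
  --output-len 128 \
  --batch-size 8 \
  --num-iters 15
```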
@@ -16,7 +16,7 @@
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput.

@@ -28,7 +28,7 @@
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
@@ -7,7 +7,6 @@ from importlib import util

import pandas as pd

pd.options.display.float_format = "{:.2f}".format
plotly_found = util.find_spec("plotly.express") is not None

@@ -110,10 +109,7 @@ def compare_data_columns(
        if len(compare_frames) >= 2:
            base = compare_frames[0]
            current = compare_frames[-1]
            if "P99" in data_column or "Median" in data_column:
                ratio = base / current  # for latency
            else:
                ratio = current / base
            ratio = current / base
            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)

@@ -203,71 +199,6 @@ def split_json_by_tp_pp(
    return saved_paths


def _add_limit_line(fig, y_value, label):
    # Visible dashed line + annotation
    fig.add_hline(
        y=y_value,
        line_dash="dash",
        line_color="red" if "ttft" in label.lower() else "blue",
        annotation_text=f"{label}: {y_value} ms",
        annotation_position="top left",
    )
    # Optional: add a legend item (as a transparent helper trace)
    if plot and plotly_found:
        import plotly.graph_objects as go

        fig.add_trace(
            go.Scatter(
                x=[None],
                y=[None],
                mode="lines",
                line=dict(
                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
                ),
                name=f"{label}",
            )
        )


def _find_concurrency_col(df: pd.DataFrame) -> str:
    for c in [
        "# of max concurrency.",
        "# of max concurrency",
        "Max Concurrency",
        "max_concurrency",
        "Concurrency",
    ]:
        if c in df.columns:
            return c
    # Fallback: guess an integer-like column (harmless if unused)
    for c in df.columns:
        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
            return c
    return "# of max concurrency."


def _highlight_threshold(
    df: pd.DataFrame, threshold: float
) -> "pd.io.formats.style.Styler":
    """Highlight numeric per-configuration columns with value <= threshold."""
    conc_col = _find_concurrency_col(df)
    key_cols = [
        c
        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
        if c in df.columns
    ]
    conf_cols = [
        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
    ]
    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
    return df.style.map(
        lambda v: "background-color:#e6ffe6;font-weight:bold;"
        if pd.notna(v) and v <= threshold
        else "",
        subset=conf_cols,
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(

@@ -289,26 +220,6 @@ if __name__ == "__main__":
        default="# of max concurrency.",
        help="column name to use as X Axis in comparison graph",
    )
    parser.add_argument(
        "-l",
        "--latency",
        type=str,
        default="p99",
        help="take median|p99 for latency like TTFT/TPOT",
    )
    parser.add_argument(
        "--ttft-max-ms",
        type=float,
        default=3000.0,
        help="Reference limit for TTFT plots (ms)",
    )
    parser.add_argument(
        "--tpot-max-ms",
        type=float,
        default=100.0,
        help="Reference limit for TPOT plots (ms)",
    )

    args = parser.parse_args()

    drop_column = "P99"

@@ -323,22 +234,12 @@ if __name__ == "__main__":
        "# of max concurrency.",
        "qps",
    ]

    if "median" in args.latency:
        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
        html_msgs_for_data_cols = [
            "Compare Output Tokens /n",
            "Median TTFT /n",
            "Median TPOT /n",
        ]
        drop_column = "P99"
    elif "p99" in args.latency:
        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
        html_msgs_for_data_cols = [
            "Compare Output Tokens /n",
            "P99 TTFT /n",
            "P99 TPOT /n",
        ]
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]

    if len(args.file) == 1:
        files = split_json_by_tp_pp(args.file[0], output_root="splits")

@@ -374,83 +275,33 @@ if __name__ == "__main__":
                    f"Expected subset: {filtered_info_cols}, "
                    f"but DataFrame has: {list(output_df.columns)}"
                )
            # output_df_sorted = output_df.sort_values(by=existing_group_cols)
            output_df_sorted = output_df.sort_values(by=args.xaxis)
            output_df_sorted = output_df.sort_values(by=existing_group_cols)
            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
            for name, group in output_groups:
                group_name = (
                    ",".join(map(str, name)).replace(",", "_").replace("/", "-")
                )
                group_html_name = "perf_comparison_" + group_name + ".html"

                metric_name = str(data_cols_to_compare[i]).lower()
                if "tok/s" in metric_name:
                    html = group.to_html()
                elif "ttft" in metric_name:
                    styler = _highlight_threshold(group, args.ttft_max_ms).format(
                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
                        na_rep="—",
                    )
                    html = styler.to_html(
                        table_attributes='border="1" class="dataframe"'
                    )
                elif (
                    "tpot" in metric_name
                    or "median" in metric_name
                    or "p99" in metric_name
                ):
                    styler = _highlight_threshold(group, args.tpot_max_ms).format(
                        {c: "{:.2f}" for c in group.select_dtypes("number").columns},
                        na_rep="—",
                    )
                    html = styler.to_html(
                        table_attributes='border="1" class="dataframe"'
                    )

                html = group.to_html()
                text_file.write(html_msgs_for_data_cols[i])
                text_file.write(html)
                with open(group_html_name, "a+") as sub_text_file:
                    sub_text_file.write(html_msgs_for_data_cols[i])
                    sub_text_file.write(html)

                    if plot and plotly_found:
                        import plotly.express as px
                if plot and plotly_found:
                    import plotly.express as px

                        df = group[raw_data_cols]
                        df_sorted = df.sort_values(by=info_cols[y_axis_index])
                        # Melt DataFrame for plotting
                        df_melted = df_sorted.melt(
                            id_vars=info_cols[y_axis_index],
                            var_name="Configuration",
                            value_name=data_cols_to_compare[i],
                        )
                        title = (
                            data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
                        )
                        # Create Plotly line chart
                        fig = px.line(
                            df_melted,
                            x=info_cols[y_axis_index],
                            y=data_cols_to_compare[i],
                            color="Configuration",
                            title=title,
                            markers=True,
                        )

                        # ---- Add threshold lines based on metric name ----
                        if "ttft" in metric_name:
                            _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
                        elif (
                            "tpot" in metric_name
                            or "median" in metric_name
                            or "p99" in metric_name
                        ):
                            _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")

                        # Export to HTML
                        text_file.write(
                            fig.to_html(full_html=True, include_plotlyjs="cdn")
                        )
                        sub_text_file.write(
                            fig.to_html(full_html=True, include_plotlyjs="cdn")
                        )
                    df = group[raw_data_cols]
                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
                    # Melt DataFrame for plotting
                    df_melted = df_sorted.melt(
                        id_vars=info_cols[y_axis_index],
                        var_name="Configuration",
                        value_name=data_cols_to_compare[i],
                    )
                    title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
                    # Create Plotly line chart
                    fig = px.line(
                        df_melted,
                        x=info_cols[y_axis_index],
                        y=data_cols_to_compare[i],
                        color="Configuration",
                        title=title,
                        markers=True,
                    )
                    # Export to HTML
                    text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
@@ -63,11 +63,9 @@ serving_column_mapping = {
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    "std_ttft_ms": "STD TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    "std_tpot_ms": "STD TPOT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",

@@ -370,7 +368,7 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
            lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
        )

    # get markdown tables

@@ -392,7 +390,7 @@ if __name__ == "__main__":
    json_file = "benchmark_results.json"
    with open(results_folder / md_file, "w") as f:
        results = read_markdown(
            "../.buildkite/performance-benchmarks/"
            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(

.buildkite/nightly-benchmarks/scripts/download-tokenizer.py (new file, 26 lines)

@@ -0,0 +1,26 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

from transformers import AutoTokenizer


def main(model, cachedir):
    # Load the tokenizer and save it to the specified directory
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.save_pretrained(cachedir)
    print(f"Tokenizer saved to {cachedir}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )

    args = parser.parse_args()
    main(args.model, args.cachedir)
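A hypothetical invocation of the helper above, where the model name and cache directory are example values only:

```bash
# Example only: pre-download a tokenizer into a local cache directory.
python3 .buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
  --model meta-llama/Meta-Llama-3-8B-Instruct \
  --cachedir /tokenizer_cache
```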
@@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
from pathlib import Path

import numpy as np
import pandas as pd
from tabulate import tabulate


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Parse command line arguments for summary-nightly-results script."
    )
    parser.add_argument(
        "--results-folder",
        type=str,
        required=True,
        help="The folder where the results are stored.",
    )
    parser.add_argument(
        "--description", type=str, required=True, help="Description of the results."
    )

    args = parser.parse_args()
    return args


def get_perf(df, method, model, metric):
    means = []

    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered_df = df[target]

        if filtered_df.empty:
            means.append(0.0)
        else:
            means.append(filtered_df[metric].values[0])

    return np.array(means)


def get_perf_w_std(df, method, model, metric):
    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()
        std = get_perf(df, method, model, "Std " + metric + " (ms)")
        if std.mean() == 0:
            std = None
        success = get_perf(df, method, model, "Successful req.")
        if std is not None:
            std = std / np.sqrt(success)
            std = std.tolist()

    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)"
        )
        mean = mean.tolist()
        std = None

    return mean, std


def main(args):
    results_folder = Path(args.results_folder)

    results = []

    # collect results
    for test_file in results_folder.glob("*_nightly_results.json"):
        with open(test_file) as f:
            results = results + json.loads(f.read())

    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)

    with open(args.description) as f:
        description = f.read()

    description = description.format(nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)


if __name__ == "__main__":
    args = parse_arguments()
    main(args)
@@ -0,0 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
model_name = api_client.available_models[0]

print(model_name)

.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh (new file, 78 lines)

@@ -0,0 +1,78 @@
#!/bin/bash

set -ex
set -o pipefail


main() {

    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
    (which jq) || (apt-get update && apt-get -y install jq)
    (which zip) || (apt-get install -y zip)

    if [ ! -f /workspace/buildkite-agent ]; then
        echo "buildkite-agent binary not found. Skip plotting the results."
        exit 0
    fi

    # initial annotation
    #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"

    # download results
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    mkdir -p results/
    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
    ls
    ls results/

    # upload benchmark results
    zip -r results.zip results/
    /workspace/buildkite-agent artifact upload "results.zip"

    # upload benchmarking scripts
    cd "$VLLM_SOURCE_CODE_LOC/"
    zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
    /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"

    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    # upload benchmarking pipeline
    /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"

    cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md

    # The figures should be generated by a separate process outside the CI/CD pipeline

    # # generate figures
    # python3 -m pip install tabulate pandas matplotlib

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
    #     --description $description \
    #     --results-folder results/

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sharegpt

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sonnet_2048_128

    # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
    #     --description $description \
    #     --results-folder results/ \
    #     --dataset sonnet_128_2048

    # # upload results and figures
    # /workspace/buildkite-agent artifact upload "nightly_results*.png"
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
    # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
 | 
			
		||||
    # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
main "$@"
.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh (new file, 464 lines)
@ -0,0 +1,464 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
set -o pipefail
 | 
			
		||||
set -x
 | 
			
		||||
 | 
			
		||||
check_gpus() {
 | 
			
		||||
  # check the number of GPUs and GPU type.
 | 
			
		||||
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
 | 
			
		||||
  if [[ $gpu_count -gt 0 ]]; then
 | 
			
		||||
    echo "GPU found."
 | 
			
		||||
  else
 | 
			
		||||
    echo "Need at least 1 GPU to run benchmarking."
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
 | 
			
		||||
  echo "GPU type is $gpu_type"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
check_hf_token() {
 | 
			
		||||
  # check if HF_TOKEN is available and valid
 | 
			
		||||
  if [[ -z "$HF_TOKEN" ]]; then
 | 
			
		||||
    echo "Error: HF_TOKEN is not set."
 | 
			
		||||
    exit 1
 | 
			
		||||
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
 | 
			
		||||
    echo "Error: HF_TOKEN does not start with 'hf_'."
 | 
			
		||||
    exit 1
 | 
			
		||||
  else
 | 
			
		||||
    echo "HF_TOKEN is set and valid."
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
upload_to_buildkite() {
 | 
			
		||||
  # upload the benchmarking results to buildkite
 | 
			
		||||
 | 
			
		||||
  # if the agent binary is not found, skip uploading the results and return 0
 | 
			
		||||
  if [ ! -f /workspace/buildkite-agent ]; then
 | 
			
		||||
    echo "buildkite-agent binary not found. Skip uploading the results."
 | 
			
		||||
    return 0
 | 
			
		||||
  fi
 | 
			
		||||
  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
 | 
			
		||||
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
get_current_llm_serving_engine() {
 | 
			
		||||
 | 
			
		||||
  if which lmdeploy >/dev/null; then
 | 
			
		||||
    echo "Container: lmdeploy"
 | 
			
		||||
    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
 | 
			
		||||
    return
 | 
			
		||||
  fi
 | 
			
		||||
 | 
			
		||||
  if [ -e /tgi-entrypoint.sh ]; then
 | 
			
		||||
    echo "Container: tgi"
 | 
			
		||||
    export CURRENT_LLM_SERVING_ENGINE=tgi
 | 
			
		||||
    return
 | 
			
		||||
  fi
 | 
			
		||||
 | 
			
		||||
  if which trtllm-build >/dev/null; then
 | 
			
		||||
    echo "Container: tensorrt-llm"
 | 
			
		||||
    export CURRENT_LLM_SERVING_ENGINE=trt
 | 
			
		||||
    return
 | 
			
		||||
  fi
 | 
			
		||||
 | 
			
		||||
  if [ -e /sgl-workspace ]; then
 | 
			
		||||
    echo "Container: sglang"
 | 
			
		||||
    export CURRENT_LLM_SERVING_ENGINE=sglang
 | 
			
		||||
    return
 | 
			
		||||
  fi
 | 
			
		||||
 | 
			
		||||
  if [ -e /vllm-workspace ]; then
 | 
			
		||||
    echo "Container: vllm"
 | 
			
		||||
    # move to a completely unrelated directory to avoid importing vllm from the current folder
 | 
			
		||||
    export CURRENT_LLM_SERVING_ENGINE=vllm
 | 
			
		||||
 | 
			
		||||
    return
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
json2args() {
 | 
			
		||||
  # transforms the JSON string into command-line args; '_' is replaced with '-'
 | 
			
		||||
  # example:
 | 
			
		||||
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
 | 
			
		||||
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
 | 
			
		||||
  local json_string=$1
 | 
			
		||||
  local args=$(
 | 
			
		||||
    echo "$json_string" | jq -r '
 | 
			
		||||
      to_entries |
 | 
			
		||||
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
 | 
			
		||||
      join(" ")
 | 
			
		||||
    '
 | 
			
		||||
  )
 | 
			
		||||
  echo "$args"
 | 
			
		||||
}
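# Not part of the diff: a minimal, self-contained sketch of what json2args is
# expected to produce, assuming only that jq is installed. The JSON mirrors the
# example in the comment above.
params='{ "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }'
echo "$params" | jq -r 'to_entries | map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | join(" ")'
# prints: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1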
 | 
			
		||||
 | 
			
		||||
kill_gpu_processes() {
 | 
			
		||||
  pkill -f '[p]ython'
 | 
			
		||||
  pkill -f '[p]ython3'
 | 
			
		||||
  pkill -f '[t]ritonserver'
 | 
			
		||||
  pkill -f '[p]t_main_thread'
 | 
			
		||||
  pkill -f '[t]ext-generation'
 | 
			
		||||
  pkill -f '[l]mdeploy'
 | 
			
		||||
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
 | 
			
		||||
  pkill -f '[V]LLM'
 | 
			
		||||
 | 
			
		||||
  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
 | 
			
		||||
    sleep 1
 | 
			
		||||
  done
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
wait_for_server() {
 | 
			
		||||
  # wait for the server to start
  # return 1 if the server does not come up within the timeout
 | 
			
		||||
  timeout 1200 bash -c '
 | 
			
		||||
    until curl -s localhost:8000/v1/completions > /dev/null; do
 | 
			
		||||
      sleep 1
 | 
			
		||||
    done' && return 0 || return 1
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
ensure_installed() {
 | 
			
		||||
  # Ensure that the given command is installed by apt-get
 | 
			
		||||
  local cmd=$1
 | 
			
		||||
  if ! which "$cmd" >/dev/null; then
 | 
			
		||||
    apt-get update && apt-get install -y "$cmd"
 | 
			
		||||
  fi
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
run_serving_tests() {
 | 
			
		||||
  # run serving tests using `vllm bench serve` command
 | 
			
		||||
  # $1: a json file specifying serving test cases
 | 
			
		||||
 | 
			
		||||
  local serving_test_file
 | 
			
		||||
  serving_test_file=$1
 | 
			
		||||
 | 
			
		||||
  # Iterate over serving tests
 | 
			
		||||
  jq -c '.[]' "$serving_test_file" | while read -r params; do
 | 
			
		||||
    # get the test name (the serving engine name is prepended to it below)
 | 
			
		||||
    test_name=$(echo "$params" | jq -r '.test_name')
 | 
			
		||||
 | 
			
		||||
    # if TEST_SELECTOR is set, only run the test cases that match the selector
 | 
			
		||||
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
 | 
			
		||||
      echo "Skip test case $test_name."
 | 
			
		||||
      continue
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    # prepend the current serving engine to the test name
 | 
			
		||||
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
 | 
			
		||||
 | 
			
		||||
    # get common parameters
 | 
			
		||||
    common_params=$(echo "$params" | jq -r '.common_parameters')
 | 
			
		||||
    model=$(echo "$common_params" | jq -r '.model')
 | 
			
		||||
    tp=$(echo "$common_params" | jq -r '.tp')
 | 
			
		||||
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
 | 
			
		||||
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
 | 
			
		||||
    port=$(echo "$common_params" | jq -r '.port')
 | 
			
		||||
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
 | 
			
		||||
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
 | 
			
		||||
 | 
			
		||||
    # get client and server arguments
 | 
			
		||||
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
 | 
			
		||||
    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
 | 
			
		||||
    client_args=$(json2args "$client_params")
 | 
			
		||||
    qps_list=$(echo "$params" | jq -r '.qps_list')
 | 
			
		||||
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
 | 
			
		||||
    echo "Running over qps list $qps_list"
 | 
			
		||||
 | 
			
		||||
    # check if there is enough GPU to run the test
 | 
			
		||||
    if [[ $gpu_count -lt $tp ]]; then
 | 
			
		||||
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
 | 
			
		||||
      continue
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    if [[ $reuse_server == "true" ]]; then
 | 
			
		||||
      echo "Reuse previous server for test case $test_name"
 | 
			
		||||
    else
 | 
			
		||||
      kill_gpu_processes
 | 
			
		||||
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
 | 
			
		||||
        "$server_params" "$common_params"
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    if wait_for_server; then
 | 
			
		||||
      echo ""
 | 
			
		||||
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
 | 
			
		||||
    else
 | 
			
		||||
      echo ""
 | 
			
		||||
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
 | 
			
		||||
      break
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    # prepare tokenizer
 | 
			
		||||
    # this is required for lmdeploy.
 | 
			
		||||
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
 | 
			
		||||
    rm -rf /tokenizer_cache
 | 
			
		||||
    mkdir /tokenizer_cache
 | 
			
		||||
    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
 | 
			
		||||
      --model "$model" \
 | 
			
		||||
      --cachedir /tokenizer_cache
 | 
			
		||||
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    # change the model name for lmdeploy (it does not follow the standard HF name)
 | 
			
		||||
    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
 | 
			
		||||
      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    # iterate over different QPS
 | 
			
		||||
    for qps in $qps_list; do
 | 
			
		||||
      # remove the surrounding single quote from qps
 | 
			
		||||
      if [[ "$qps" == *"inf"* ]]; then
 | 
			
		||||
        echo "qps was $qps"
 | 
			
		||||
        qps="inf"
 | 
			
		||||
        echo "now qps is $qps"
 | 
			
		||||
      fi
 | 
			
		||||
 | 
			
		||||
      new_test_name=$test_name"_qps_"$qps
 | 
			
		||||
 | 
			
		||||
      backend=$CURRENT_LLM_SERVING_ENGINE
 | 
			
		||||
 | 
			
		||||
      if [[ $backend = "trt" ]]; then
 | 
			
		||||
        backend="tensorrt-llm"
 | 
			
		||||
      fi
 | 
			
		||||
 | 
			
		||||
      if [[ "$backend" == *"vllm"* ]]; then
 | 
			
		||||
        backend="vllm"
 | 
			
		||||
      fi
 | 
			
		||||
 | 
			
		||||
      if [[ "$dataset_name" = "sharegpt" ]]; then
 | 
			
		||||
 | 
			
		||||
        client_command="vllm bench serve \
 | 
			
		||||
          --backend $backend \
 | 
			
		||||
          --tokenizer /tokenizer_cache \
 | 
			
		||||
          --model $model \
 | 
			
		||||
          --dataset-name $dataset_name \
 | 
			
		||||
          --dataset-path $dataset_path \
 | 
			
		||||
          --num-prompts $num_prompts \
 | 
			
		||||
          --port $port \
 | 
			
		||||
          --save-result \
 | 
			
		||||
          --result-dir $RESULTS_FOLDER \
 | 
			
		||||
          --result-filename ${new_test_name}.json \
 | 
			
		||||
          --request-rate $qps \
 | 
			
		||||
          --ignore-eos \
 | 
			
		||||
          $client_args"
 | 
			
		||||
 | 
			
		||||
      elif [[ "$dataset_name" = "sonnet" ]]; then
 | 
			
		||||
 | 
			
		||||
        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
 | 
			
		||||
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
 | 
			
		||||
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')
 | 
			
		||||
 | 
			
		||||
        client_command="vllm bench serve \
 | 
			
		||||
          --backend $backend \
 | 
			
		||||
          --tokenizer /tokenizer_cache \
 | 
			
		||||
          --model $model \
 | 
			
		||||
          --dataset-name $dataset_name \
 | 
			
		||||
          --dataset-path $dataset_path \
 | 
			
		||||
          --num-prompts $num_prompts \
 | 
			
		||||
          --sonnet-input-len $sonnet_input_len \
 | 
			
		||||
          --sonnet-output-len $sonnet_output_len \
 | 
			
		||||
          --sonnet-prefix-len $sonnet_prefix_len \
 | 
			
		||||
          --port $port \
 | 
			
		||||
          --save-result \
 | 
			
		||||
          --result-dir $RESULTS_FOLDER \
 | 
			
		||||
          --result-filename ${new_test_name}.json \
 | 
			
		||||
          --request-rate $qps \
 | 
			
		||||
          --ignore-eos \
 | 
			
		||||
          $client_args"
 | 
			
		||||
 | 
			
		||||
      else
 | 
			
		||||
 | 
			
		||||
        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
 | 
			
		||||
        exit 1
 | 
			
		||||
 | 
			
		||||
      fi
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
      echo "Running test case $test_name with qps $qps"
 | 
			
		||||
      echo "Client command: $client_command"
 | 
			
		||||
 | 
			
		||||
      eval "$client_command"
 | 
			
		||||
 | 
			
		||||
      server_command="None"
 | 
			
		||||
 | 
			
		||||
      # record the benchmarking commands
 | 
			
		||||
      jq_output=$(jq -n \
 | 
			
		||||
        --arg server "$server_command" \
 | 
			
		||||
        --arg client "$client_command" \
 | 
			
		||||
        --arg gpu "$gpu_type" \
 | 
			
		||||
        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
 | 
			
		||||
        '{
 | 
			
		||||
          server_command: $server,
 | 
			
		||||
          client_command: $client,
 | 
			
		||||
          gpu_type: $gpu,
 | 
			
		||||
          engine: $engine
 | 
			
		||||
        }')
 | 
			
		||||
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
 | 
			
		||||
 | 
			
		||||
    done
 | 
			
		||||
 | 
			
		||||
  done
 | 
			
		||||
 | 
			
		||||
  kill_gpu_processes
 | 
			
		||||
}
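# Not part of the diff: a hedged usage sketch. TEST_SELECTOR above is matched as an
# ordinary Bash regex against each test name, so a run restricted to the sharegpt
# serving cases could look like the following (assumes HF_TOKEN, VLLM_SOURCE_CODE_LOC,
# and a supported serving-engine container are already set up).
TEST_SELECTOR="sharegpt" \
  bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh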
 | 
			
		||||
 | 
			
		||||
run_genai_perf_tests() {
 | 
			
		||||
  # run genai-perf tests
 | 
			
		||||
 | 
			
		||||
  # $1: a json file specifying genai-perf test cases
 | 
			
		||||
  local genai_perf_test_file
 | 
			
		||||
  genai_perf_test_file=$1
 | 
			
		||||
 | 
			
		||||
  # Iterate over genai-perf tests
 | 
			
		||||
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
 | 
			
		||||
    # get the test name (the serving engine name is prepended to it below)
 | 
			
		||||
    test_name=$(echo "$params" | jq -r '.test_name')
 | 
			
		||||
 | 
			
		||||
    # if TEST_SELECTOR is set, only run the test cases that match the selector
 | 
			
		||||
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
 | 
			
		||||
      echo "Skip test case $test_name."
 | 
			
		||||
      continue
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    # prepend the current serving engine to the test name
 | 
			
		||||
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
 | 
			
		||||
 | 
			
		||||
    # get common parameters
 | 
			
		||||
    common_params=$(echo "$params" | jq -r '.common_parameters')
 | 
			
		||||
    model=$(echo "$common_params" | jq -r '.model')
 | 
			
		||||
    tp=$(echo "$common_params" | jq -r '.tp')
 | 
			
		||||
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
 | 
			
		||||
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
 | 
			
		||||
    port=$(echo "$common_params" | jq -r '.port')
 | 
			
		||||
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
 | 
			
		||||
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
 | 
			
		||||
 | 
			
		||||
    # get client and server arguments
 | 
			
		||||
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
 | 
			
		||||
    qps_list=$(echo "$params" | jq -r '.qps_list')
 | 
			
		||||
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
 | 
			
		||||
    echo "Running over qps list $qps_list"
 | 
			
		||||
 | 
			
		||||
    # check if there is enough GPU to run the test
 | 
			
		||||
    if [[ $gpu_count -lt $tp ]]; then
 | 
			
		||||
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
 | 
			
		||||
      continue
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    if [[ $reuse_server == "true" ]]; then
 | 
			
		||||
      echo "Reuse previous server for test case $test_name"
 | 
			
		||||
    else
 | 
			
		||||
      kill_gpu_processes
 | 
			
		||||
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
 | 
			
		||||
        "$server_params" "$common_params"
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    if wait_for_server; then
 | 
			
		||||
      echo ""
 | 
			
		||||
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
 | 
			
		||||
    else
 | 
			
		||||
      echo ""
 | 
			
		||||
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
 | 
			
		||||
      break
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    # iterate over different QPS
 | 
			
		||||
    for qps in $qps_list; do
 | 
			
		||||
      # remove the surrounding single quote from qps
 | 
			
		||||
      if [[ "$qps" == *"inf"* ]]; then
 | 
			
		||||
        echo "qps was $qps"
 | 
			
		||||
        qps=$num_prompts
 | 
			
		||||
        echo "now qps is $qps"
 | 
			
		||||
      fi
 | 
			
		||||
 | 
			
		||||
      new_test_name=$test_name"_qps_"$qps
 | 
			
		||||
      backend=$CURRENT_LLM_SERVING_ENGINE
 | 
			
		||||
 | 
			
		||||
      if [[ "$backend" == *"vllm"* ]]; then
 | 
			
		||||
        backend="vllm"
 | 
			
		||||
      fi
 | 
			
		||||
      #TODO: add output dir.
 | 
			
		||||
      client_command="genai-perf profile \
 | 
			
		||||
        -m $model \
 | 
			
		||||
        --service-kind openai \
 | 
			
		||||
        --backend "$backend" \
 | 
			
		||||
        --endpoint-type chat \
 | 
			
		||||
        --streaming \
 | 
			
		||||
        --url localhost:$port \
 | 
			
		||||
        --request-rate $qps \
 | 
			
		||||
        --num-prompts $num_prompts \
 | 
			
		||||
      "
 | 
			
		||||
 | 
			
		||||
    echo "Client command: $client_command"
 | 
			
		||||
 | 
			
		||||
    eval "$client_command"
 | 
			
		||||
 | 
			
		||||
    #TODO: process/record outputs
 | 
			
		||||
    done
 | 
			
		||||
  done
 | 
			
		||||
 | 
			
		||||
  kill_gpu_processes
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
prepare_dataset() {
 | 
			
		||||
 | 
			
		||||
  # download sharegpt dataset
 | 
			
		||||
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
 | 
			
		||||
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 | 
			
		||||
 | 
			
		||||
  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
 | 
			
		||||
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
 | 
			
		||||
  echo "" > sonnet_4x.txt
 | 
			
		||||
  for _ in {1..4}
 | 
			
		||||
  do
 | 
			
		||||
    cat sonnet.txt >> sonnet_4x.txt
 | 
			
		||||
  done
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
main() {
 | 
			
		||||
 | 
			
		||||
  # check if the environment variables are successfully injected from the yaml
 | 
			
		||||
 | 
			
		||||
  check_gpus
 | 
			
		||||
  check_hf_token
 | 
			
		||||
  get_current_llm_serving_engine
 | 
			
		||||
 | 
			
		||||
  pip install -U transformers
 | 
			
		||||
 | 
			
		||||
  pip install -r requirements/dev.txt
 | 
			
		||||
  which genai-perf
 | 
			
		||||
 | 
			
		||||
  # check storage
 | 
			
		||||
  df -h
 | 
			
		||||
 | 
			
		||||
  ensure_installed wget
 | 
			
		||||
  ensure_installed curl
 | 
			
		||||
  ensure_installed jq
 | 
			
		||||
  # genai-perf dependency
 | 
			
		||||
  ensure_installed libb64-0d
 | 
			
		||||
 | 
			
		||||
  prepare_dataset
 | 
			
		||||
 | 
			
		||||
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
 | 
			
		||||
  declare -g RESULTS_FOLDER=results/
 | 
			
		||||
  mkdir -p $RESULTS_FOLDER
 | 
			
		||||
  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
 | 
			
		||||
 | 
			
		||||
  # run the test
 | 
			
		||||
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
 | 
			
		||||
 | 
			
		||||
  # run genai-perf tests
 | 
			
		||||
  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
 | 
			
		||||
  mv artifacts/ $RESULTS_FOLDER/
 | 
			
		||||
 | 
			
		||||
  # upload benchmark results to buildkite
 | 
			
		||||
  python3 -m pip install tabulate pandas
 | 
			
		||||
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
 | 
			
		||||
  upload_to_buildkite
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
main "$@"
 | 
			
		||||
@ -15,8 +15,6 @@ check_gpus() {
 | 
			
		||||
    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
 | 
			
		||||
  elif command -v amd-smi; then
 | 
			
		||||
    declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
 | 
			
		||||
  elif command -v hl-smi; then
 | 
			
		||||
    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
 | 
			
		||||
  fi
 | 
			
		||||
 | 
			
		||||
  if [[ $gpu_count -gt 0 ]]; then
 | 
			
		||||
@ -25,16 +23,10 @@ check_gpus() {
 | 
			
		||||
    echo "Need at least 1 GPU to run benchmarking."
 | 
			
		||||
    exit 1
 | 
			
		||||
  fi
 | 
			
		||||
  
 | 
			
		||||
  declare -g arch_suffix=''
 | 
			
		||||
  
 | 
			
		||||
  if command -v nvidia-smi; then
 | 
			
		||||
    declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
 | 
			
		||||
  elif command -v amd-smi; then
 | 
			
		||||
    declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
 | 
			
		||||
  elif command -v hl-smi; then
 | 
			
		||||
    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
 | 
			
		||||
    arch_suffix='-hpu'
 | 
			
		||||
  fi
 | 
			
		||||
  echo "GPU type is $gpu_type"
 | 
			
		||||
}
 | 
			
		||||
@ -146,10 +138,6 @@ kill_gpu_processes() {
 | 
			
		||||
    while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
 | 
			
		||||
      sleep 1
 | 
			
		||||
    done
 | 
			
		||||
  elif command -v hl-smi; then
 | 
			
		||||
    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
 | 
			
		||||
      sleep 1
 | 
			
		||||
    done
 | 
			
		||||
  fi
 | 
			
		||||
 | 
			
		||||
  # remove vllm config file
 | 
			
		||||
@ -463,7 +451,6 @@ main() {
 | 
			
		||||
     ARCH='-cpu'
 | 
			
		||||
  else
 | 
			
		||||
     check_gpus
 | 
			
		||||
     ARCH="$arch_suffix"
 | 
			
		||||
  fi
 | 
			
		||||
  check_hf_token
 | 
			
		||||
 | 
			
		||||
@ -482,12 +469,7 @@ main() {
 | 
			
		||||
  ensure_sharegpt_downloaded
 | 
			
		||||
  declare -g RESULTS_FOLDER=results/
 | 
			
		||||
  mkdir -p $RESULTS_FOLDER
 | 
			
		||||
  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
 | 
			
		||||
 | 
			
		||||
  # dump vllm info via vllm collect-env
 | 
			
		||||
  env_output=$(vllm collect-env)
 | 
			
		||||
 | 
			
		||||
  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
 | 
			
		||||
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 | 
			
		||||
 | 
			
		||||
  # benchmarking
 | 
			
		||||
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
 | 
			
		||||
@ -0,0 +1,82 @@
 | 
			
		||||
# SPDX-License-Identifier: Apache-2.0
 | 
			
		||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 | 
			
		||||
 | 
			
		||||
import datetime
 | 
			
		||||
import json
 | 
			
		||||
import os
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
 | 
			
		||||
import pandas as pd
 | 
			
		||||
from tabulate import tabulate
 | 
			
		||||
 | 
			
		||||
results_folder = Path("results/")
 | 
			
		||||
 | 
			
		||||
# serving results and the keys that will be printed into markdown
 | 
			
		||||
serving_results = []
 | 
			
		||||
serving_column_mapping = {
 | 
			
		||||
    "test_name": "Test name",
 | 
			
		||||
    "gpu_type": "GPU",
 | 
			
		||||
    "completed": "Successful req.",
 | 
			
		||||
    "request_throughput": "Tput (req/s)",
 | 
			
		||||
    "mean_ttft_ms": "Mean TTFT (ms)",
 | 
			
		||||
    "std_ttft_ms": "Std TTFT (ms)",
 | 
			
		||||
    "median_ttft_ms": "Median TTFT (ms)",
 | 
			
		||||
    "mean_itl_ms": "Mean ITL (ms)",
 | 
			
		||||
    "std_itl_ms": "Std ITL (ms)",
 | 
			
		||||
    "median_itl_ms": "Median ITL (ms)",
 | 
			
		||||
    "mean_tpot_ms": "Mean TPOT (ms)",
 | 
			
		||||
    "std_tpot_ms": "Std TPOT (ms)",
 | 
			
		||||
    "median_tpot_ms": "Median TPOT (ms)",
 | 
			
		||||
    "total_token_throughput": "Total Token Tput (tok/s)",
 | 
			
		||||
    "output_throughput": "Output Tput (tok/s)",
 | 
			
		||||
    "total_input_tokens": "Total input tokens",
 | 
			
		||||
    "total_output_tokens": "Total output tokens",
 | 
			
		||||
    "engine": "Engine",
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    # collect results
 | 
			
		||||
    for test_file in results_folder.glob("*.json"):
 | 
			
		||||
        with open(test_file) as f:
 | 
			
		||||
            raw_result = json.loads(f.read())
 | 
			
		||||
 | 
			
		||||
        # attach the benchmarking command to raw_result
 | 
			
		||||
        with open(test_file.with_suffix(".commands")) as f:
 | 
			
		||||
            command = json.loads(f.read())
 | 
			
		||||
        raw_result.update(command)
 | 
			
		||||
 | 
			
		||||
        # update the test name of this result
 | 
			
		||||
        raw_result.update({"test_name": test_file.stem})
 | 
			
		||||
 | 
			
		||||
        # add the result to serving_results
 | 
			
		||||
        serving_results.append(raw_result)
 | 
			
		||||
        continue
 | 
			
		||||
 | 
			
		||||
    serving_results = pd.DataFrame.from_dict(serving_results)
 | 
			
		||||
 | 
			
		||||
    if not serving_results.empty:
 | 
			
		||||
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
 | 
			
		||||
            columns=serving_column_mapping
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    serving_md_table_with_headers = tabulate(
 | 
			
		||||
        serving_results, headers="keys", tablefmt="pipe", showindex=False
 | 
			
		||||
    )
 | 
			
		||||
    # drop the header rows so only the table body remains
 | 
			
		||||
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
 | 
			
		||||
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
 | 
			
		||||
 | 
			
		||||
    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
 | 
			
		||||
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
 | 
			
		||||
 | 
			
		||||
    # document benchmarking results in markdown
 | 
			
		||||
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
 | 
			
		||||
        # document results with the header row,
        # for those who want to reproduce our benchmark.
 | 
			
		||||
        f.write(serving_md_table_with_headers)
 | 
			
		||||
        f.write("\n")
 | 
			
		||||
 | 
			
		||||
    # document benchmarking results in json
 | 
			
		||||
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
 | 
			
		||||
        results = serving_results.to_dict(orient="records")
 | 
			
		||||
        f.write(json.dumps(results))
.buildkite/nightly-benchmarks/scripts/wait-for-image.sh (new file, 23 lines)
@ -0,0 +1,23 @@
 | 
			
		||||
#!/bin/sh
 | 
			
		||||
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
 | 
			
		||||
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
 | 
			
		||||
    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
 | 
			
		||||
else
 | 
			
		||||
    URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
TIMEOUT_SECONDS=10
 | 
			
		||||
 | 
			
		||||
retries=0
 | 
			
		||||
while [ $retries -lt 1000 ]; do
 | 
			
		||||
    if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
 | 
			
		||||
        exit 0
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    echo "Waiting for image to be available..."
 | 
			
		||||
 | 
			
		||||
    retries=$((retries + 1))
 | 
			
		||||
    sleep 5
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
exit 1
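# Not part of the diff: a hedged invocation sketch. The script above only reads
# BUILDKITE_BRANCH and BUILDKITE_COMMIT to decide which ECR manifest to poll;
# the commit hash below is a placeholder.
BUILDKITE_BRANCH=main BUILDKITE_COMMIT=0123456789abcdef \
  bash .buildkite/nightly-benchmarks/scripts/wait-for-image.sh && echo "image is available"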
.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json (new file, 30 lines)
@ -0,0 +1,30 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "latency_llama8B_tp1",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "num_iters_warmup": 5,
 | 
			
		||||
            "num_iters": 15
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "latency_llama8B_tp4",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "num_iters_warmup": 5,
 | 
			
		||||
            "num_iters": 15
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
@ -95,38 +95,6 @@
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
@ -265,41 +233,6 @@
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
@ -432,38 +365,6 @@
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp4_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
@ -602,41 +503,6 @@
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp4_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
@ -772,39 +638,6 @@
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp4_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
@ -947,42 +780,6 @@
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp4_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
	    "quantization": "awq",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
	    "enable_chunked_prefill": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 1000
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
 | 
			
		||||
        "qps_list": ["inf"],
 | 
			
		||||
@ -2,7 +2,7 @@
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp1_sharegpt",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "max_concurrency_list": [32],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
@ -28,13 +28,13 @@
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 32
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp2_sharegpt",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "max_concurrency_list": [32],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
@ -60,13 +60,13 @@
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 32
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp1_random_128_128",
 | 
			
		||||
        "test_name": "serving_llama8B_tp4_sharegpt",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "max_concurrency_list": [32],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
@ -76,7 +76,39 @@
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
	    "trust_remote_code": "",
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
	    "enforce_eager": "",
 | 
			
		||||
	    "max_num_batched_tokens": 2048,
 | 
			
		||||
	    "max_num_seqs": 256,
 | 
			
		||||
            "load_format": "dummy"
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp4_random_1024_128",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
@ -92,16 +124,16 @@
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-input-len": 1024,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 32
 | 
			
		||||
            "num_prompts": 100
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp2_random_128_128",
 | 
			
		||||
        "test_name": "serving_llama8B_pp6_random_1024_128",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "max_concurrency_list": [32],
 | 
			
		||||
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
@ -111,7 +143,7 @@
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "pipeline_parallel_size": 6,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
		||||
	    "distributed_executor_backend": "mp",
 | 
			
		||||
	    "block_size": 128,
 | 
			
		||||
@ -127,150 +159,10 @@
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "random",
 | 
			
		||||
	    "random-input-len": 128,
 | 
			
		||||
	    "random-input-len": 1024,
 | 
			
		||||
	    "random-output-len": 128,
 | 
			
		||||
	    "ignore-eos": "",
 | 
			
		||||
            "num_prompts": 32
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp1_random_128_2048",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "max_concurrency_list": [32],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "VLLM_RPC_TIMEOUT": 100000,
 | 
			
		||||
	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 | 
			
		||||
	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 | 
			
		||||
	    "VLLM_CPU_SGL_KERNEL": 1,
 | 
			
		||||
	    "VLLM_CPU_KVCACHE_SPACE": 40
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
	    "dtype": "bfloat16",
 | 
			
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 128,
            "random-output-len": 2048,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_128_2048",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 128,
            "random-output-len": 2048,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp1_random_2048_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 2048,
            "random-output-len": 128,
            "ignore-eos": "",
            "num_prompts": 32
        }
    },
    {
        "test_name": "serving_llama8B_tp2_random_2048_128",
        "qps_list": [1, 4, 16, "inf"],
        "max_concurrency_list": [32],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 2048,
            "random-output-len": 128,
            "ignore-eos": "",
            "num_prompts": 32
            "num_prompts": 100
        }
    }
]
@@ -0,0 +1,32 @@
[
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
]
@@ -1,26 +0,0 @@
[
    {
        "test_name": "latency_llama8B_tp2",
        "environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
]
@ -1,55 +0,0 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "latency_llama8B_tp1",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
            "PT_HPU_LAZY_MODE": 1,
 | 
			
		||||
            "VLLM_CONTIGUOUS_PA": 1,
 | 
			
		||||
            "VLLM_DEFRAG": 1
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "num-iters-warmup": 5,
 | 
			
		||||
            "num-iters": 15,
 | 
			
		||||
            "max-model-len": 256,
 | 
			
		||||
            "async-scheduling": ""
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "latency_llama70B_tp4",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
            "PT_HPU_LAZY_MODE": 1,
 | 
			
		||||
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
 | 
			
		||||
            "VLLM_CONTIGUOUS_PA": 1,
 | 
			
		||||
            "VLLM_DEFRAG": 1
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "num-iters-warmup": 5,
 | 
			
		||||
            "num-iters": 15,
 | 
			
		||||
            "max-model-len": 256,
 | 
			
		||||
            "async-scheduling": ""
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "latency_mixtral8x7B_tp2",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
            "PT_HPU_LAZY_MODE": 1,
 | 
			
		||||
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
 | 
			
		||||
            "VLLM_CONTIGUOUS_PA": 1,
 | 
			
		||||
            "VLLM_DEFRAG": 1
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "num-iters-warmup": 5,
 | 
			
		||||
            "num-iters": 15,
 | 
			
		||||
            "max-model-len": 256,
 | 
			
		||||
            "async-scheduling": ""
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
@ -1,82 +0,0 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama8B_tp1_sharegpt",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "PT_HPU_LAZY_MODE": 1,
 | 
			
		||||
            "VLLM_CONTIGUOUS_PA": 1,
 | 
			
		||||
            "VLLM_DEFRAG": 1
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
            "swap_space": 16,
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "max-model-len": 2048,
 | 
			
		||||
            "max-num-seqs": 256,
 | 
			
		||||
            "async-scheduling": ""
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_llama70B_tp4_sharegpt",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "PT_HPU_LAZY_MODE": 1,
 | 
			
		||||
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
 | 
			
		||||
            "VLLM_CONTIGUOUS_PA": 1,
 | 
			
		||||
            "VLLM_DEFRAG": 1
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
            "swap_space": 16,
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "max-model-len": 2048,
 | 
			
		||||
            "max-num-seqs": 256,
 | 
			
		||||
            "async-scheduling": ""
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
 | 
			
		||||
        "qps_list": [1, 4, 16, "inf"],
 | 
			
		||||
        "server_environment_variables": {
 | 
			
		||||
            "PT_HPU_LAZY_MODE": 1,
 | 
			
		||||
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
 | 
			
		||||
            "VLLM_CONTIGUOUS_PA": 1,
 | 
			
		||||
            "VLLM_DEFRAG": 1
 | 
			
		||||
        },
 | 
			
		||||
        "server_parameters": {
 | 
			
		||||
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "swap_space": 16,
 | 
			
		||||
            "disable_log_stats": "",
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "max-model-len": 2048,
 | 
			
		||||
            "max-num-seqs": 256,
 | 
			
		||||
            "async-scheduling": ""
 | 
			
		||||
        },
 | 
			
		||||
        "client_parameters": {
 | 
			
		||||
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "dataset_name": "sharegpt",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 200
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
@@ -1,27 +0,0 @@
[
    {
        "test_name": "throughput_llama8B_tp2",
        "environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
]
@ -1,61 +0,0 @@
 | 
			
		||||
[
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "throughput_llama8B_tp1",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
            "PT_HPU_LAZY_MODE": 1,
 | 
			
		||||
            "VLLM_CONTIGUOUS_PA": 1,
 | 
			
		||||
            "VLLM_DEFRAG": 1
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 1,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 1000,
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "max-model-len": 2048,
 | 
			
		||||
            "max-num-seqs": 512,
 | 
			
		||||
            "async-scheduling": ""
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "throughput_llama70B_tp4",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
            "PT_HPU_LAZY_MODE": 1,
 | 
			
		||||
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
 | 
			
		||||
            "VLLM_CONTIGUOUS_PA": 1,
 | 
			
		||||
            "VLLM_DEFRAG": 1
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
 | 
			
		||||
            "tensor_parallel_size": 4,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 1000,
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "max-model-len": 2048,
 | 
			
		||||
            "max-num-seqs": 512,
 | 
			
		||||
            "async-scheduling": ""
 | 
			
		||||
        }
 | 
			
		||||
    },
 | 
			
		||||
    {
 | 
			
		||||
        "test_name": "throughput_mixtral8x7B_tp2",
 | 
			
		||||
        "environment_variables": {
 | 
			
		||||
            "PT_HPU_LAZY_MODE": 1,
 | 
			
		||||
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
 | 
			
		||||
            "VLLM_CONTIGUOUS_PA": 1,
 | 
			
		||||
            "VLLM_DEFRAG": 1
 | 
			
		||||
        },
 | 
			
		||||
        "parameters": {
 | 
			
		||||
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
 | 
			
		||||
            "tensor_parallel_size": 2,
 | 
			
		||||
            "load_format": "dummy",
 | 
			
		||||
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
 | 
			
		||||
            "num_prompts": 1000,
 | 
			
		||||
            "backend": "vllm",
 | 
			
		||||
            "max-model-len": 2048,
 | 
			
		||||
            "max-num-seqs": 512,
 | 
			
		||||
            "async-scheduling": ""
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
]
 | 
			
		||||
@ -1,5 +1,5 @@
 | 
			
		||||
steps:
 | 
			
		||||
  # aarch64 + CUDA builds
 | 
			
		||||
  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
 | 
			
		||||
  - label: "Build arm64 wheel - CUDA 12.9"
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
    id: build-wheel-arm64-cuda-12-9
 | 
			
		||||
@ -15,21 +15,6 @@ steps:
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  # aarch64 build
 | 
			
		||||
  - label: "Build arm64 CPU wheel"
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
    id: build-wheel-arm64-cpu
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: arm64_cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
 | 
			
		||||
      - "mkdir artifacts"
 | 
			
		||||
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
 | 
			
		||||
      - "bash .buildkite/scripts/upload-wheels.sh"
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  # x86 + CUDA builds
 | 
			
		||||
  - label: "Build wheel - CUDA 12.8"
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
    id: build-wheel-cuda-12-8
 | 
			
		||||
@ -43,6 +28,20 @@ steps:
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  - label: "Build wheel - CUDA 12.6"
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
    id: build-wheel-cuda-12-6
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
 | 
			
		||||
      - "mkdir artifacts"
 | 
			
		||||
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
 | 
			
		||||
      - "bash .buildkite/scripts/upload-wheels.sh"
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  # x86 + CUDA builds
 | 
			
		||||
  - label: "Build wheel - CUDA 12.9"
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
    id: build-wheel-cuda-12-9
 | 
			
		||||
@ -56,20 +55,6 @@ steps:
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  - label: "Build wheel - CUDA 13.0"
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
    id: build-wheel-cuda-13-0
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
 | 
			
		||||
      - "mkdir artifacts"
 | 
			
		||||
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
 | 
			
		||||
      - "bash .buildkite/scripts/upload-wheels.sh"
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  # Build release images (12.9)
 | 
			
		||||
  - label: "Build release image (x86)"
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
    id: build-release-image-x86
 | 
			
		||||
@ -77,12 +62,13 @@ steps:
 | 
			
		||||
      queue: cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
 | 
			
		||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 | 
			
		||||
      # re-tag to default image tag and push, just in case arm64 build fails
 | 
			
		||||
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 | 
			
		||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 | 
			
		||||
 | 
			
		||||
  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
 | 
			
		||||
  - label: "Build release image (arm64)"
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
    id: build-release-image-arm64
 | 
			
		||||
@ -156,22 +142,6 @@ steps:
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  - block: "Build arm64 CPU release image"
 | 
			
		||||
    key: block-arm64-cpu-release-image-build
 | 
			
		||||
    depends_on: ~
 | 
			
		||||
 | 
			
		||||
  - label: "Build and publish arm64 CPU release image"
 | 
			
		||||
    depends_on: block-arm64-cpu-release-image-build
 | 
			
		||||
    agents:
 | 
			
		||||
      queue: arm64_cpu_queue_postmerge
 | 
			
		||||
    commands:
 | 
			
		||||
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
			
		||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
 | 
			
		||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
 | 
			
		||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
 | 
			
		||||
    env:
 | 
			
		||||
      DOCKER_BUILDKIT: "1"
 | 
			
		||||
 | 
			
		||||
  - label: "Build and publish nightly multi-arch image to DockerHub"
 | 
			
		||||
    depends_on:
 | 
			
		||||
      - create-multi-arch-manifest
 | 
			
		||||
 | 
			
		||||
@@ -20,10 +20,7 @@ trap remove_docker_container EXIT

# Run the image and test offline inference/tensor parallel
docker run \
    --device /dev/dri:/dev/dri \
    --net=host \
    --ipc=host \
    --privileged \
    --device /dev/dri \
    -v /dev/dri/by-path:/dev/dri/by-path \
    --entrypoint="" \
    -e "HF_TOKEN=${HF_TOKEN}" \
@@ -45,7 +42,7 @@ docker run \
    pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
    pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
    pytest -v -s v1/structured_output
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
    pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
    pytest -v -s v1/test_serial_utils.py
'

@@ -1,62 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8010}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="deepseek-ai/DeepSeek-V2-lite"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -1,61 +0,0 @@
#!/usr/bin/env bash
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"

wait_for_server() {
  local port=$1
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
      sleep 1
    done'
}

MODEL="QWen/Qwen3-30B-A3B-FP8"
BACKENDS=("deepep_high_throughput" "deepep_low_latency")

cleanup() {
  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" 2>/dev/null || true
    for _ in {1..20}; do
      kill -0 "${SERVER_PID}" 2>/dev/null || break
      sleep 0.5
    done
    kill -9 "${SERVER_PID}" 2>/dev/null || true
  fi
}
trap cleanup EXIT

for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --trust-remote-code \
    --max-model-len 2048 \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT

  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  PORT=$((PORT+1))
done
@@ -58,25 +58,33 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

if [[ $normal_wheel == *"cu129"* ]]; then
if [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
    # if $normal_wheel matches cu128, do not upload the index.html
    echo "Skipping index files for cu128 wheels"
else
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
else
    echo "Skipping index files for non-cu129 wheels"
fi

# generate index for nightly
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

if [[ $normal_wheel == *"cu129"* ]]; then
if [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
elif [[ $normal_wheel == *"cu128"* ]]; then
    # if $normal_wheel matches cu128, do not upload the index.html
    echo "Skipping index files for cu128 wheels"
else
    # only upload index.html for cu129 wheels (default wheels) as it
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
else
    echo "Skipping index files for non-cu129 wheels"
fi

aws s3 cp "$wheel" "s3://vllm-wheels/$version/"

@ -38,7 +38,7 @@ steps:
 | 
			
		||||
- label: Pytorch Nightly Dependency Override Check # 2min
 | 
			
		||||
  # if this test fails, it means the nightly torch version is not compatible with some
 | 
			
		||||
  # of the dependencies. Please check the error message and add the package to whitelist
 | 
			
		||||
  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
 | 
			
		||||
  # in /vllm/tools/generate_nightly_torch_test.py
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  agent_pool: mi325_1
 | 
			
		||||
  # grade: Blocking
 | 
			
		||||
@ -50,7 +50,7 @@ steps:
 | 
			
		||||
 | 
			
		||||
- label: Async Engine, Inputs, Utils, Worker Test # 36min
 | 
			
		||||
  timeout_in_minutes: 50
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  agent_pool: mi325_1
 | 
			
		||||
  # grade: Blocking
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -286,7 +286,7 @@ steps:
 | 
			
		||||
 | 
			
		||||
- label: Engine Test # 25min
 | 
			
		||||
  timeout_in_minutes: 40
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  agent_pool: mi325_1
 | 
			
		||||
  #grade: Blocking
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -318,7 +318,7 @@ steps:
 | 
			
		||||
 | 
			
		||||
- label: V1 Test entrypoints # 35min
 | 
			
		||||
  timeout_in_minutes: 50
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  agent_pool: mi325_1
 | 
			
		||||
  # grade: Blocking
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -395,9 +395,7 @@ steps:
 | 
			
		||||
    - python3 offline_inference/basic/embed.py
 | 
			
		||||
    - python3 offline_inference/basic/score.py
 | 
			
		||||
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 | 
			
		||||
    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
 | 
			
		||||
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 | 
			
		||||
    #- python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 | 
			
		||||
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 | 
			
		||||
 | 
			
		||||
- label: Platform Tests (CUDA) # 4min
 | 
			
		||||
  timeout_in_minutes: 15
 | 
			
		||||
@ -438,11 +436,7 @@ steps:
 | 
			
		||||
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
 | 
			
		||||
      --ignore=lora/test_chatglm3_tp.py \
 | 
			
		||||
      --ignore=lora/test_llama_tp.py \
 | 
			
		||||
      --ignore=lora/test_llm_with_multi_loras.py \
 | 
			
		||||
      --ignore=lora/test_olmoe_tp.py \
 | 
			
		||||
      --ignore=lora/test_deepseekv2_tp.py \
 | 
			
		||||
      --ignore=lora/test_gptoss_tp.py \
 | 
			
		||||
      --ignore=lora/test_qwen3moe_tp.py
 | 
			
		||||
      --ignore=lora/test_llm_with_multi_loras.py
 | 
			
		||||
  parallelism: 4
 | 
			
		||||
 | 
			
		||||
- label: PyTorch Compilation Unit Tests # 15min
 | 
			
		||||
@ -460,8 +454,8 @@ steps:
 | 
			
		||||
    - pytest -v -s compile/test_fusion_attn.py
 | 
			
		||||
    - pytest -v -s compile/test_functionalization.py
 | 
			
		||||
    - pytest -v -s compile/test_silu_mul_quant_fusion.py
 | 
			
		||||
  #  - pytest -v -s compile/test_sequence_parallelism.py
 | 
			
		||||
  #  - pytest -v -s compile/test_async_tp.py
 | 
			
		||||
    - pytest -v -s compile/test_sequence_parallelism.py
 | 
			
		||||
    - pytest -v -s compile/test_async_tp.py
 | 
			
		||||
    - pytest -v -s compile/test_fusion_all_reduce.py
 | 
			
		||||
    - pytest -v -s compile/test_decorator.py
 | 
			
		||||
    - pytest -v -s compile/test_noop_elimination.py
 | 
			
		||||
@ -480,8 +474,8 @@ steps:
 | 
			
		||||
  - pytest -v -s compile/test_basic_correctness.py
 | 
			
		||||
  - pytest -v -s compile/piecewise/
 | 
			
		||||
 | 
			
		||||
- label: PyTorch Fullgraph Test # 22min
 | 
			
		||||
  timeout_in_minutes: 35
 | 
			
		||||
- label: PyTorch Fullgraph Test # 20min
 | 
			
		||||
  timeout_in_minutes: 30
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  agent_pool: mi325_1
 | 
			
		||||
  # grade: Blocking
 | 
			
		||||
@ -491,7 +485,6 @@ steps:
 | 
			
		||||
  - tests/compile
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -v -s compile/test_full_graph.py
 | 
			
		||||
  - pytest -v -s compile/test_fusions_e2e.py
 | 
			
		||||
 | 
			
		||||
- label: Kernels Core Operation Test # 48min
 | 
			
		||||
  timeout_in_minutes: 75
 | 
			
		||||
@ -501,7 +494,6 @@ steps:
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/
 | 
			
		||||
  - tests/kernels/core
 | 
			
		||||
  - tests/kernels/test_top_k_per_row.py
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
 | 
			
		||||
 | 
			
		||||
@ -561,7 +553,7 @@ steps:
 | 
			
		||||
 | 
			
		||||
- label: Model Executor Test # 23min
 | 
			
		||||
  timeout_in_minutes: 35
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  agent_pool: mi325_1
 | 
			
		||||
  # grade: Blocking
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -614,7 +606,7 @@ steps:
 | 
			
		||||
  # we can only upgrade after this is resolved
 | 
			
		||||
  # TODO(jerryzh168): resolve the above comment
 | 
			
		||||
  - uv pip install --system torchao==0.13.0
 | 
			
		||||
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 | 
			
		||||
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
 | 
			
		||||
 | 
			
		||||
- label: LM Eval Small Models # 53min
 | 
			
		||||
  timeout_in_minutes: 75
 | 
			
		||||
@ -789,10 +781,8 @@ steps:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models/language/generation
 | 
			
		||||
  commands:
 | 
			
		||||
    # Install fast path packages for testing against transformers
 | 
			
		||||
    # Note: also needed to run plamo2 model in vLLM
 | 
			
		||||
    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
 | 
			
		||||
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
 | 
			
		||||
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
 | 
			
		||||
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
 | 
			
		||||
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 | 
			
		||||
 | 
			
		||||
- label: Language Models Test (PPL)
 | 
			
		||||
@ -858,18 +848,6 @@ steps:
 | 
			
		||||
    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
 | 
			
		||||
    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 | 
			
		||||
 | 
			
		||||
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  agent_pool: mi325_1
 | 
			
		||||
  timeout_in_minutes: 70
 | 
			
		||||
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - vllm/multimodal/
 | 
			
		||||
  - vllm/inputs/
 | 
			
		||||
  - vllm/v1/core/
 | 
			
		||||
  commands:
 | 
			
		||||
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
 | 
			
		||||
 | 
			
		||||
- label: Multi-Modal Models Test (Extended) 1
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  agent_pool: mi325_1
 | 
			
		||||
@ -908,7 +886,7 @@ steps:
 | 
			
		||||
 | 
			
		||||
- label: Quantized Models Test # 45 min
 | 
			
		||||
  timeout_in_minutes: 60
 | 
			
		||||
  mirror_hardwares: [amdexperimental, amdproduction]
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  agent_pool: mi325_1
 | 
			
		||||
  # grade: Blocking
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
@ -945,8 +923,8 @@ steps:
 | 
			
		||||
    # Whisper needs spawn method to avoid deadlock
 | 
			
		||||
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 | 
			
		||||
 | 
			
		||||
- label: Blackwell Test # 21 min
 | 
			
		||||
  timeout_in_minutes: 30
 | 
			
		||||
- label: Blackwell Test # 38 min
 | 
			
		||||
  timeout_in_minutes: 60
 | 
			
		||||
  working_dir: "/vllm-workspace/"
 | 
			
		||||
  gpu: b200
 | 
			
		||||
  # optional: true
 | 
			
		||||
@ -959,6 +937,8 @@ steps:
 | 
			
		||||
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
 | 
			
		||||
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
 | 
			
		||||
  - vllm/v1/attention/backends/flashinfer.py
 | 
			
		||||
  - vllm/compilation/fusion.py
 | 
			
		||||
  - vllm/compilation/fusion_attn.py
 | 
			
		||||
  commands:
 | 
			
		||||
    - nvidia-smi
 | 
			
		||||
    - python3 examples/offline_inference/basic/chat.py
 | 
			
		||||
@ -975,32 +955,13 @@ steps:
 | 
			
		||||
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
 | 
			
		||||
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
 | 
			
		||||
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
 | 
			
		||||
    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
 | 
			
		||||
    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
 | 
			
		||||
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
 | 
			
		||||
    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
 | 
			
		||||
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
 | 
			
		||||
 | 
			
		||||
- label: Blackwell Fusion Tests # 30 min
 | 
			
		||||
  timeout_in_minutes: 40
 | 
			
		||||
  working_dir: "/vllm-workspace/"
 | 
			
		||||
  gpu: b200
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - csrc/quantization/fp4/
 | 
			
		||||
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
 | 
			
		||||
  - vllm/v1/attention/backends/flashinfer.py
 | 
			
		||||
  - vllm/compilation/
 | 
			
		||||
  # can affect pattern matching
 | 
			
		||||
  - vllm/model_executor/layers/layernorm.py
 | 
			
		||||
  - vllm/model_executor/layers/activation.py
 | 
			
		||||
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
 | 
			
		||||
  commands:
 | 
			
		||||
    - nvidia-smi
 | 
			
		||||
    - pytest -v -s tests/compile/test_fusion_attn.py
 | 
			
		||||
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 | 
			
		||||
    # this runner has 2 GPUs available even though num_gpus=2 is not set
 | 
			
		||||
    # Fusion
 | 
			
		||||
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
 | 
			
		||||
    - pytest -v -s tests/compile/test_fusions_e2e.py
 | 
			
		||||
    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
 | 
			
		||||
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
 | 
			
		||||
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 | 
			
		||||
 | 
			
		||||
- label: Blackwell GPT-OSS Eval
 | 
			
		||||
  timeout_in_minutes: 60
 | 
			
		||||
@ -1120,7 +1081,6 @@ steps:
 | 
			
		||||
  - pytest -v -s ./compile/test_basic_correctness.py
 | 
			
		||||
  - pytest -v -s ./compile/test_wrapper.py
 | 
			
		||||
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
 | 
			
		||||
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
 | 
			
		||||
  - pytest -v -s distributed/test_sequence_parallel.py
 | 
			
		||||
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 | 
			
		||||
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
 | 
			
		||||
@ -1168,11 +1128,6 @@ steps:
 | 
			
		||||
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
 | 
			
		||||
  - pip uninstall prithvi_io_processor_plugin -y
 | 
			
		||||
  # end io_processor plugins test
 | 
			
		||||
  # begin stat_logger plugins test
 | 
			
		||||
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
 | 
			
		||||
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
 | 
			
		||||
  - pip uninstall dummy_stat_logger -y
 | 
			
		||||
  # end stat_logger plugins test
 | 
			
		||||
  # other tests continue here:
 | 
			
		||||
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
 | 
			
		||||
  - pip install -e ./plugins/vllm_add_dummy_model
 | 
			
		||||
@ -1216,8 +1171,6 @@ steps:
 | 
			
		||||
    - pytest -v -s -x lora/test_chatglm3_tp.py
 | 
			
		||||
    - pytest -v -s -x lora/test_llama_tp.py
 | 
			
		||||
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
 | 
			
		||||
    - pytest -v -s -x lora/test_olmoe_tp.py
 | 
			
		||||
    - pytest -v -s -x lora/test_gptoss_tp.py
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
- label: Weight Loading Multiple GPU Test  # 33min
 | 
			
		||||
@ -1248,18 +1201,6 @@ steps:
 | 
			
		||||
  commands:
 | 
			
		||||
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 | 
			
		||||
 | 
			
		||||
- label: NixlConnector PD accuracy tests (Distributed) # 30min
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  agent_pool: mi325_4
 | 
			
		||||
  timeout_in_minutes: 30
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  num_gpus: 4
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
 | 
			
		||||
    - tests/v1/kv_connector/nixl_integration/
 | 
			
		||||
  commands:
 | 
			
		||||
    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
 | 
			
		||||
    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
 | 
			
		||||
 | 
			
		||||
##### multi gpus test #####
 | 
			
		||||
##### A100 test #####
 | 
			
		||||
@ -1291,16 +1232,12 @@ steps:
 | 
			
		||||
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 | 
			
		||||
 | 
			
		||||
##### H200 test #####
 | 
			
		||||
- label: Distributed Tests (H200) # optional
 | 
			
		||||
- label: Distrubted Tests (H200) # optional
 | 
			
		||||
  gpu: h200
 | 
			
		||||
  optional: true
 | 
			
		||||
  working_dir: "/vllm-workspace/"
 | 
			
		||||
  num_gpus: 2
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s tests/compile/test_async_tp.py
 | 
			
		||||
    - pytest -v -s tests/compile/test_sequence_parallelism.py
 | 
			
		||||
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
 | 
			
		||||
    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
 | 
			
		||||
    - pytest -v -s tests/distributed/test_context_parallel.py
 | 
			
		||||
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -38,7 +38,7 @@ steps:
 | 
			
		||||
- label: Pytorch Nightly Dependency Override Check # 2min
 | 
			
		||||
  # if this test fails, it means the nightly torch version is not compatible with some
 | 
			
		||||
  # of the dependencies. Please check the error message and add the package to whitelist
 | 
			
		||||
  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
 | 
			
		||||
  # in /vllm/tools/generate_nightly_torch_test.py
 | 
			
		||||
  soft_fail: true
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - requirements/nightly_torch_test.txt
 | 
			
		||||
@ -172,8 +172,6 @@ steps:
 | 
			
		||||
  - tests/v1/engine/test_engine_core_client.py
 | 
			
		||||
  - tests/distributed/test_symm_mem_allreduce.py
 | 
			
		||||
  commands:
 | 
			
		||||
  # https://github.com/NVIDIA/nccl/issues/1838
 | 
			
		||||
  - export NCCL_CUMEM_HOST_ENABLE=0
 | 
			
		||||
  # test with torchrun tp=2 and external_dp=2
 | 
			
		||||
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
 | 
			
		||||
  # test with torchrun tp=2 and pp=2
 | 
			
		||||
@ -205,24 +203,6 @@ steps:
 | 
			
		||||
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
 | 
			
		||||
  - popd
 | 
			
		||||
 | 
			
		||||
- label: Distributed Tests (8 GPUs) # 4min
 | 
			
		||||
  timeout_in_minutes: 10
 | 
			
		||||
  gpu: h100
 | 
			
		||||
  num_gpus: 8
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - examples/offline_inference/torchrun_dp_example.py
 | 
			
		||||
  - vllm/config/parallel.py
 | 
			
		||||
  - vllm/distributed/
 | 
			
		||||
  - vllm/v1/engine/llm_engine.py
 | 
			
		||||
  - vllm/v1/executor/uniproc_executor.py
 | 
			
		||||
  - vllm/v1/worker/gpu_worker.py
 | 
			
		||||
  commands:
 | 
			
		||||
  # https://github.com/NVIDIA/nccl/issues/1838
 | 
			
		||||
  - export NCCL_CUMEM_HOST_ENABLE=0
 | 
			
		||||
  # test with torchrun tp=2 and dp=4 with ep
 | 
			
		||||
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
 | 
			
		||||
 | 
			
		||||
- label: EPLB Algorithm Test # 5min
 | 
			
		||||
  timeout_in_minutes: 15
 | 
			
		||||
  working_dir: "/vllm-workspace/tests"
 | 
			
		||||
@ -331,15 +311,6 @@ steps:
 | 
			
		||||
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
 | 
			
		||||
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 | 
			
		||||
 | 
			
		||||
- label: V1 Test attention (H100) # 10min
 | 
			
		||||
  timeout_in_minutes: 30
 | 
			
		||||
  gpu: h100
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
    - vllm/v1/attention
 | 
			
		||||
    - tests/v1/attention
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s v1/attention
 | 
			
		||||
 | 
			
		||||
- label: V1 Test others (CPU) # 5 mins
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
    - vllm/
 | 
			
		||||
@ -378,8 +349,7 @@ steps:
 | 
			
		||||
    - python3 offline_inference/basic/embed.py
 | 
			
		||||
    - python3 offline_inference/basic/score.py
 | 
			
		||||
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 | 
			
		||||
    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
 | 
			
		||||
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 | 
			
		||||
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
 | 
			
		||||
 | 
			
		||||
- label: Platform Tests (CUDA) # 4min
 | 
			
		||||
  timeout_in_minutes: 15
 | 
			
		||||
@ -414,12 +384,7 @@ steps:
 | 
			
		||||
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
 | 
			
		||||
      --ignore=lora/test_chatglm3_tp.py \
 | 
			
		||||
      --ignore=lora/test_llama_tp.py \
 | 
			
		||||
      --ignore=lora/test_llm_with_multi_loras.py \
 | 
			
		||||
      --ignore=lora/test_olmoe_tp.py \
 | 
			
		||||
      --ignore=lora/test_deepseekv2_tp.py \
 | 
			
		||||
      --ignore=lora/test_gptoss_tp.py \
 | 
			
		||||
      --ignore=lora/test_qwen3moe_tp.py
 | 
			
		||||
 | 
			
		||||
      --ignore=lora/test_llm_with_multi_loras.py
 | 
			
		||||
  parallelism: 4
 | 
			
		||||
 | 
			
		||||
- label: PyTorch Compilation Unit Tests # 15min
 | 
			
		||||
@ -462,18 +427,6 @@ steps:
 | 
			
		||||
  - pytest -v -s compile/test_full_graph.py
 | 
			
		||||
  - pytest -v -s compile/test_fusions_e2e.py
 | 
			
		||||
 | 
			
		||||
- label: Cudagraph test
 | 
			
		||||
  timeout_in_minutes: 20
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
  source_file_dependencies:
 | 
			
		||||
  - tests/v1/cudagraph
 | 
			
		||||
  - vllm/v1/cudagraph_dispatcher.py
 | 
			
		||||
  - vllm/config/compilation.py
 | 
			
		||||
  - vllm/compilation
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
 | 
			
		||||
    - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
 | 
			
		||||
 | 
			
		||||
- label: Kernels Core Operation Test # 48min
 | 
			
		||||
  timeout_in_minutes: 75
 | 
			
		||||
  mirror_hardwares: [amdexperimental]
 | 
			
		||||
@ -516,8 +469,6 @@ steps:
 | 
			
		||||
  - tests/kernels/moe
 | 
			
		||||
  - vllm/model_executor/layers/fused_moe/
 | 
			
		||||
  - vllm/distributed/device_communicators/
 | 
			
		||||
  - vllm/envs.py
 | 
			
		||||
  - vllm/config
 | 
			
		||||
  commands:
 | 
			
		||||
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
 | 
			
		||||
  parallelism: 2
 | 
			
		||||
@ -578,7 +529,7 @@ steps:
 | 
			
		||||
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
 | 
			
		||||
  # we can only upgrade after this is resolved
 | 
			
		||||
  # TODO(jerryzh168): resolve the above comment
 | 
			
		||||
  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
 | 
			
		||||
  - uv pip install --system torchao==0.13.0
 | 
			
		||||
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 | 
			
		||||
 | 
			
		||||
- label: LM Eval Small Models # 53min
 | 
			
		||||
@ -728,10 +679,8 @@ steps:
 | 
			
		||||
  - vllm/
 | 
			
		||||
  - tests/models/language/generation
 | 
			
		||||
  commands:
 | 
			
		||||
    # Install fast path packages for testing against transformers
 | 
			
		||||
    # Note: also needed to run plamo2 model in vLLM
 | 
			
		||||
    - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
 | 
			
		||||
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
 | 
			
		||||
    # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
 | 
			
		||||
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
 | 
			
		||||
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 | 
			
		||||
 | 
			
		||||
- label: Language Models Test (PPL)
 | 
			
		||||
@ -1021,8 +970,6 @@ steps:
 | 
			
		||||
  - tests/v1/shutdown
 | 
			
		||||
  - tests/v1/worker/test_worker_memory_snapshot.py
 | 
			
		||||
  commands:
 | 
			
		||||
  # https://github.com/NVIDIA/nccl/issues/1838
 | 
			
		||||
  - export NCCL_CUMEM_HOST_ENABLE=0
 | 
			
		||||
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
 | 
			
		||||
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
 | 
			
		||||
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
 | 
			
		||||
@ -1030,7 +977,6 @@ steps:
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
@ -1118,8 +1064,6 @@ steps:
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
    - pytest -v -s -x lora/test_olmoe_tp.py
    - pytest -v -s -x lora/test_gptoss_tp.py

- label: Weight Loading Multiple GPU Test  # 33min
@ -1145,7 +1089,7 @@ steps:
  - tests/weight_loading
  commands:
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

- label: NixlConnector PD accuracy tests (Distributed) # 30min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
@ -1187,19 +1131,6 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

##### H100 test #####
- label: LM Eval Large Models (H100) # optional
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
    - export VLLM_USE_DEEP_GEMM=0  # We found Triton is faster than DeepGEMM for H100
    - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4

##### H200 test #####
- label: Distributed Tests (H200) # optional
  gpu: h200
@ -1235,21 +1166,3 @@ steps:
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
    - bash .buildkite/scripts/run-prime-rl-test.sh

- label: DeepSeek V2-Lite Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010

- label: Qwen3-30B-A3B-FP8-block Accuracy
  timeout_in_minutes: 60
  gpu: h100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace"
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020

.github/CODEOWNERS (vendored, 9 changes)
@ -5,8 +5,8 @@
/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/fused_moe @mgoin
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
@ -25,8 +25,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# vLLM V1
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
@ -45,7 +44,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm

.github/mergify.yml (vendored, 2 changes)
@ -108,7 +108,7 @@ pull_request_rules:
      - files~=^benchmarks/
      - files~=^vllm/benchmarks/
      - files~=^tests/benchmarks/
      - files~=^\.buildkite/performance-benchmarks/
      - files~=^\.buildkite/nightly-benchmarks/
  actions:
    label:
      add:

.gitignore (vendored, 3 changes)
@ -94,9 +94,6 @@ ipython_config.py
# generated files
**/generated/**

# uv
uv.lock

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:

@ -38,18 +38,18 @@ repos:
  rev: 0.9.1
  hooks:
    - id: pip-compile
      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
      files: ^requirements/test\.(in|txt)$
- repo: local
  hooks:
  - id: format-torch-nightly-test
    name: reformat nightly_torch_test.txt to be in sync with test.in
    language: python
    entry: python tools/pre_commit/generate_nightly_torch_test.py
    entry: python tools/generate_nightly_torch_test.py
    files: ^requirements/test\.(in|txt)$
  - id: mypy-local
    name: Run mypy locally for lowest supported Python version
    entry: python tools/pre_commit/mypy.py 0 "3.10"
    name: Run mypy for local Python installation
    entry: python tools/pre_commit/mypy.py 0 "local"
    stages: [pre-commit] # Don't run in CI
    <<: &mypy_common
      language: python
@ -78,12 +78,12 @@ repos:
    stages: [manual] # Only run in CI
  - id: shellcheck
    name: Lint shell scripts
    entry: tools/pre_commit/shellcheck.sh
    entry: tools/shellcheck.sh
    language: script
    types: [shell]
  - id: png-lint
    name: Lint PNG exports from excalidraw
    entry: tools/pre_commit/png-lint.sh
    entry: tools/png-lint.sh
    language: script
    types: [png]
  - id: signoff-commit
@ -100,12 +100,12 @@ repos:
    stages: [commit-msg]
  - id: check-spdx-header
    name: Check SPDX headers
    entry: python tools/pre_commit/check_spdx_header.py
    entry: python tools/check_spdx_header.py
    language: python
    types: [python]
  - id: check-root-lazy-imports
    name: Check root lazy imports
    entry: python tools/pre_commit/check_init_lazy_imports.py
    entry: python tools/check_init_lazy_imports.py
    language: python
    types: [python]
  - id: check-filenames
@ -119,11 +119,11 @@ repos:
    pass_filenames: false
  - id: update-dockerfile-graph
    name: Update Dockerfile dependency graph
    entry: tools/pre_commit/update-dockerfile-graph.sh
    entry: tools/update-dockerfile-graph.sh
    language: script
  - id: enforce-import-regex-instead-of-re
    name: Enforce import regex as re
    entry: python tools/pre_commit/enforce_regex_import.py
    entry: python tools/enforce_regex_import.py
    language: python
    types: [python]
    pass_filenames: false
@ -131,7 +131,7 @@ repos:
  # forbid directly import triton
  - id: forbid-direct-triton-import
    name: "Forbid direct 'import triton'"
    entry: python tools/pre_commit/check_triton_import.py
    entry: python tools/check_triton_import.py
    language: python
    types: [python]
    pass_filenames: false
@ -144,7 +144,7 @@ repos:
    additional_dependencies: [regex]
  - id: validate-config
    name: Validate configuration has default values and that each field has a docstring
    entry: python tools/pre_commit/validate_config.py
    entry: python tools/validate_config.py
    language: python
    additional_dependencies: [regex]
  # Keep `suggestion` last

@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 | 
			
		||||
# requirements.txt files and should be kept consistent.  The ROCm torch
 | 
			
		||||
# versions are derived from docker/Dockerfile.rocm
 | 
			
		||||
#
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
 | 
			
		||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Try to find python package with an executable that exactly matches
 | 
			
		||||
@ -883,7 +883,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 | 
			
		||||
set(VLLM_MOE_EXT_SRC
 | 
			
		||||
  "csrc/moe/torch_bindings.cpp"
 | 
			
		||||
  "csrc/moe/moe_align_sum_kernels.cu"
 | 
			
		||||
  "csrc/moe/moe_lora_align_sum_kernels.cu"
 | 
			
		||||
  "csrc/moe/topk_softmax_kernels.cu")
 | 
			
		||||
 | 
			
		||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
 | 
			
		||||
 | 
			
		||||
@ -21,7 +21,6 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 | 
			
		||||
 | 
			
		||||
*Latest News* 🔥
 | 
			
		||||
 | 
			
		||||
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 | 
			
		||||
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 | 
			
		||||
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 | 
			
		||||
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 | 
			
		||||
 | 
			
		||||
@ -5,7 +5,7 @@ import gc
 | 
			
		||||
from benchmark_utils import TimeCollector
 | 
			
		||||
from tabulate import tabulate
 | 
			
		||||
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.v1.core.block_pool import BlockPool
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -46,7 +46,7 @@ import time
 | 
			
		||||
 | 
			
		||||
from vllm import LLM, SamplingParams
 | 
			
		||||
from vllm.engine.arg_utils import EngineArgs
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
 | 
			
		||||
 | 
			
		||||
@ -19,7 +19,7 @@ from vllm.config import (
 | 
			
		||||
    VllmConfig,
 | 
			
		||||
)
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 | 
			
		||||
from vllm.v1.worker.gpu_input_batch import InputBatch
 | 
			
		||||
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 | 
			
		||||
 | 
			
		||||
@ -37,7 +37,7 @@ from transformers import PreTrainedTokenizerBase
 | 
			
		||||
 | 
			
		||||
from vllm import LLM, SamplingParams
 | 
			
		||||
from vllm.engine.arg_utils import EngineArgs
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    from vllm.transformers_utils.tokenizer import get_tokenizer
 | 
			
		||||
 | 
			
		||||
@ -11,7 +11,7 @@ import time
 | 
			
		||||
from transformers import AutoTokenizer, PreTrainedTokenizerBase
 | 
			
		||||
 | 
			
		||||
from vllm.engine.arg_utils import EngineArgs
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Select a equi-probable random priority
 | 
			
		||||
 | 
			
		||||
@ -51,7 +51,7 @@ except ImportError:
 | 
			
		||||
    from backend_request_func import get_tokenizer
 | 
			
		||||
 | 
			
		||||
try:
 | 
			
		||||
    from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
    from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
except ImportError:
 | 
			
		||||
    from argparse import ArgumentParser as FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -15,7 +15,7 @@ from utils import make_rand_sparse_tensors
 | 
			
		||||
from weight_shapes import WEIGHT_SHAPES
 | 
			
		||||
 | 
			
		||||
from vllm import _custom_ops as ops
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 | 
			
		||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
 | 
			
		||||
 | 
			
		||||
@ -18,8 +18,7 @@ from vllm import _custom_ops as ops
 | 
			
		||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 | 
			
		||||
    w8a8_triton_block_scaled_mm,
 | 
			
		||||
)
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils.math_utils import cdiv
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser, cdiv
 | 
			
		||||
 | 
			
		||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 | 
			
		||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
 | 
			
		||||
 | 
			
		||||
@ -10,7 +10,7 @@ import torch
 | 
			
		||||
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 | 
			
		||||
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 | 
			
		||||
from vllm.triton_utils import triton
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -10,7 +10,7 @@ import vllm.model_executor.layers.activation  # noqa F401
 | 
			
		||||
from vllm.model_executor.custom_op import CustomOp
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.triton_utils import triton
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 | 
			
		||||
 | 
			
		||||
batch_size_range = [1, 16, 32, 64, 128]
 | 
			
		||||
 | 
			
		||||
@ -28,7 +28,7 @@ except ImportError as e:
 | 
			
		||||
 | 
			
		||||
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
 | 
			
		||||
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
parser = FlexibleArgumentParser(
 | 
			
		||||
    description="Benchmark BitBLAS int4 on a specific target."
 | 
			
		||||
 | 
			
		||||
@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 | 
			
		||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 | 
			
		||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 | 
			
		||||
from vllm.scalar_type import scalar_types
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
WEIGHT_SHAPES_MOE = {
 | 
			
		||||
    "nvidia/DeepSeek-R1-FP4": [
 | 
			
		||||
 | 
			
		||||
@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
 | 
			
		||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 | 
			
		||||
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
# Weight shapes for different models: [num_experts, topk, hidden_size,
 | 
			
		||||
# intermediate_size]
 | 
			
		||||
 | 
			
		||||
@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
 | 
			
		||||
)
 | 
			
		||||
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 | 
			
		||||
from vllm.logger import init_logger
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
logger = init_logger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
 | 
			
		||||
    fused_experts,
 | 
			
		||||
    fused_topk,
 | 
			
		||||
)
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
DEFAULT_MODELS = [
 | 
			
		||||
    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
 | 
			
		||||
 | 
			
		||||
@ -7,7 +7,7 @@ import torch
 | 
			
		||||
 | 
			
		||||
from vllm.model_executor.layers.layernorm import RMSNorm
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -25,7 +25,7 @@ if HAS_TRITON:
 | 
			
		||||
    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
 | 
			
		||||
    from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
 | 
			
		||||
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 | 
			
		||||
DEFAULT_TP_SIZES = [1]
 | 
			
		||||
 | 
			
		||||
@ -33,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 | 
			
		||||
    quantize_weights,
 | 
			
		||||
)
 | 
			
		||||
from vllm.scalar_type import ScalarType, scalar_types
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
 | 
			
		||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
 | 
			
		||||
 | 
			
		||||
@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 | 
			
		||||
    sort_weights,
 | 
			
		||||
)
 | 
			
		||||
from vllm.scalar_type import ScalarType, scalar_types
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
 | 
			
		||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
 | 
			
		||||
 | 
			
		||||
@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.transformers_utils.config import get_config
 | 
			
		||||
from vllm.triton_utils import triton
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
FP8_DTYPE = current_platform.fp8_dtype()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 | 
			
		||||
)
 | 
			
		||||
from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
FP8_DTYPE = current_platform.fp8_dtype()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -39,7 +39,7 @@ import torch
 | 
			
		||||
from vllm.model_executor.layers.rotary_embedding import get_rope
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.transformers_utils.config import get_config
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -9,7 +9,7 @@ import torch
 | 
			
		||||
from vllm import _custom_ops as ops
 | 
			
		||||
from vllm.logger import init_logger
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils.torch_utils import (
 | 
			
		||||
    STR_DTYPE_TO_TORCH_DTYPE,
 | 
			
		||||
    create_kv_caches_with_random,
 | 
			
		||||
 | 
			
		||||
@ -7,7 +7,7 @@ import torch
 | 
			
		||||
 | 
			
		||||
from vllm import _custom_ops as ops
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -9,7 +9,7 @@ from tabulate import tabulate
 | 
			
		||||
from vllm import _custom_ops as ops
 | 
			
		||||
from vllm.logger import init_logger
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils.torch_utils import (
 | 
			
		||||
    STR_DTYPE_TO_TORCH_DTYPE,
 | 
			
		||||
    create_kv_caches_with_random,
 | 
			
		||||
 | 
			
		||||
@ -12,7 +12,7 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import (
 | 
			
		||||
)
 | 
			
		||||
from vllm.logger import init_logger
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils.torch_utils import (
 | 
			
		||||
    STR_DTYPE_TO_TORCH_DTYPE,
 | 
			
		||||
    create_kv_caches_with_random_flash,
 | 
			
		||||
 | 
			
		||||
@ -8,7 +8,7 @@ import torch
 | 
			
		||||
 | 
			
		||||
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding, get_rope
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def benchmark_rope_kernels_multi_lora(
 | 
			
		||||
 | 
			
		||||
@ -8,7 +8,7 @@ from datetime import datetime
 | 
			
		||||
import flashinfer
 | 
			
		||||
import torch
 | 
			
		||||
 | 
			
		||||
from vllm.utils.math_utils import round_up
 | 
			
		||||
from vllm.utils import round_up
 | 
			
		||||
 | 
			
		||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 | 
			
		||||
FP8_DTYPE = torch.float8_e4m3fn
 | 
			
		||||
 | 
			
		||||
@ -8,7 +8,7 @@ from datetime import datetime
 | 
			
		||||
import flashinfer
 | 
			
		||||
import torch
 | 
			
		||||
 | 
			
		||||
from vllm.utils.math_utils import round_up
 | 
			
		||||
from vllm.utils import round_up
 | 
			
		||||
 | 
			
		||||
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 | 
			
		||||
FP8_DTYPE = torch.float8_e4m3fn
 | 
			
		||||
 | 
			
		||||
@ -18,7 +18,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 | 
			
		||||
)
 | 
			
		||||
from vllm.platforms import current_platform
 | 
			
		||||
from vllm.triton_utils import triton
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
mp.set_start_method("spawn", force=True)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -11,7 +11,7 @@ import regex as re
 | 
			
		||||
import seaborn as sns
 | 
			
		||||
from torch.utils.benchmark import Measurement as TMeasurement
 | 
			
		||||
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    parser = FlexibleArgumentParser(
 | 
			
		||||
 | 
			
		||||
@ -5,7 +5,7 @@ import cProfile
 | 
			
		||||
import pstats
 | 
			
		||||
 | 
			
		||||
from vllm import LLM, SamplingParams
 | 
			
		||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
 | 
			
		||||
from vllm.utils import FlexibleArgumentParser
 | 
			
		||||
 | 
			
		||||
# A very long prompt, total number of tokens is about 15k.
 | 
			
		||||
LONG_PROMPT = ["You are an expert in large language models, aren't you?"] * 1000
 | 
			
		||||
 | 
			
		||||
@ -188,60 +188,16 @@ else()
 | 
			
		||||
    message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
 | 
			
		||||
# Flag to enable ACL kernels for AARCH64 platforms
 | 
			
		||||
if (VLLM_BUILD_ACL STREQUAL "ON")
 | 
			
		||||
    set(USE_ACL ON)
 | 
			
		||||
else()
 | 
			
		||||
    set(USE_ACL OFF)
 | 
			
		||||
endif()
 | 
			
		||||
 | 
			
		||||
# Build oneDNN for GEMM kernels (only for x86-AVX512 /ARM platforms)
 | 
			
		||||
if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
 | 
			
		||||
    # Fetch and build Arm Compute Library (ACL) as oneDNN's backend for AArch64
 | 
			
		||||
    # TODO [fadara01]: remove this once ACL can be fetched and built automatically as a dependency of oneDNN
 | 
			
		||||
    if(ASIMD_FOUND)
 | 
			
		||||
        if(DEFINED ENV{ACL_ROOT_DIR} AND IS_DIRECTORY "$ENV{ACL_ROOT_DIR}")
 | 
			
		||||
            message(STATUS "Using ACL from specified source directory: $ENV{ACL_ROOT_DIR}")
 | 
			
		||||
        else()
 | 
			
		||||
            message(STATUS "Downloading Arm Compute Library (ACL) from GitHub")
 | 
			
		||||
            FetchContent_Populate(arm_compute
 | 
			
		||||
                SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
 | 
			
		||||
                SOURCE_DIR   "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
 | 
			
		||||
                GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
 | 
			
		||||
                GIT_TAG        v52.2.0
 | 
			
		||||
                GIT_SHALLOW    TRUE
 | 
			
		||||
                GIT_PROGRESS   TRUE
 | 
			
		||||
            )
 | 
			
		||||
            set(ENV{ACL_ROOT_DIR} "${arm_compute_SOURCE_DIR}")
 | 
			
		||||
        endif()
 | 
			
		||||
 | 
			
		||||
        # Build ACL with scons
 | 
			
		||||
        include(ProcessorCount)
 | 
			
		||||
        ProcessorCount(_NPROC)
 | 
			
		||||
        set(_scons_cmd
 | 
			
		||||
        scons -j${_NPROC}
 | 
			
		||||
            Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
 | 
			
		||||
            arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
 | 
			
		||||
            multi_isa=1 openmp=1 cppthreads=0
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        # locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
 | 
			
		||||
        # and create a local shim dir with it
 | 
			
		||||
        include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
 | 
			
		||||
        vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
 | 
			
		||||
 | 
			
		||||
        if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
 | 
			
		||||
            list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
 | 
			
		||||
        endif()
 | 
			
		||||
 | 
			
		||||
        execute_process(
 | 
			
		||||
            COMMAND ${_scons_cmd}
 | 
			
		||||
            WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
 | 
			
		||||
            RESULT_VARIABLE _acl_rc
 | 
			
		||||
        )
 | 
			
		||||
        if(NOT _acl_rc EQUAL 0)
 | 
			
		||||
            message(FATAL_ERROR "ACL SCons build failed (exit ${_acl_rc}).")
 | 
			
		||||
        endif()
 | 
			
		||||
 | 
			
		||||
        set(ONEDNN_AARCH64_USE_ACL "ON")
 | 
			
		||||
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
 | 
			
		||||
        add_compile_definitions(VLLM_USE_ACL)
 | 
			
		||||
    endif()
 | 
			
		||||
 | 
			
		||||
    set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.")
 | 
			
		||||
 | 
			
		||||
    if(FETCHCONTENT_SOURCE_DIR_ONEDNN)
 | 
			
		||||
@ -261,6 +217,16 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
 | 
			
		||||
        )
 | 
			
		||||
    endif()
 | 
			
		||||
 | 
			
		||||
    if(USE_ACL)
 | 
			
		||||
        find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/)
 | 
			
		||||
        if(NOT ARM_COMPUTE_LIBRARY)
 | 
			
		||||
            message(FATAL_ERROR "Could not find ARM Compute Library: please set ACL_ROOT_DIR")
 | 
			
		||||
        endif()
 | 
			
		||||
        set(ONEDNN_AARCH64_USE_ACL "ON")
 | 
			
		||||
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
 | 
			
		||||
        add_compile_definitions(VLLM_USE_ACL)
 | 
			
		||||
    endif()
 | 
			
		||||
 | 
			
		||||
    set(ONEDNN_LIBRARY_TYPE "STATIC")
 | 
			
		||||
    set(ONEDNN_BUILD_DOC "OFF")
 | 
			
		||||
    set(ONEDNN_BUILD_EXAMPLES "OFF")
 | 
			
		||||
 | 
			
		||||
@ -19,7 +19,7 @@ else()
 | 
			
		||||
  FetchContent_Declare(
 | 
			
		||||
        flashmla
 | 
			
		||||
        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
 | 
			
		||||
        GIT_TAG 46d64a8ebef03fa50b4ae74937276a5c940e3f95
 | 
			
		||||
        GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
 | 
			
		||||
        GIT_PROGRESS TRUE
 | 
			
		||||
        CONFIGURE_COMMAND ""
 | 
			
		||||
        BUILD_COMMAND ""
 | 
			
		||||
@ -66,7 +66,6 @@ if(FLASH_MLA_ARCHS)
 | 
			
		||||
        ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
 | 
			
		||||
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
 | 
			
		||||
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
 | 
			
		||||
        ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_metadata.cu
 | 
			
		||||
    )
 | 
			
		||||
 | 
			
		||||
    set(FlashMLA_INCLUDES
 | 
			
		||||
 | 
			
		||||
@ -129,44 +129,6 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
 | 
			
		||||
  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
 | 
			
		||||
endfunction()
 | 
			
		||||
 | 
			
		||||
# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
 | 
			
		||||
#   libgomp.so    -> libgomp-<hash>.so...
 | 
			
		||||
#   libgomp.so.1  -> libgomp-<hash>.so...
 | 
			
		||||
# OUTPUT: TORCH_GOMP_SHIM_DIR  ("" if not found)
 | 
			
		||||
function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
 | 
			
		||||
  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)
 | 
			
		||||
 | 
			
		||||
  # Use run_python to locate vendored libgomp; never throw on failure.
 | 
			
		||||
  run_python(_VLLM_TORCH_GOMP_PATH
 | 
			
		||||
    "
 | 
			
		||||
import os, glob
 | 
			
		||||
try:
 | 
			
		||||
  import torch
 | 
			
		||||
  torch_pkg = os.path.dirname(torch.__file__)
 | 
			
		||||
  site_root = os.path.dirname(torch_pkg)
 | 
			
		||||
  torch_libs = os.path.join(site_root, 'torch.libs')
 | 
			
		||||
  print(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*'))[0])
 | 
			
		||||
except:
 | 
			
		||||
  print('')
 | 
			
		||||
"
 | 
			
		||||
    "failed to probe torch.libs for libgomp")
 | 
			
		||||
 | 
			
		||||
  if(_VLLM_TORCH_GOMP_PATH STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
 | 
			
		||||
    return()
 | 
			
		||||
  endif()
 | 
			
		||||
 | 
			
		||||
  # Create shim under the build tree
 | 
			
		||||
  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
 | 
			
		||||
  file(MAKE_DIRECTORY "${_shim}")
 | 
			
		||||
 | 
			
		||||
  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so")
 | 
			
		||||
  execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/libgomp.so.1")
 | 
			
		||||
  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so")
 | 
			
		||||
  execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/libgomp.so.1")
 | 
			
		||||
 | 
			
		||||
  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
 | 
			
		||||
endfunction()
 | 
			
		||||
 | 
			
		||||
# Macro for converting a `gencode` version number to a cmake version number.
 | 
			
		||||
macro(string_to_ver OUT_VER IN_STR)
 | 
			
		||||
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
 | 
			
		||||
 | 
			
		||||
@ -187,8 +187,7 @@ template <>
 | 
			
		||||
struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
 | 
			
		||||
  size_t operator()(
 | 
			
		||||
      const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
 | 
			
		||||
    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size) ^
 | 
			
		||||
           hash<int>()(static_cast<int>(val.b_type));
 | 
			
		||||
    return hash<dnnl_dim_t>()(val.b_n_size) ^ hash<dnnl_dim_t>()(val.b_k_size);
 | 
			
		||||
  }
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
@ -217,8 +216,7 @@ bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
 | 
			
		||||
 | 
			
		||||
bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
 | 
			
		||||
                const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
 | 
			
		||||
  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size &&
 | 
			
		||||
         l.b_type == r.b_type;
 | 
			
		||||
  return l.b_n_size == r.b_n_size && l.b_k_size == r.b_k_size;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
 | 
			
		||||
@ -495,10 +493,8 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) {
 | 
			
		||||
dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
 | 
			
		||||
    const MSizeCacheKey& key) {
 | 
			
		||||
  if (m_size_cache_.get() == nullptr) {
 | 
			
		||||
    ClassMatmulCacheKey class_key = {
 | 
			
		||||
        .b_n_size = b_n_size_, .b_k_size = b_k_size_, .b_type = b_type_};
 | 
			
		||||
    m_size_cache_ =
 | 
			
		||||
        get_matul_class_primitive_cache(class_key, primitive_cache_size_);
 | 
			
		||||
    ClassMatmulCacheKey key = {.b_n_size = b_n_size_, .b_k_size = b_k_size_};
 | 
			
		||||
    m_size_cache_ = get_matul_class_primitive_cache(key, primitive_cache_size_);
 | 
			
		||||
  }
 | 
			
		||||
  return m_size_cache_->get_or_create(key, [&]() {
 | 
			
		||||
    dnnl::matmul::primitive_desc desc = this->create_primitive_desc(key, false);
 | 
			
		||||
 | 
			
		||||
@ -199,7 +199,6 @@ class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
 | 
			
		||||
  struct ClassMatmulCacheKey {
 | 
			
		||||
    dnnl_dim_t b_n_size;
 | 
			
		||||
    dnnl_dim_t b_k_size;
 | 
			
		||||
    dnnl::memory::data_type b_type;
 | 
			
		||||
 | 
			
		||||
    friend bool operator==(const ClassMatmulCacheKey& l,
 | 
			
		||||
                           const ClassMatmulCacheKey& r);
 | 
			
		||||
 | 
			
		||||
@ -1,169 +0,0 @@
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <time.h>
 | 
			
		||||
#include <torch/all.h>
 | 
			
		||||
#include <ATen/cuda/CUDAContext.h>
 | 
			
		||||
#include <c10/cuda/CUDAGuard.h>
 | 
			
		||||
 | 
			
		||||
#include <ATen/ATen.h>
 | 
			
		||||
#include <ATen/cuda/Atomic.cuh>
 | 
			
		||||
 | 
			
		||||
#include "../cuda_compat.h"
 | 
			
		||||
#include "../dispatch_utils.h"
 | 
			
		||||
#include "core/math.hpp"
 | 
			
		||||
 | 
			
		||||
namespace {
 | 
			
		||||
 | 
			
		||||
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
 | 
			
		||||
                                         int32_t col) {
 | 
			
		||||
  return row * total_col + col;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}  // namespace
 | 
			
		||||
 | 
			
		||||
// TODO: Refactor common parts with moe_align_sum_kernels
 | 
			
		||||
template <typename scalar_t, typename token_cnts_t>
 | 
			
		||||
__global__ void moe_lora_align_sum_kernel(
 | 
			
		||||
    scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping,
 | 
			
		||||
    int64_t block_size, int num_experts, int max_loras, size_t numel,
 | 
			
		||||
    int max_num_tokens_padded, int max_num_m_blocks,
 | 
			
		||||
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
 | 
			
		||||
    int topk_num, int32_t* total_tokens_post_pad) {
 | 
			
		||||
  const size_t tokens_per_thread = div_ceil(numel, blockDim.x);
 | 
			
		||||
  const size_t start_idx = threadIdx.x * tokens_per_thread;
 | 
			
		||||
 | 
			
		||||
  int lora_id = blockIdx.x;
 | 
			
		||||
  extern __shared__ int32_t shared_mem[];
 | 
			
		||||
  int32_t* cumsum = shared_mem;
 | 
			
		||||
  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1);
 | 
			
		||||
 | 
			
		||||
  // Initialize sorted_token_ids with numel
 | 
			
		||||
  for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) {
 | 
			
		||||
    sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Initialize expert_ids with -1
 | 
			
		||||
  for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) {
 | 
			
		||||
    expert_ids[lora_id * max_num_m_blocks + it] = -1;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Initialize total_tokens_post_pad with 0
 | 
			
		||||
  if (threadIdx.x == 0) {
 | 
			
		||||
    total_tokens_post_pad[lora_id] = 0;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for (int i = 0; i < num_experts; ++i) {
 | 
			
		||||
    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
 | 
			
		||||
    int mask = token_lora_mapping[i / topk_num] == lora_id;
 | 
			
		||||
    int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]);
 | 
			
		||||
    tokens_cnts[idx] += mask;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  __syncthreads();
 | 
			
		||||
 | 
			
		||||
  // For each expert we accumulate the token counts from the different threads.
 | 
			
		||||
  if (threadIdx.x < num_experts) {
 | 
			
		||||
    tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
 | 
			
		||||
    for (int i = 1; i <= blockDim.x; ++i) {
 | 
			
		||||
      tokens_cnts[index(num_experts, i, threadIdx.x)] +=
 | 
			
		||||
          tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  __syncthreads();
 | 
			
		||||
 | 
			
		||||
  // We accumulate the token counts of all experts in thread 0.
 | 
			
		||||
  if (threadIdx.x == 0) {
 | 
			
		||||
    cumsum[0] = 0;
 | 
			
		||||
    for (int i = 1; i <= num_experts; ++i) {
 | 
			
		||||
      cumsum[i] = cumsum[i - 1] +
 | 
			
		||||
                  div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
 | 
			
		||||
                           block_size) *
 | 
			
		||||
                      block_size;
 | 
			
		||||
    }
 | 
			
		||||
    total_tokens_post_pad[lora_id] = static_cast<int32_t>(cumsum[num_experts]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  __syncthreads();
 | 
			
		||||
 | 
			
		||||
  /**
 | 
			
		||||
   * For each expert, each thread processes the tokens of the corresponding
 | 
			
		||||
   * blocks and stores the corresponding expert_id for each block.
 | 
			
		||||
   */
 | 
			
		||||
  if (threadIdx.x < num_experts) {
 | 
			
		||||
    for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
 | 
			
		||||
         i += block_size) {
 | 
			
		||||
      expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] =
 | 
			
		||||
          threadIdx.x;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
 | 
			
		||||
    int32_t expert_id = topk_ids[i];
 | 
			
		||||
    /** The cumsum[expert_id] stores the starting index of the tokens that the
 | 
			
		||||
     * expert with expert_id needs to process, and
 | 
			
		||||
     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
 | 
			
		||||
     * processed by the expert with expert_id within the current thread's token
 | 
			
		||||
     * shard.
 | 
			
		||||
     */
 | 
			
		||||
    int32_t rank_post_pad =
 | 
			
		||||
        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
 | 
			
		||||
        cumsum[expert_id];
 | 
			
		||||
 | 
			
		||||
    int mask = (int)token_lora_mapping[i / topk_num] == lora_id;
 | 
			
		||||
    atomicAdd(
 | 
			
		||||
        &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)],
 | 
			
		||||
        (i - numel) * mask);
 | 
			
		||||
    tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask;
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void moe_lora_align_block_size(torch::Tensor topk_ids,
 | 
			
		||||
                               torch::Tensor token_lora_mapping,
 | 
			
		||||
                               int64_t num_experts, int64_t block_size,
 | 
			
		||||
                               int64_t max_loras, int64_t max_num_tokens_padded,
 | 
			
		||||
                               int64_t max_num_m_blocks,
 | 
			
		||||
                               torch::Tensor sorted_token_ids,
 | 
			
		||||
                               torch::Tensor expert_ids,
 | 
			
		||||
                               torch::Tensor num_tokens_post_pad) {
 | 
			
		||||
  const int topk_num = topk_ids.size(1);
 | 
			
		||||
 | 
			
		||||
  TORCH_CHECK(block_size > 0, "block_size should be greater than 0. ");
 | 
			
		||||
 | 
			
		||||
  int device_max_shared_mem;
 | 
			
		||||
  auto dev = topk_ids.get_device();
 | 
			
		||||
  cudaDeviceGetAttribute(&device_max_shared_mem,
 | 
			
		||||
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
 | 
			
		||||
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 | 
			
		||||
 | 
			
		||||
  const int32_t num_thread = max((int32_t)num_experts, 128);  // WARP_SIZE,
 | 
			
		||||
  TORCH_CHECK(num_thread <= 1024,
 | 
			
		||||
              "num_thread must be less than 1024, "
 | 
			
		||||
              "and fallback is not implemented yet.");
 | 
			
		||||
  const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) +
 | 
			
		||||
                             (num_experts + 1) * sizeof(int32_t);
 | 
			
		||||
 | 
			
		||||
  if (shared_mem > device_max_shared_mem) {
 | 
			
		||||
    TORCH_CHECK(false,
 | 
			
		||||
                "Shared memory usage exceeds device limit, and global memory "
 | 
			
		||||
                "fallback is not implemented yet.");
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  VLLM_DISPATCH_INTEGRAL_TYPES(
 | 
			
		||||
      topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] {
 | 
			
		||||
        dim3 blockDim(num_thread);
 | 
			
		||||
        auto kernel = moe_lora_align_sum_kernel<scalar_t, int32_t>;
 | 
			
		||||
        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
 | 
			
		||||
            (void*)kernel, shared_mem));
 | 
			
		||||
        kernel<<<max_loras, blockDim, shared_mem, stream>>>(
 | 
			
		||||
            topk_ids.data_ptr<scalar_t>(),
 | 
			
		||||
            token_lora_mapping.data_ptr<int32_t>(), block_size, num_experts,
 | 
			
		||||
            max_loras, topk_ids.numel(), max_num_tokens_padded,
 | 
			
		||||
            max_num_m_blocks, sorted_token_ids.data_ptr<int32_t>(),
 | 
			
		||||
            expert_ids.data_ptr<int32_t>(), topk_num,
 | 
			
		||||
            num_tokens_post_pad.data_ptr<int32_t>());
 | 
			
		||||
      });
 | 
			
		||||
}
 | 
			
		||||
@ -20,14 +20,6 @@ void batched_moe_align_block_size(int64_t max_tokens_per_batch,
 | 
			
		||||
                                  torch::Tensor expert_ids,
 | 
			
		||||
                                  torch::Tensor num_tokens_post_pad);
 | 
			
		||||
 | 
			
		||||
void moe_lora_align_block_size(torch::Tensor topk_ids,
 | 
			
		||||
                               torch::Tensor token_lora_mapping,
 | 
			
		||||
                               int64_t num_experts, int64_t block_size,
 | 
			
		||||
                               int64_t max_loras, int64_t max_num_tokens_padded,
 | 
			
		||||
                               int64_t max_num_m_blocks,
 | 
			
		||||
                               torch::Tensor sorted_token_ids,
 | 
			
		||||
                               torch::Tensor expert_ids,
 | 
			
		||||
                               torch::Tensor num_tokens_post_pad);
 | 
			
		||||
#ifndef USE_ROCM
 | 
			
		||||
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
 | 
			
		||||
                             torch::Tensor b_qweight, torch::Tensor b_scales,
 | 
			
		||||
 | 
			
		||||
@ -33,20 +33,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 | 
			
		||||
  m.impl("batched_moe_align_block_size", torch::kCUDA,
 | 
			
		||||
         &batched_moe_align_block_size);
 | 
			
		||||
 | 
			
		||||
  // Aligning the number of tokens to be processed by each expert such
 | 
			
		||||
  // that it is divisible by the block size.
 | 
			
		||||
  m.def(
 | 
			
		||||
      "moe_lora_align_block_size(Tensor topk_ids,"
 | 
			
		||||
      "                     Tensor token_lora_mapping,"
 | 
			
		||||
      "                     int num_experts,"
 | 
			
		||||
      "                     int block_size, int max_loras, "
 | 
			
		||||
      "                     int max_num_tokens_padded, "
 | 
			
		||||
      "                     int max_num_m_blocks, "
 | 
			
		||||
      "                     Tensor !sorted_token_ids,"
 | 
			
		||||
      "                     Tensor !experts_ids,"
 | 
			
		||||
      "                     Tensor !num_tokens_post_pad) -> () ");
 | 
			
		||||
  m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size);
 | 
			
		||||
 | 
			
		||||
#ifndef USE_ROCM
 | 
			
		||||
  m.def(
 | 
			
		||||
      "moe_wna16_gemm(Tensor input, Tensor! output, Tensor b_qweight, "
 | 
			
		||||
 | 
			
		||||
@ -99,11 +99,8 @@ void apply_repetition_penalties_(torch::Tensor& logits,
 | 
			
		||||
 | 
			
		||||
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
 | 
			
		||||
                   const torch::Tensor& rowEnds, torch::Tensor& indices,
 | 
			
		||||
                   int64_t numRows, int64_t stride0, int64_t stride1);
 | 
			
		||||
 | 
			
		||||
void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
 | 
			
		||||
                          const torch::Tensor& seq_lens, torch::Tensor& indices,
 | 
			
		||||
                          int64_t numRows, int64_t stride0, int64_t stride1);
 | 
			
		||||
                   torch::Tensor& values, int64_t numRows, int64_t stride0,
 | 
			
		||||
                   int64_t stride1);
 | 
			
		||||
 | 
			
		||||
void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input,
 | 
			
		||||
                               torch::Tensor& weight, torch::Tensor& scale,
 | 
			
		||||
@ -307,7 +304,7 @@ void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
 | 
			
		||||
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
 | 
			
		||||
                        torch::Tensor b_gptq_qzeros,
 | 
			
		||||
                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
 | 
			
		||||
                        bool use_exllama, bool use_v2_format, int64_t bit);
 | 
			
		||||
                        bool use_exllama, int64_t bit);

void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);

@@ -185,7 +185,7 @@ typedef void (*fp_gemm_half_q_half_gptq_kernel)(const half*, const uint32_t*,
                                                const uint32_t*, const half*,
                                                half*, const int, const int,
                                                const int, const int,
                                                const bool, const int*);
                                                const int*);

template <bool first_block, int m_count>
__global__ void gemm_half_q_half_gptq_4bit_kernel(
@@ -193,15 +193,12 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
    const uint32_t* __restrict__ b_gptq_qzeros,
    const half* __restrict__ b_gptq_scales, half* __restrict__ c,
    const int size_m, const int size_n, const int size_k, const int groups,
    const bool use_v2_format, const int* __restrict__ b_q_perm) {
    const int* __restrict__ b_q_perm) {
  MatrixView_half a_(a, size_m, size_k);
  MatrixView_half_rw c_(c, size_m, size_n);
  MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);

  // GPTQv2 and GPTQv1 handles zero points differently
  int zero_offset = use_v2_format ? 0 : 1;

  auto t = threadIdx.x;

  // Block
@@ -259,10 +256,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
  half2 y1y16[4][2];
  b_gptq_qzeros_.item4(zeros, group, n);
  b_gptq_scales_.item4_f(scales, group, n);
  dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
  dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
  dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
  dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
  dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
  dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
  dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
  dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);

  // Column result
  float block_c[m_count][4] = {};
@@ -275,10 +272,10 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
      nextgroup += groupsize;
      b_gptq_qzeros_.item4(zeros, group, n);
      b_gptq_scales_.item4_f(scales, group, n);
      dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
      dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
      dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
      dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
      dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
      dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
      dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
      dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
    }
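The repeated dequant_4bit_8_prep_zero hunks above (and the matching 2-, 3- and 8-bit hunks below) all make the same change: the zero-point offset goes back to a hard-coded +1 instead of being taken from the removed use_v2_format flag. A minimal NumPy sketch of the arithmetic involved, with illustrative values only (not vLLM code):

import numpy as np

def dequant_group(q_codes, zero, scale, v1_format=True):
    # GPTQv1 checkpoints store the zero point minus one, so the kernels add +1;
    # the deleted GPTQv2 path (use_v2_format) added 0 instead.
    zero_offset = 1 if v1_format else 0
    return scale * (q_codes.astype(np.float32) - (zero + zero_offset))

q = np.array([0, 3, 7, 15])        # illustrative 4-bit codes
print(dequant_group(q, zero=7, scale=0.01))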
#pragma unroll
 | 
			
		||||
@ -332,15 +329,12 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
 | 
			
		||||
    const uint32_t* __restrict__ b_gptq_qzeros,
 | 
			
		||||
    const half* __restrict__ b_gptq_scales, half* __restrict__ c,
 | 
			
		||||
    const int size_m, const int size_n, const int size_k, const int groups,
 | 
			
		||||
    const bool use_v2_format, const int* __restrict__ b_q_perm) {
 | 
			
		||||
    const int* __restrict__ b_q_perm) {
 | 
			
		||||
  MatrixView_half a_(a, size_m, size_k);
 | 
			
		||||
  MatrixView_half_rw c_(c, size_m, size_n);
 | 
			
		||||
  MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
 | 
			
		||||
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  auto t = threadIdx.x;
 | 
			
		||||
 | 
			
		||||
  // Block
 | 
			
		||||
@ -415,10 +409,10 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
 | 
			
		||||
      int4 load_int4 = *b_ptr4;
 | 
			
		||||
 | 
			
		||||
      half2 dq[4][8];
 | 
			
		||||
      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
 | 
			
		||||
      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
 | 
			
		||||
      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
 | 
			
		||||
      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);
 | 
			
		||||
      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
 | 
			
		||||
      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
 | 
			
		||||
      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
 | 
			
		||||
      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
 | 
			
		||||
 | 
			
		||||
#pragma unroll
 | 
			
		||||
      for (int m = 0; m < m_count; m++) {
 | 
			
		||||
@ -454,15 +448,12 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
 | 
			
		||||
    const uint32_t* __restrict__ b_gptq_qzeros,
 | 
			
		||||
    const half* __restrict__ b_gptq_scales, half* __restrict__ c,
 | 
			
		||||
    const int size_m, const int size_n, const int size_k, const int groups,
 | 
			
		||||
    const bool use_v2_format, const int* __restrict__ b_q_perm) {
 | 
			
		||||
    const int* __restrict__ b_q_perm) {
 | 
			
		||||
  MatrixView_half a_(a, size_m, size_k);
 | 
			
		||||
  MatrixView_half_rw c_(c, size_m, size_n);
 | 
			
		||||
  MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
 | 
			
		||||
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  auto t = threadIdx.x;
 | 
			
		||||
 | 
			
		||||
  // Block
 | 
			
		||||
@ -543,13 +534,13 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
 | 
			
		||||
 | 
			
		||||
      half2 dq[4][16];
 | 
			
		||||
      dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
 | 
			
		||||
                      size_n, zeros[0] + zero_offset);
 | 
			
		||||
                      size_n, zeros[0] + 1);
 | 
			
		||||
      dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
 | 
			
		||||
                      size_n, zeros[1] + zero_offset);
 | 
			
		||||
                      size_n, zeros[1] + 1);
 | 
			
		||||
      dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
 | 
			
		||||
                      size_n, zeros[2] + zero_offset);
 | 
			
		||||
                      size_n, zeros[2] + 1);
 | 
			
		||||
      dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
 | 
			
		||||
                      size_n, zeros[3] + zero_offset);
 | 
			
		||||
                      size_n, zeros[3] + 1);
 | 
			
		||||
 | 
			
		||||
#pragma unroll
 | 
			
		||||
      for (int m = 0; m < m_count; m++) {
 | 
			
		||||
@ -583,15 +574,12 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
 | 
			
		||||
    const uint32_t* __restrict__ b_gptq_qzeros,
 | 
			
		||||
    const half* __restrict__ b_gptq_scales, half* __restrict__ c,
 | 
			
		||||
    const int size_m, const int size_n, const int size_k, const int groups,
 | 
			
		||||
    const bool use_v2_format, const int* __restrict__ b_q_perm) {
 | 
			
		||||
    const int* __restrict__ b_q_perm) {
 | 
			
		||||
  MatrixView_half a_(a, size_m, size_k);
 | 
			
		||||
  MatrixView_half_rw c_(c, size_m, size_n);
 | 
			
		||||
  MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
 | 
			
		||||
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  auto t = threadIdx.x;
 | 
			
		||||
 | 
			
		||||
  // Block
 | 
			
		||||
@ -670,13 +658,13 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
 | 
			
		||||
 | 
			
		||||
      half2 dq[4][4];
 | 
			
		||||
      dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
 | 
			
		||||
                     zeros[0] + zero_offset);
 | 
			
		||||
                     zeros[0] + 1);
 | 
			
		||||
      dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
 | 
			
		||||
                     zeros[1] + zero_offset);
 | 
			
		||||
                     zeros[1] + 1);
 | 
			
		||||
      dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
 | 
			
		||||
                     zeros[2] + zero_offset);
 | 
			
		||||
                     zeros[2] + 1);
 | 
			
		||||
      dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
 | 
			
		||||
                     zeros[3] + zero_offset);
 | 
			
		||||
                     zeros[3] + 1);
 | 
			
		||||
 | 
			
		||||
      for (int m = 0; m < m_count; m++) {
 | 
			
		||||
        block_c[m][0] =
 | 
			
		||||
@ -742,8 +730,7 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
 | 
			
		||||
                                const uint32_t* b_gptq_qzeros,
 | 
			
		||||
                                const half* b_gptq_scales, const int* b_q_perm,
 | 
			
		||||
                                half* c, int size_m, int size_n, int size_k,
 | 
			
		||||
                                int m_count, int groups, bool use_v2_format,
 | 
			
		||||
                                int bit) {
 | 
			
		||||
                                int m_count, int groups, int bit) {
 | 
			
		||||
  dim3 blockDim, gridDim;
 | 
			
		||||
  blockDim.x = BLOCK_KN_SIZE;
 | 
			
		||||
  blockDim.y = 1;
 | 
			
		||||
@ -756,23 +743,20 @@ void gemm_half_q_half_cuda_part(const half* a, const uint32_t* b_q_weight,
 | 
			
		||||
      pick_gemm_half_q_half_gptq_kernel(true, m_count, bit);
 | 
			
		||||
 | 
			
		||||
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 | 
			
		||||
  kernel<<<gridDim, blockDim, 0, stream>>>(
 | 
			
		||||
      a, b_q_weight, b_gptq_qzeros, b_gptq_scales, c, size_m, size_n, size_k,
 | 
			
		||||
      groups, use_v2_format, b_q_perm);
 | 
			
		||||
  kernel<<<gridDim, blockDim, 0, stream>>>(a, b_q_weight, b_gptq_qzeros,
 | 
			
		||||
                                           b_gptq_scales, c, size_m, size_n,
 | 
			
		||||
                                           size_k, groups, b_q_perm);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
__global__ void reconstruct_exllama_8bit_kernel(
 | 
			
		||||
    const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
 | 
			
		||||
    const uint32_t* __restrict__ b_gptq_qzeros,
 | 
			
		||||
    const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
 | 
			
		||||
    const int groups, const bool use_v2_format, half* __restrict__ b) {
 | 
			
		||||
    const int groups, half* __restrict__ b) {
 | 
			
		||||
  MatrixView_half_rw b_(b, size_k, size_n);
 | 
			
		||||
  MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
 | 
			
		||||
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
 | 
			
		||||
  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
 | 
			
		||||
 | 
			
		||||
@ -828,13 +812,13 @@ __global__ void reconstruct_exllama_8bit_kernel(
 | 
			
		||||
 | 
			
		||||
      half2 dq[4][4];
 | 
			
		||||
      dequant_8bit_8(load_int4[0].x, load_int4[1].x, dq[0], size_n,
 | 
			
		||||
                     zeros[0] + zero_offset);
 | 
			
		||||
                     zeros[0] + 1);
 | 
			
		||||
      dequant_8bit_8(load_int4[0].y, load_int4[1].y, dq[1], size_n,
 | 
			
		||||
                     zeros[1] + zero_offset);
 | 
			
		||||
                     zeros[1] + 1);
 | 
			
		||||
      dequant_8bit_8(load_int4[0].z, load_int4[1].z, dq[2], size_n,
 | 
			
		||||
                     zeros[2] + zero_offset);
 | 
			
		||||
                     zeros[2] + 1);
 | 
			
		||||
      dequant_8bit_8(load_int4[0].w, load_int4[1].w, dq[3], size_n,
 | 
			
		||||
                     zeros[3] + zero_offset);
 | 
			
		||||
                     zeros[3] + 1);
 | 
			
		||||
 | 
			
		||||
      // half* dqh = (half*)dq;
 | 
			
		||||
      if (b_q_perm) {
 | 
			
		||||
@ -865,14 +849,11 @@ __global__ void reconstruct_exllama_4bit_kernel(
 | 
			
		||||
    const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
 | 
			
		||||
    const uint32_t* __restrict__ b_gptq_qzeros,
 | 
			
		||||
    const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
 | 
			
		||||
    const int groups, const bool use_v2_format, half* __restrict__ b) {
 | 
			
		||||
    const int groups, half* __restrict__ b) {
 | 
			
		||||
  MatrixView_half_rw b_(b, size_k, size_n);
 | 
			
		||||
  MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
 | 
			
		||||
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
 | 
			
		||||
  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
 | 
			
		||||
 | 
			
		||||
@ -907,10 +888,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
 | 
			
		||||
  half2 y1y16[4][2];
 | 
			
		||||
  b_gptq_qzeros_.item4(zeros, group, n);
 | 
			
		||||
  b_gptq_scales_.item4_h2(scales, group, n);
 | 
			
		||||
  dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
 | 
			
		||||
  dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
 | 
			
		||||
  dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
 | 
			
		||||
  dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
 | 
			
		||||
  dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
 | 
			
		||||
  dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
 | 
			
		||||
  dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
 | 
			
		||||
  dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
 | 
			
		||||
 | 
			
		||||
  __syncthreads();
 | 
			
		||||
 | 
			
		||||
@ -923,10 +904,10 @@ __global__ void reconstruct_exllama_4bit_kernel(
 | 
			
		||||
      nextgroup += groupsize;
 | 
			
		||||
      b_gptq_qzeros_.item4(zeros, group, n);
 | 
			
		||||
      b_gptq_scales_.item4_h2(scales, group, n);
 | 
			
		||||
      dequant_4bit_8_prep_zero(zeros[0] + zero_offset, z1z16[0], y1y16[0]);
 | 
			
		||||
      dequant_4bit_8_prep_zero(zeros[1] + zero_offset, z1z16[1], y1y16[1]);
 | 
			
		||||
      dequant_4bit_8_prep_zero(zeros[2] + zero_offset, z1z16[2], y1y16[2]);
 | 
			
		||||
      dequant_4bit_8_prep_zero(zeros[3] + zero_offset, z1z16[3], y1y16[3]);
 | 
			
		||||
      dequant_4bit_8_prep_zero(zeros[0] + 1, z1z16[0], y1y16[0]);
 | 
			
		||||
      dequant_4bit_8_prep_zero(zeros[1] + 1, z1z16[1], y1y16[1]);
 | 
			
		||||
      dequant_4bit_8_prep_zero(zeros[2] + 1, z1z16[2], y1y16[2]);
 | 
			
		||||
      dequant_4bit_8_prep_zero(zeros[3] + 1, z1z16[3], y1y16[3]);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    for (int p = 0; p < 4; p++) {
 | 
			
		||||
@ -973,14 +954,11 @@ __global__ void reconstruct_exllama_3bit_kernel(
 | 
			
		||||
    const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
 | 
			
		||||
    const uint32_t* __restrict__ b_gptq_qzeros,
 | 
			
		||||
    const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
 | 
			
		||||
    const int groups, const bool use_v2_format, half* __restrict__ b) {
 | 
			
		||||
    const int groups, half* __restrict__ b) {
 | 
			
		||||
  MatrixView_half_rw b_(b, size_k, size_n);
 | 
			
		||||
  MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
 | 
			
		||||
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
 | 
			
		||||
  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
 | 
			
		||||
 | 
			
		||||
@ -1038,13 +1016,13 @@ __global__ void reconstruct_exllama_3bit_kernel(
 | 
			
		||||
 | 
			
		||||
      half2 dq[4][16];
 | 
			
		||||
      dequant_3bit_32(load_int4[0].x, load_int4[1].x, load_int4[2].x, dq[0],
 | 
			
		||||
                      size_n, zeros[0] + zero_offset);
 | 
			
		||||
                      size_n, zeros[0] + 1);
 | 
			
		||||
      dequant_3bit_32(load_int4[0].y, load_int4[1].y, load_int4[2].y, dq[1],
 | 
			
		||||
                      size_n, zeros[1] + zero_offset);
 | 
			
		||||
                      size_n, zeros[1] + 1);
 | 
			
		||||
      dequant_3bit_32(load_int4[0].z, load_int4[1].z, load_int4[2].z, dq[2],
 | 
			
		||||
                      size_n, zeros[2] + zero_offset);
 | 
			
		||||
                      size_n, zeros[2] + 1);
 | 
			
		||||
      dequant_3bit_32(load_int4[0].w, load_int4[1].w, load_int4[2].w, dq[3],
 | 
			
		||||
                      size_n, zeros[3] + zero_offset);
 | 
			
		||||
                      size_n, zeros[3] + 1);
 | 
			
		||||
 | 
			
		||||
      if (b_q_perm) {
 | 
			
		||||
        for (int j = 0; j < 16; j++) {
 | 
			
		||||
@ -1074,14 +1052,11 @@ __global__ void reconstruct_exllama_2bit_kernel(
 | 
			
		||||
    const uint32_t* __restrict__ b_q_weight, const int* __restrict__ b_q_perm,
 | 
			
		||||
    const uint32_t* __restrict__ b_gptq_qzeros,
 | 
			
		||||
    const half* __restrict__ b_gptq_scales, const int size_k, const int size_n,
 | 
			
		||||
    const int groups, const bool use_v2_format, half* __restrict__ b) {
 | 
			
		||||
    const int groups, half* __restrict__ b) {
 | 
			
		||||
  MatrixView_half_rw b_(b, size_k, size_n);
 | 
			
		||||
  MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
 | 
			
		||||
  MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
 | 
			
		||||
  auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
 | 
			
		||||
 | 
			
		||||
@ -1133,10 +1108,10 @@ __global__ void reconstruct_exllama_2bit_kernel(
 | 
			
		||||
      int4 load_int4 = *b_ptr4;
 | 
			
		||||
 | 
			
		||||
      half2 dq[4][8];
 | 
			
		||||
      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + zero_offset);
 | 
			
		||||
      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + zero_offset);
 | 
			
		||||
      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + zero_offset);
 | 
			
		||||
      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + zero_offset);
 | 
			
		||||
      dequant_2bit_16(load_int4.x, dq[0], size_n, zeros[0] + 1);
 | 
			
		||||
      dequant_2bit_16(load_int4.y, dq[1], size_n, zeros[1] + 1);
 | 
			
		||||
      dequant_2bit_16(load_int4.z, dq[2], size_n, zeros[2] + 1);
 | 
			
		||||
      dequant_2bit_16(load_int4.w, dq[3], size_n, zeros[3] + 1);
 | 
			
		||||
 | 
			
		||||
      b_ptr += size_n;
 | 
			
		||||
      // half* dqh = (half*)dq;
 | 
			
		||||
@ -1168,7 +1143,7 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
 | 
			
		||||
                         const uint32_t* b_gptq_qzeros,
 | 
			
		||||
                         const half* b_gptq_scales, const int* b_q_perm,
 | 
			
		||||
                         half* out, int height, int width, int groups,
 | 
			
		||||
                         bool use_v2_format, int bit) {
 | 
			
		||||
                         int bit) {
 | 
			
		||||
  dim3 blockDim, gridDim;
 | 
			
		||||
  blockDim.x = BLOCK_KN_SIZE;
 | 
			
		||||
  blockDim.y = 1;
 | 
			
		||||
@ -1187,14 +1162,14 @@ void reconstruct_exllama(const uint32_t* b_q_weight,
 | 
			
		||||
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 | 
			
		||||
  reconstruct_exllama_kernel<<<gridDim, blockDim, 0, stream>>>(
 | 
			
		||||
      b_q_weight, b_q_perm, b_gptq_qzeros, b_gptq_scales, height, width, groups,
 | 
			
		||||
      use_v2_format, out);
 | 
			
		||||
      out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
__global__ void gemm_half_q_half_alt_4bit_kernel(
 | 
			
		||||
    const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
 | 
			
		||||
    half* __restrict__ mul, const half* __restrict__ scales,
 | 
			
		||||
    const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
 | 
			
		||||
    int batch, int height, int width, bool use_v2_format) {
 | 
			
		||||
    int batch, int height, int width) {
 | 
			
		||||
  int zero_width = width / 8;
 | 
			
		||||
  int vec_height = height * 4;
 | 
			
		||||
  const int blockwidth2 = BLOCK_KN_SIZE / 2;
 | 
			
		||||
@ -1204,9 +1179,6 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
 | 
			
		||||
  int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
 | 
			
		||||
  auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
 | 
			
		||||
  if (threadIdx.x < h_end) {
 | 
			
		||||
    for (int m = 0; m < b_end; ++m) {
 | 
			
		||||
@ -1251,11 +1223,10 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
 | 
			
		||||
      half2 zero = __halves2half2(
 | 
			
		||||
          __hmul(scale_f,
 | 
			
		||||
                 __int2half_rn(-((zeros[g * zero_width + z_w] >> z_mod) & 0xF) -
 | 
			
		||||
                               zero_offset)),
 | 
			
		||||
          __hmul(
 | 
			
		||||
              scale_f2,
 | 
			
		||||
              __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) -
 | 
			
		||||
                            zero_offset)));
 | 
			
		||||
                               1)),
 | 
			
		||||
          __hmul(scale_f2,
 | 
			
		||||
                 __int2half_rn(
 | 
			
		||||
                     -((zeros[g2 * zero_width + z_w] >> z_mod) & 0xF) - 1)));
 | 
			
		||||
      scales_tmp[tmp_k] = scale;
 | 
			
		||||
      zeros_tmp[tmp_k] = zero;
 | 
			
		||||
    }
 | 
			
		||||
@ -1297,7 +1268,7 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
 | 
			
		||||
    const half2* __restrict__ vec, const uint32_t* __restrict__ mat,
 | 
			
		||||
    half* __restrict__ mul, const half* __restrict__ scales,
 | 
			
		||||
    const uint32_t* __restrict__ zeros, const int* __restrict__ g_idx,
 | 
			
		||||
    int batch, int height, int width, bool use_v2_format) {
 | 
			
		||||
    int batch, int height, int width) {
 | 
			
		||||
  int zero_width = width / 4;
 | 
			
		||||
  int vec_height = height * 2;
 | 
			
		||||
  const int blockwidth2 = BLOCK_KN_SIZE / 2;
 | 
			
		||||
@ -1307,9 +1278,6 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
 | 
			
		||||
  int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
 | 
			
		||||
  auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
 | 
			
		||||
  if (threadIdx.x < h_end) {
 | 
			
		||||
    for (int m = 0; m < b_end; ++m) {
 | 
			
		||||
@ -1344,13 +1312,12 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
 | 
			
		||||
      half scale_f2 = scales[g2 * width + w];
 | 
			
		||||
      half2 scale = __halves2half2(scale_f, scale_f2);
 | 
			
		||||
      half2 zero = __halves2half2(
 | 
			
		||||
          __hmul(scale_f, __int2half_rn(
 | 
			
		||||
                              -((zeros[g * zero_width + z_w] >> z_mod) & 0xff) -
 | 
			
		||||
                              zero_offset)),
 | 
			
		||||
          __hmul(
 | 
			
		||||
              scale_f2,
 | 
			
		||||
              __int2half_rn(-((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) -
 | 
			
		||||
                            zero_offset)));
 | 
			
		||||
          __hmul(scale_f,
 | 
			
		||||
                 __int2half_rn(
 | 
			
		||||
                     -((zeros[g * zero_width + z_w] >> z_mod) & 0xff) - 1)),
 | 
			
		||||
          __hmul(scale_f2,
 | 
			
		||||
                 __int2half_rn(
 | 
			
		||||
                     -((zeros[g2 * zero_width + z_w] >> z_mod) & 0xff) - 1)));
 | 
			
		||||
      scales_tmp[tmp_k] = scale;
 | 
			
		||||
      zeros_tmp[tmp_k] = zero;
 | 
			
		||||
    }
 | 
			
		||||
@ -1388,7 +1355,7 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
 | 
			
		||||
                          const uint32_t* b_gptq_qzeros,
 | 
			
		||||
                          const half* b_gptq_scales, const int* b_g_idx,
 | 
			
		||||
                          half* c, int size_m, int size_n, int size_k,
 | 
			
		||||
                          bool use_v2_format, int bit) {
 | 
			
		||||
                          int bit) {
 | 
			
		||||
  dim3 blockDim, gridDim;
 | 
			
		||||
  blockDim.x = BLOCK_KN_SIZE;
 | 
			
		||||
  blockDim.y = 1;
 | 
			
		||||
@ -1405,15 +1372,17 @@ void gemm_half_q_half_alt(const half* a, const uint32_t* b_q_weight,
 | 
			
		||||
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 | 
			
		||||
  kernel<<<gridDim, blockDim, 0, stream>>>(
 | 
			
		||||
      (const half2*)a, b_q_weight, c, b_gptq_scales, b_gptq_qzeros, b_g_idx,
 | 
			
		||||
      size_m, size_k / 32 * bit, size_n, use_v2_format);
 | 
			
		||||
      size_m, size_k / 32 * bit, size_n);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <class T, int bit>
 | 
			
		||||
__global__ void reconstruct_gptq_kernel(
 | 
			
		||||
    const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
 | 
			
		||||
    const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
 | 
			
		||||
    const int height, const int width, const int group,
 | 
			
		||||
    const bool use_v2_format, half* __restrict__ out) {
 | 
			
		||||
__global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
 | 
			
		||||
                                        const half* __restrict__ w_scales,
 | 
			
		||||
                                        const uint32_t* __restrict__ w_zeros,
 | 
			
		||||
                                        const int* __restrict__ g_idx,
 | 
			
		||||
                                        const int height, const int width,
 | 
			
		||||
                                        const int group,
 | 
			
		||||
                                        half* __restrict__ out) {
 | 
			
		||||
  // Start of block
 | 
			
		||||
 | 
			
		||||
  auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
 | 
			
		||||
@ -1426,9 +1395,6 @@ __global__ void reconstruct_gptq_kernel(
 | 
			
		||||
  MatrixView_half w_scales_(w_scales, group, width);
 | 
			
		||||
  T w_zeros_(w_zeros, group, width);
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  uint32_t w_read = w[blockIdx.y * width + column];
 | 
			
		||||
  half* out_ptr = out_.item_ptr(row, column);
 | 
			
		||||
 | 
			
		||||
@ -1436,7 +1402,7 @@ __global__ void reconstruct_gptq_kernel(
 | 
			
		||||
  for (int s = 0; s < 32; s += bit) {
 | 
			
		||||
    int group = g_idx[row + s / bit];
 | 
			
		||||
    half w_scale = w_scales_.item(group, column);
 | 
			
		||||
    uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
 | 
			
		||||
    uint32_t w_zero = w_zeros_.item(group, column) + 1;
 | 
			
		||||
    half w_item =
 | 
			
		||||
        __hmul(__int2half_rn((int)((w_read >> s) & ((1 << bit) - 1)) - w_zero),
 | 
			
		||||
               w_scale);
 | 
			
		||||
@ -1449,7 +1415,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
 | 
			
		||||
    const uint32_t* __restrict__ w, const half* __restrict__ w_scales,
 | 
			
		||||
    const uint32_t* __restrict__ w_zeros, const int* __restrict__ g_idx,
 | 
			
		||||
    const int height, const int width, const int group,
 | 
			
		||||
    const bool use_v2_format, half* __restrict__ out) {
 | 
			
		||||
    half* __restrict__ out) {
 | 
			
		||||
  // Start of block
 | 
			
		||||
  auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
 | 
			
		||||
  auto row = blockIdx.y * 32;
 | 
			
		||||
@ -1461,9 +1427,6 @@ __global__ void reconstruct_gptq_3bit_kernel(
 | 
			
		||||
  MatrixView_half w_scales_(w_scales, group, width);
 | 
			
		||||
  MatrixView_q3_row w_zeros_(w_zeros, group, width);
 | 
			
		||||
 | 
			
		||||
  // GPTQv2 and GPTQv1 handles zero points differently
 | 
			
		||||
  int zero_offset = use_v2_format ? 0 : 1;
 | 
			
		||||
 | 
			
		||||
  uint32_t w1 = w[(blockIdx.y * 3) * width + column];
 | 
			
		||||
  uint32_t w2 = w[(blockIdx.y * 3 + 1) * width + column];
 | 
			
		||||
  uint32_t w3 = w[(blockIdx.y * 3 + 2) * width + column];
 | 
			
		||||
@ -1473,7 +1436,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
 | 
			
		||||
  for (int i = 0; i < 32; i += 1) {
 | 
			
		||||
    int group = g_idx[row + i];
 | 
			
		||||
    half w_scale = w_scales_.item(group, column);
 | 
			
		||||
    uint32_t w_zero = w_zeros_.item(group, column) + zero_offset;
 | 
			
		||||
    uint32_t w_zero = w_zeros_.item(group, column) + 1;
 | 
			
		||||
    int w_item;
 | 
			
		||||
    if (i == 10) {
 | 
			
		||||
      w_item = (w1 >> 30) | ((w2 << 2) & 0x4);
 | 
			
		||||
@ -1493,8 +1456,7 @@ __global__ void reconstruct_gptq_3bit_kernel(
 | 
			
		||||
 | 
			
		||||
void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
 | 
			
		||||
                      const half* b_gptq_scales, const int* b_g_idx, half* out,
 | 
			
		||||
                      int height, int width, int groups, bool use_v2_format,
 | 
			
		||||
                      int bit) {
 | 
			
		||||
                      int height, int width, int groups, int bit) {
 | 
			
		||||
  dim3 blockDim, gridDim;
 | 
			
		||||
  blockDim.x = BLOCK_KN_SIZE;
 | 
			
		||||
  blockDim.y = 1;
 | 
			
		||||
@ -1514,7 +1476,7 @@ void reconstruct_gptq(const uint32_t* b_q_weight, const uint32_t* b_gptq_qzeros,
 | 
			
		||||
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 | 
			
		||||
  kernel<<<gridDim, blockDim, 0, stream>>>(b_q_weight, b_gptq_scales,
 | 
			
		||||
                                           b_gptq_qzeros, b_g_idx, height,
 | 
			
		||||
                                           width, groups, use_v2_format, out);
 | 
			
		||||
                                           width, groups, out);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
 | 
			
		||||
@ -1522,8 +1484,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
 | 
			
		||||
                           const uint32_t* b_gptq_qzeros,
 | 
			
		||||
                           const half* b_gptq_scales, const int* b_g_idx,
 | 
			
		||||
                           half* c, half* temp_dq, int size_m, int size_n,
 | 
			
		||||
                           int size_k, int groups, bool use_exllama,
 | 
			
		||||
                           bool use_v2_format, int bit) {
 | 
			
		||||
                           int size_k, int groups, bool use_exllama, int bit) {
 | 
			
		||||
  bool use_reconstruct;
 | 
			
		||||
  if (use_exllama) {
 | 
			
		||||
    use_reconstruct = ((bit == 8 && size_m > MAX_Q_GEMM_ROWS_8BIT) ||
 | 
			
		||||
@ -1537,10 +1498,10 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
 | 
			
		||||
    // Reconstruct FP16 matrix, then cuBLAS
 | 
			
		||||
    if (use_exllama) {
 | 
			
		||||
      reconstruct_exllama(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
 | 
			
		||||
                          temp_dq, size_k, size_n, groups, use_v2_format, bit);
 | 
			
		||||
                          temp_dq, size_k, size_n, groups, bit);
 | 
			
		||||
    } else {
 | 
			
		||||
      reconstruct_gptq(b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
 | 
			
		||||
                       temp_dq, size_k, size_n, groups, use_v2_format, bit);
 | 
			
		||||
                       temp_dq, size_k, size_n, groups, bit);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    const half alpha = __float2half(1.0f);
 | 
			
		||||
@ -1556,18 +1517,18 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
 | 
			
		||||
    if (max_chunks) {
 | 
			
		||||
      gemm_half_q_half_cuda_part(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
 | 
			
		||||
                                 b_g_idx, c, last_chunk, size_n, size_k,
 | 
			
		||||
                                 BLOCK_M_SIZE_MAX, groups, use_v2_format, bit);
 | 
			
		||||
                                 BLOCK_M_SIZE_MAX, groups, bit);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (last_chunk_size) {
 | 
			
		||||
      gemm_half_q_half_cuda_part(
 | 
			
		||||
          a + last_chunk * size_k, b_q_weight, b_gptq_qzeros, b_gptq_scales,
 | 
			
		||||
          b_g_idx, c + last_chunk * size_n, last_chunk_size, size_n, size_k,
 | 
			
		||||
          last_chunk_size, groups, use_v2_format, bit);
 | 
			
		||||
      gemm_half_q_half_cuda_part(a + last_chunk * size_k, b_q_weight,
 | 
			
		||||
                                 b_gptq_qzeros, b_gptq_scales, b_g_idx,
 | 
			
		||||
                                 c + last_chunk * size_n, last_chunk_size,
 | 
			
		||||
                                 size_n, size_k, last_chunk_size, groups, bit);
 | 
			
		||||
    }
 | 
			
		||||
  } else {
 | 
			
		||||
    gemm_half_q_half_alt(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
 | 
			
		||||
                         c, size_m, size_n, size_k, use_v2_format, bit);
 | 
			
		||||
                         c, size_m, size_n, size_k, bit);
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
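For context on the dispatch the removed use_v2_format argument was threaded through: gemm_half_q_half_cuda either reconstructs the FP16 weight matrix (reconstruct_exllama / reconstruct_gptq) and hands dense operands to cuBLAS, or runs the fused kernels above over row chunks of at most BLOCK_M_SIZE_MAX plus one remainder chunk. A small PyTorch sketch of that chunking, with a stand-in block size (assumption; the real constant is defined in the CUDA source):

import torch

BLOCK_M = 8                      # stand-in for BLOCK_M_SIZE_MAX

def gemm_chunked(a, b, block_m=BLOCK_M):
    size_m = a.shape[0]
    last_chunk = (size_m // block_m) * block_m       # rows covered by full chunks
    last_chunk_size = size_m - last_chunk            # remainder rows
    out = torch.empty(size_m, b.shape[1], dtype=a.dtype)
    if last_chunk:
        out[:last_chunk] = a[:last_chunk] @ b        # full-size chunks
    if last_chunk_size:
        out[last_chunk:] = a[last_chunk:] @ b        # one remainder launch
    return out

a, b = torch.randn(19, 64), torch.randn(64, 32)
assert torch.allclose(gemm_chunked(a, b), a @ b, atol=1e-5)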
@@ -1854,7 +1815,7 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        torch::Tensor b_gptq_qzeros,
                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
                        bool use_exllama, bool use_v2_format, int64_t bit) {
                        bool use_exllama, int64_t bit) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
  auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
  at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
@@ -1872,7 +1833,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
      c.size(1),              // n
      a.size(1),              // k
      b_gptq_qzeros.size(0),  // group number
      use_exllama, use_v2_format, bit);
      use_exllama, bit);
  return c;
}
csrc/sampler.cu

@@ -54,10 +54,15 @@ static inline __device__ uint16_t extractBinIdx(float x) {
  return 511 - (tmp.u16 >> 7);
}
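Only the tail of extractBinIdx is visible in this hunk. The property the kernels below rely on is that the 512-entry histogram is indexed so larger logits fall into lower-numbered bins, which lets a scan from bin 0 locate the threshold bin containing the 2048th largest value. A rough Python stand-in for that property (assumption: this is not the bit-level trick the kernel uses, just a monotone binning with the same ordering):

import numpy as np

def extract_bin_idx(x, num_bins=512, lo=-30.0, hi=30.0):
    # Larger logits -> smaller bin index, mirroring `511 - (tmp.u16 >> 7)`.
    t = np.clip((x - lo) / (hi - lo), 0.0, 1.0 - 1e-9)
    return (num_bins - 1) - (t * num_bins).astype(np.int32)

logits = np.array([-5.0, 0.0, 3.0, 10.0])
print(extract_bin_idx(logits))     # decreasing bin indices for increasing logits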
template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
 | 
			
		||||
__device__ void topKPerRowJob(const float* logits, const int rowStart,
 | 
			
		||||
                              const int rowEnd, const int rowIdx,
 | 
			
		||||
                              int* outIndices, int stride0, int stride1) {
 | 
			
		||||
template <int kNumThreadsPerBlock = 512>
 | 
			
		||||
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
 | 
			
		||||
                                  const int* rowEnds, int* outIndices,
 | 
			
		||||
                                  float* outLogits, int stride0, int stride1) {
 | 
			
		||||
  // The number of bins in the histogram.
 | 
			
		||||
  static constexpr int kNumBins = 512;
 | 
			
		||||
 | 
			
		||||
  // The top-k width.
 | 
			
		||||
  static constexpr int kTopK = 2048;
 | 
			
		||||
  // The number of elements per thread for the final top-k sort.
 | 
			
		||||
  static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
 | 
			
		||||
  // The class to sort the elements during the final top-k sort.
 | 
			
		||||
@ -98,11 +103,17 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
 | 
			
		||||
  __shared__ int smemHistogram[kNumBins];
 | 
			
		||||
  // Shared memory to store the selected indices.
 | 
			
		||||
  __shared__ int smemIndices[kTopK];
 | 
			
		||||
  // Shared memory to store the selected logits.
 | 
			
		||||
  __shared__ float smemLogits[kTopK];
 | 
			
		||||
  // Shared memory to store the threshold bin.
 | 
			
		||||
  __shared__ int smemThresholdBinIdx[1];
 | 
			
		||||
  // Shared memory counter to register the candidates for the final phase.
 | 
			
		||||
  __shared__ int smemFinalDstIdx[1];
 | 
			
		||||
 | 
			
		||||
  // The row computed by this block.
 | 
			
		||||
  int rowIdx = blockIdx.x;
 | 
			
		||||
  // The range of logits within the row.
 | 
			
		||||
  int rowStart = rowStarts[rowIdx], rowEnd = rowEnds[rowIdx];
 | 
			
		||||
  // The length of the row.
 | 
			
		||||
  int rowLen = rowEnd - rowStart;
 | 
			
		||||
 | 
			
		||||
@ -113,10 +124,13 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
 | 
			
		||||
         rowIt += kNumThreadsPerBlock) {
 | 
			
		||||
      int idx = rowStart + rowIt;
 | 
			
		||||
      outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
 | 
			
		||||
      outLogits[rowIdx * kTopK + rowIt] =
 | 
			
		||||
          logits[rowIdx * stride0 + idx * stride1];
 | 
			
		||||
    }
 | 
			
		||||
    for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
 | 
			
		||||
         rowIt += kNumThreadsPerBlock) {
 | 
			
		||||
      outIndices[rowIdx * kTopK + rowIt] = -1;
 | 
			
		||||
      outLogits[rowIdx * kTopK + rowIt] = -FLT_MAX;
 | 
			
		||||
    }
 | 
			
		||||
    return;
 | 
			
		||||
  }
 | 
			
		||||
@ -187,6 +201,7 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
 | 
			
		||||
    uint16_t idx = extractBinIdx(logit);
 | 
			
		||||
    if (idx < thresholdBinIdx) {
 | 
			
		||||
      int dstIdx = atomicAdd(&smemHistogram[idx], 1);
 | 
			
		||||
      smemLogits[dstIdx] = logit;
 | 
			
		||||
      smemIndices[dstIdx] = rowIt;
 | 
			
		||||
    } else if (idx == thresholdBinIdx) {
 | 
			
		||||
      int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
 | 
			
		||||
@ -235,6 +250,7 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
 | 
			
		||||
    int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
 | 
			
		||||
    int dstIdx = baseIdx + srcIdx;
 | 
			
		||||
    if (dstIdx < kTopK) {
 | 
			
		||||
      smemLogits[dstIdx] = finalLogits[ii];
 | 
			
		||||
      smemIndices[dstIdx] = finalIndices[ii];
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
@ -242,58 +258,31 @@ __device__ void topKPerRowJob(const float* logits, const int rowStart,
 | 
			
		||||
  // Make sure the data is in shared memory.
 | 
			
		||||
  __syncthreads();
 | 
			
		||||
 | 
			
		||||
  // The topK logits.
 | 
			
		||||
  float topKLogits[kNumTopKItemsPerThread];
 | 
			
		||||
  // The topK indices.
 | 
			
		||||
  int topKIndices[kNumTopKItemsPerThread];
 | 
			
		||||
 | 
			
		||||
// Load from shared memory.
 | 
			
		||||
#pragma unroll
 | 
			
		||||
  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
 | 
			
		||||
    topKLogits[ii] = smemLogits[ii * kNumThreadsPerBlock + threadIdx.x];
 | 
			
		||||
    topKIndices[ii] = smemIndices[ii * kNumThreadsPerBlock + threadIdx.x];
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Sort the elements.
 | 
			
		||||
  TopKSort(smemFinal.topKSort)
 | 
			
		||||
      .SortDescendingBlockedToStriped(topKLogits, topKIndices);
 | 
			
		||||
 | 
			
		||||
// Store to global memory.
 | 
			
		||||
#pragma unroll
 | 
			
		||||
  for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
 | 
			
		||||
    int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
 | 
			
		||||
    outIndices[offset] =
 | 
			
		||||
        smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
 | 
			
		||||
    outIndices[offset] = topKIndices[ii] - rowStart;
 | 
			
		||||
    outLogits[offset] = topKLogits[ii];
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <int kNumThreadsPerBlock = 512>
 | 
			
		||||
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
 | 
			
		||||
                                  const int* rowEnds, int* outIndices,
 | 
			
		||||
                                  int stride0, int stride1) {
 | 
			
		||||
  // The number of bins in the histogram.
 | 
			
		||||
  static constexpr int kNumBins = 512;
 | 
			
		||||
 | 
			
		||||
  // The top-k width.
 | 
			
		||||
  static constexpr int kTopK = 2048;
 | 
			
		||||
 | 
			
		||||
  // The row computed by this block.
 | 
			
		||||
  int rowIdx = blockIdx.x;
 | 
			
		||||
 | 
			
		||||
  // The range of logits within the row.
 | 
			
		||||
  int rowStart = rowStarts[rowIdx];
 | 
			
		||||
  int rowEnd = rowEnds[rowIdx];
 | 
			
		||||
 | 
			
		||||
  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
 | 
			
		||||
      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
template <int kNumThreadsPerBlock = 512>
 | 
			
		||||
static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
 | 
			
		||||
                                        int* outIndices, int stride0,
 | 
			
		||||
                                        int stride1, int next_n) {
 | 
			
		||||
  // The number of bins in the histogram.
 | 
			
		||||
  static constexpr int kNumBins = 512;
 | 
			
		||||
 | 
			
		||||
  // The top-k width.
 | 
			
		||||
  static constexpr int kTopK = 2048;
 | 
			
		||||
 | 
			
		||||
  // The row computed by this block.
 | 
			
		||||
  int rowIdx = blockIdx.x;
 | 
			
		||||
 | 
			
		||||
  // The range of logits within the row.
 | 
			
		||||
  int rowStart = 0;
 | 
			
		||||
  int seq_len = seqLens[rowIdx / next_n];
 | 
			
		||||
  int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
 | 
			
		||||
 | 
			
		||||
  topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
 | 
			
		||||
      logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
}  // namespace vllm

void apply_repetition_penalties_(
@@ -337,23 +326,10 @@ void apply_repetition_penalties_(
      });
}

void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
                          const torch::Tensor& seqLens, torch::Tensor& indices,
                          int64_t numRows, int64_t stride0, int64_t stride1) {
  // Compute the results on the device.
  constexpr int kNumThreadsPerBlock = 512;
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  vllm::topKPerRowDecode<kNumThreadsPerBlock>
      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
          logits.data_ptr<float>(), seqLens.data_ptr<int>(),
          indices.data_ptr<int>(), static_cast<int>(stride0),
          static_cast<int>(stride1), static_cast<int>(next_n));
}

void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
                   const torch::Tensor& rowEnds, torch::Tensor& indices,
                   int64_t numRows, int64_t stride0, int64_t stride1) {
                   torch::Tensor& values, int64_t numRows, int64_t stride0,
                   int64_t stride1) {
  // Compute the results on the device.
  constexpr int kNumThreadsPerBlock = 512;
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
@@ -362,5 +338,6 @@ void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
      <<<numRows, kNumThreadsPerBlock, 0, stream>>>(
          logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
          rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
          static_cast<int>(stride0), static_cast<int>(stride1));
          values.data_ptr<float>(), static_cast<int>(stride0),
          static_cast<int>(stride1));
}
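After this change top_k_per_row returns both the indices and the logits of the selected entries, while the old top_k_per_row_decode host wrapper (which derived each row's range from seq_lens and next_n) is removed. A PyTorch reference sketch of what the host function above now produces (assumptions: logits is a contiguous 2-D tensor, and unlike the kernel this sketch sorts even rows shorter than 2048, which the kernel copies through in their original order):

import torch

TOP_K = 2048    # matches kTopK in the kernel

def top_k_per_row_reference(logits, row_starts, row_ends):
    num_rows = row_starts.numel()
    indices = torch.full((num_rows, TOP_K), -1, dtype=torch.int32)
    values = torch.full((num_rows, TOP_K), float("-inf"))
    for r in range(num_rows):
        row = logits[r, row_starts[r]:row_ends[r]]
        k = min(TOP_K, row.numel())
        top = torch.topk(row, k)
        indices[r, :k] = top.indices.to(torch.int32)   # offsets relative to rowStart
        values[r, :k] = top.values                     # padded with -1 / -inf
    return indices, values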
@@ -185,16 +185,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // Optimized top-k per row operation
  ops.def(
      "top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
      "Tensor! indices, int numRows, int stride0, "
      "Tensor! indices, Tensor! values, int numRows, int stride0, "
      "int stride1) -> ()");
  ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);

  ops.def(
      "top_k_per_row_decode(Tensor logits, int next_n, "
      "Tensor seq_lens, Tensor! indices, int numRows, "
      "int stride0, int stride1) -> ()");
  ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);

  // Layernorm-quant
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
@@ -557,8 +551,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // to prevent the meta function registry.
  ops.def(
      "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
      "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool "
      "use_v2_format, int bit) "
      "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
      "-> Tensor",
      {stride_tag});
  ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
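Putting the two schema changes above together, this is roughly how callers invoke the registered ops after this diff (a sketch, assuming the extension is exposed as torch.ops._C as elsewhere in vLLM, and that the caller pre-allocates the output tensors; with top_k_per_row_decode unregistered, decode-time callers go through top_k_per_row as well):

import torch

def call_top_k_per_row(logits, row_starts, row_ends, k=2048):
    num_rows = row_starts.numel()
    indices = torch.empty(num_rows, k, dtype=torch.int32, device=logits.device)
    values = torch.empty(num_rows, k, dtype=torch.float32, device=logits.device)
    torch.ops._C.top_k_per_row(logits, row_starts, row_ends, indices, values,
                               num_rows, logits.stride(0), logits.stride(1))
    return indices, values  # `values` is the output added by this diff

def call_gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx,
                   use_exllama=True, bit=4):
    # `use_v2_format` is gone from the schema after this change.
    return torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
                                  b_g_idx, use_exllama, bit)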
@ -5,7 +5,7 @@
 | 
			
		||||
# docs/contributing/dockerfile/dockerfile.md and
 | 
			
		||||
# docs/assets/contributing/dockerfile-stages-dependency.png
 | 
			
		||||
 | 
			
		||||
ARG CUDA_VERSION=12.9.1
 | 
			
		||||
ARG CUDA_VERSION=12.8.1
 | 
			
		||||
ARG PYTHON_VERSION=3.12
 | 
			
		||||
 | 
			
		||||
# By parameterizing the base images, we allow third-party to use their own
 | 
			
		||||
@ -132,9 +132,7 @@ WORKDIR /workspace
 | 
			
		||||
COPY requirements/common.txt requirements/common.txt
 | 
			
		||||
COPY requirements/cuda.txt requirements/cuda.txt
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
 | 
			
		||||
    uv pip install --python /opt/venv/bin/python3 --pre apache-tvm-ffi==0.1.0b15 \
 | 
			
		||||
    && uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
 | 
			
		||||
    uv pip install --python /opt/venv/bin/python3 -r requirements/cuda.txt \
 | 
			
		||||
    --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 | 
			
		||||
 | 
			
		||||
# cuda arch list used by torch
 | 
			
		||||
@ -275,7 +273,6 @@ WORKDIR /vllm-workspace
 | 
			
		||||
ENV DEBIAN_FRONTEND=noninteractive
 | 
			
		||||
ARG TARGETPLATFORM
 | 
			
		||||
 | 
			
		||||
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
 | 
			
		||||
ARG GDRCOPY_CUDA_VERSION=12.8
 | 
			
		||||
# Keep in line with FINAL_BASE_IMAGE
 | 
			
		||||
ARG GDRCOPY_OS_VERSION=Ubuntu22_04
 | 
			
		||||
@ -356,9 +353,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
# Install vllm wheel first, so that torch etc will be installed.
 | 
			
		||||
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
 | 
			
		||||
    --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    # TODO: remove apache-tvm-ffi once FlashInfer is fixed https://github.com/flashinfer-ai/flashinfer/issues/1962
 | 
			
		||||
    uv pip install --system --pre apache-tvm-ffi==0.1.0b15 \
 | 
			
		||||
    && uv pip install --system dist/*.whl --verbose \
 | 
			
		||||
    uv pip install --system dist/*.whl --verbose \
 | 
			
		||||
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
 | 
			
		||||
 | 
			
		||||
# Install FlashInfer pre-compiled kernel cache and binaries
 | 
			
		||||
@ -427,7 +422,6 @@ ARG PYTHON_VERSION
 | 
			
		||||
 | 
			
		||||
ARG PIP_INDEX_URL UV_INDEX_URL
 | 
			
		||||
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
 | 
			
		||||
ARG PYTORCH_CUDA_INDEX_BASE_URL
 | 
			
		||||
 | 
			
		||||
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
 | 
			
		||||
# Reference: https://github.com/astral-sh/uv/pull/1694
 | 
			
		||||
@ -440,8 +434,7 @@ ENV UV_LINK_MODE=copy
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
 | 
			
		||||
    if [ "$CUDA_MAJOR" -ge 12 ]; then \
 | 
			
		||||
        uv pip install --system -r requirements/dev.txt \
 | 
			
		||||
        --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
 | 
			
		||||
        uv pip install --system -r requirements/dev.txt; \
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
# install development dependencies (for testing)
 | 
			
		||||
@ -488,7 +481,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    else \
 | 
			
		||||
        BITSANDBYTES_VERSION="0.46.1"; \
 | 
			
		||||
    fi; \
 | 
			
		||||
    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
 | 
			
		||||
    uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.14.0'
 | 
			
		||||
 | 
			
		||||
ENV VLLM_USAGE_SOURCE production-docker-image
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -31,7 +31,7 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 | 
			
		||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
 | 
			
		||||
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
 | 
			
		||||
    apt-get update -y \
 | 
			
		||||
    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
 | 
			
		||||
    && apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \
 | 
			
		||||
        gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
 | 
			
		||||
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
 | 
			
		||||
    && curl -LsSf https://astral.sh/uv/install.sh | sh
 | 
			
		||||
@ -79,9 +79,6 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
 | 
			
		||||
######################### BUILD IMAGE #########################
 | 
			
		||||
FROM base AS vllm-build
 | 
			
		||||
 | 
			
		||||
ARG max_jobs=32
 | 
			
		||||
ENV MAX_JOBS=${max_jobs}
 | 
			
		||||
 | 
			
		||||
ARG GIT_REPO_CHECK=0
 | 
			
		||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 | 
			
		||||
ARG VLLM_CPU_DISABLE_AVX512=0
 | 
			
		||||
@ -107,20 +104,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
    --mount=type=cache,target=/root/.cache/ccache \
 | 
			
		||||
    --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
 | 
			
		||||
    --mount=type=bind,source=.git,target=.git \
 | 
			
		||||
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
 | 
			
		||||
    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
 | 
			
		||||
 | 
			
		||||
######################### TEST DEPS #########################
 | 
			
		||||
FROM base AS vllm-test-deps
 | 
			
		||||
 | 
			
		||||
WORKDIR /workspace/vllm
 | 
			
		||||
 | 
			
		||||
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
 | 
			
		||||
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
 | 
			
		||||
    cp requirements/test.in requirements/cpu-test.in && \
 | 
			
		||||
    sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
 | 
			
		||||
    sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
 | 
			
		||||
    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
 | 
			
		||||
    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
 | 
			
		||||
    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
 | 
			
		||||
 | 
			
		||||
RUN --mount=type=cache,target=/root/.cache/uv \
 | 
			
		||||
 | 
			
		||||
@@ -1,13 +1,13 @@
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete
ARG TRITON_BRANCH="57c693b6"
ARG TRITON_BRANCH="f9e5bf54"
ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_BRANCH="b2fb6885"
ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="9716b1b8"
ARG AITER_BRANCH="2ab9f4cd"
ARG AITER_REPO="https://github.com/ROCm/aiter.git"

FROM ${BASE_IMAGE} AS base
(Binary image files are not shown in this diff: one image changed, 119 KiB before and after; one image removed, 577 KiB.)
@@ -5,4 +5,4 @@ nav:
  - complete.md
  - run-batch.md
  - vllm bench:
    - bench/**/*.md
    - bench/*.md
@@ -1,9 +0,0 @@
# vllm bench sweep plot

## JSON CLI Arguments

--8<-- "docs/cli/json_tip.inc.md"

## Options

--8<-- "docs/argparse/bench_sweep_plot.md"
Some files were not shown because too many files have changed in this diff.