Compare commits
192 Commits
51c31bc10c
3ad438c66f
203d4f82ac
991143cfcd
8b2d3cbc1b
9765b5c406
430530fc18
97356f3c7e
f510395bbf
6110c39dc8
d8658c8cc1
7bc94a0fdd
756b30a5f3
395aa823ea
26422e477b
f342153b48
27a57cad52
98a42e7078
0267fef52a
4716a32dd4
c0935c96d3
cb40b3ab6b
515386ef3c
a4075cba4d
96aa014d1e
1715056fef
b51c1cc9d2
ce567a2926
d6ea427f04
14ccd94c89
8267b06c30
3492859b68
098e1776ba
10e6322283
6d9aa00fc4
1182607e18
45b6ef6513
1956931436
e24336b5a7
d18f4e73f3
82c540bebf
8f44facddd
e66b629c04
76879342a3
566b57c5c4
0dc72273b8
a979d9771e
8af890a865
dfeb2ecc3a
3a243095e5
64172a976c
f408d05c52
0b4997e05c
c13ad1b7bd
819924e749
01bfb22b41
e67c295b0c
925f3332ca
b0dfa91dd7
56a8652f33
6d93d35308
837e185142
42bc386129
8b268a46a7
41deac4a3d
af9e53496f
f8a12ecc7f
3c5ab9b811
743a0b7402
bfdb1ba5c3
cf2f084d56
f721096d48
e90fc21f2e
ea5f14e6ff
b7050ca7df
c188ecb080
865732342b
4c07dd28c0
3bbff9e5ab
6ebd02bdef
523e30ea0c
f1c0fc3919
6e435de766
426ec4ec67
80e254834d
ba8ae1d84f
84eaa68425
5ee14494e4
4ad521d8b5
9474e89ba4
20478c4d3a
63e8b28a99
cc63d03fbb
2a60c9bd17
c614cfee58
7341c77d69
ef65dcfa6f
6a9c583e73
b37cdce2b1
b30880a762
49eedea373
9fdf3de346
c0c17d4896
097aa0ea22
482b0adf1b
8c654c045f
9101d832e6
93348d9458
abfc4f3387
6b78837b29
120157fd2a
8e67598aa6
ad50bf4b25
cf6ff18246
14e3f9a1b2
3123f15138
413366e9a2
10585e035e
fb96c1e98c
8fa7357f2d
a7af4538ca
604f235937
14b8ae02e7
03d37f2441
a7c871680e
429284dc37
253a98078a
21539e6856
b522c4476f
78b6c4845a
b983ba35bd
54be8a0be2
dfc77408bd
c17ca8ef18
06ec486794
8fe8386591
a37415c31b
81653d9688
eeab52a4ff
c33afd89f5
7e9bd08f60
ae0ccb4017
739c350c19
ba8dc958a3
e221910e77
b167109ba1
602358f8a8
49a3c8662b
b0925b3878
654865e21d
c9415c19d3
4c922709b6
657061fdce
2f8844ba08
4b59f00e91
9e8744a545
e4a28e5316
0bba88df03
8437bae6ef
f48c6791b7
c2c5e0909a
1cb0cc2975
99c3cfb83c
1ece1ae829
c59e120c55
d2339d6840
b35cc93420
8cbba4622c
385da2dae2
2daf23ab0c
cbf4c05b15
d3c04b6a39
4cb3b924cd
a33ce60c66
24aecf421a
2efce05dc3
8999ec3c16
05af6da8d9
9a4548bae7
ff578cae54
22de45235c
76e8a70476
9cbc7e5f3b
27a7b070db
901cf4c52b
d0fae88114
17c3103c56
996d095c54
d65fac2738
ce4f5a29fb
baee28c46c
29e70e3e88
.buildkite/download-images.sh (new file, 18 lines)
@@ -0,0 +1,18 @@
#!/bin/bash

set -ex
set -o pipefail

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

cd -
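For a local run, the script only assumes a Debian-style host where wget/curl can be installed via apt; invoked from the repository root it leaves the assets in an images/ directory. A minimal sketch:

# sketch: fetch the LLaVA test assets locally (apt-based host assumed)
bash .buildkite/download-images.sh
ls images/   # stop_sign.jpg, cherry_blossom.jpg and the *.pt tensors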
.buildkite/run-amd-test.sh (new file, 38 lines)
@@ -0,0 +1,38 @@
# This script builds the ROCm docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Print ROCm version
rocminfo

# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .

# Set up cleanup
remove_docker_container() { docker rm -f rocm || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &

# Wait for the server to start
wait_for_server_to_start() {
    timeout=300
    counter=0

    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
            echo "Timeout after $timeout seconds"
            break
        fi
    done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'
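The readiness loop above can also be written with `timeout`, mirroring the pattern the benchmark script below uses against `/v1/models`; a minimal sketch against the same `/health` endpoint:

# sketch: give up after 300 seconds if the server never reports healthy
timeout 300 bash -c 'until curl -sf localhost:8000/health; do sleep 1; done'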
@@ -23,8 +23,9 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
    --backend openai \
    --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
    --backend vllm \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
    --model meta-llama/Llama-2-7b-chat-hf \
    --num-prompts 20 \
    --endpoint /v1/completions \
@@ -48,7 +49,9 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
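Outside CI, the same serving benchmark can be reproduced once an OpenAI-compatible vLLM server is running locally; the flags below simply mirror the CI invocation above (20 prompts against the Llama-2 chat model):

python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --dataset-name sharegpt \
    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
    --model meta-llama/Llama-2-7b-chat-hf \
    --num-prompts 20 \
    --endpoint /v1/completions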
@@ -12,45 +12,73 @@ steps:
  command: pytest -v -s async_engine

- label: Basic Correctness Test
  command: pytest -v -s --forked basic_correctness
  command: pytest -v -s basic_correctness

- label: Core Test
  command: pytest -v -s core

- label: Distributed Comm Ops Test
  command: pytest -v -s --forked test_comm_ops.py
  command: pytest -v -s test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Distributed Correctness Test
  command: pytest -v -s --forked test_basic_distributed_correctness.py
- label: Distributed Tests
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.
  commands:
  - pytest -v -s test_pynccl.py
  - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py

- label: Engine Test
  command: pytest -v -s engine
  command: pytest -v -s engine tokenization test_sequence.py test_config.py

- label: Entrypoints Test
  command: pytest -v -s entrypoints

- label: Kernels Test
  command: pytest -v -s kernels
  soft_fail: true
- label: Examples Test
  working_dir: "/vllm-workspace/examples"
  commands:
  # install aws cli for llava_example.py
  - pip install awscli
  - python3 offline_inference.py
  - python3 offline_inference_with_prefix.py
  - python3 llm_engine_example.py
  - python3 llava_example.py

- label: Kernels Test %N
  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Models Test
  commands:
  - pytest -v -s models --forked
  soft_fail: true
  - bash ../.buildkite/download-images.sh
  - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py

- label: Llava Test
  commands:
  - bash ../.buildkite/download-images.sh
  - pytest -v -s models/test_llava.py

- label: Prefix Caching Test
  commands:
  - pytest -v -s prefix_caching

- label: Samplers Test
  command: pytest -v -s samplers --forked
  command: pytest -v -s samplers

- label: LogitsProcessor Test
  command: pytest -v -s test_logits_processor.py

- label: Worker Test
  command: pytest -v -s worker

- label: LoRA Test
  command: pytest -v -s lora --forked
- label: Speculative decoding tests
  command: pytest -v -s spec_decode

- label: LoRA Test %N
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Metrics Test
  command: pytest -v -s metrics
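The sharded `%N` steps above pass Buildkite's parallel-job variables straight to pytest's --shard-id/--num-shards options; a single shard can be emulated locally (shard numbers here are illustrative):

cd tests
pytest -v -s kernels --shard-id=0 --num-shards=4   # runs one of the four kernel shards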
@@ -3,6 +3,11 @@
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: "AMD Test"
    agents:
      queue: amd
    command: bash .buildkite/run-amd-test.sh

  - label: ":docker: build image"
    commands:
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
@@ -20,6 +25,9 @@ steps:
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
@@ -45,6 +53,8 @@ steps:
            nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
          {% endif %}
          env:
            - name: VLLM_USAGE_SOURCE
              value: ci-test
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
.github/ISSUE_TEMPLATE/100-documentation.yml (new file, 22 lines)
@@ -0,0 +1,22 @@
name: 📚 Documentation
description: Report an issue related to https://docs.vllm.ai/
title: "[Doc]: "
labels: ["documentation"]

body:
- type: textarea
  attributes:
    label: 📚 The doc issue
    description: >
      A clear and concise description of what content in https://docs.vllm.ai/ is an issue.
  validations:
    required: true
- type: textarea
  attributes:
    label: Suggest a potential alternative/fix
    description: >
      Tell us how we could improve the documentation in this regard.
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
39
.github/ISSUE_TEMPLATE/200-installation.yml
vendored
Normal file
@ -0,0 +1,39 @@
|
||||
name: 🛠️ Installation
|
||||
description: Report an issue here when you hit errors during installation.
|
||||
title: "[Installation]: "
|
||||
labels: ["installation"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Your current environment
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
value: |
|
||||
```text
|
||||
The output of `python collect_env.py`
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: How you are installing vllm
|
||||
description: |
|
||||
Paste the full command you are trying to execute.
|
||||
value: |
|
||||
```sh
|
||||
pip install -vvv vllm
|
||||
```
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
37
.github/ISSUE_TEMPLATE/300-usage.yml
vendored
Normal file
@ -0,0 +1,37 @@
|
||||
name: 💻 Usage
|
||||
description: Raise an issue here if you don't know how to use vllm.
|
||||
title: "[Usage]: "
|
||||
labels: ["usage"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Your current environment
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
value: |
|
||||
```text
|
||||
The output of `python collect_env.py`
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: How would you like to use vllm
|
||||
description: |
|
||||
A detailed description of how you want to use vllm.
|
||||
value: |
|
||||
I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm.
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
81
.github/ISSUE_TEMPLATE/400-bug report.yml
vendored
Normal file
@ -0,0 +1,81 @@
|
||||
name: 🐛 Bug report
|
||||
description: Raise an issue here if you find a bug.
|
||||
title: "[Bug]: "
|
||||
labels: ["bug"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Your current environment
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
value: |
|
||||
```text
|
||||
The output of `python collect_env.py`
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 🐛 Describe the bug
|
||||
description: |
|
||||
Please provide a clear and concise description of what the bug is.
|
||||
|
||||
If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
llm = LLM(model="facebook/opt-125m")
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
# Print the outputs.
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
|
||||
|
||||
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
|
||||
placeholder: |
|
||||
A clear and concise description of what the bug is.
|
||||
|
||||
```python
|
||||
# Sample code to reproduce the problem
|
||||
```
|
||||
|
||||
```
|
||||
The error message you got, with the full traceback.
|
||||
```
|
||||
validations:
|
||||
required: true
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
|
||||
|
||||
- Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
|
||||
|
||||
- If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
|
||||
|
||||
Thanks for contributing 🎉!
|
31
.github/ISSUE_TEMPLATE/500-feature request.yml
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
name: 🚀 Feature request
|
||||
description: Submit a proposal/request for a new vllm feature
|
||||
title: "[Feature]: "
|
||||
labels: ["feature request"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: 🚀 The feature, motivation and pitch
|
||||
description: >
|
||||
A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Alternatives
|
||||
description: >
|
||||
A description of any alternative solutions or features you've considered, if any.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Additional context
|
||||
description: >
|
||||
Add any other context or screenshots about the feature request.
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
33
.github/ISSUE_TEMPLATE/600-new model.yml
vendored
Normal file
@ -0,0 +1,33 @@
|
||||
name: 🤗 Support request for a new model from huggingface
|
||||
description: Submit a proposal/request for a new model from huggingface
|
||||
title: "[New Model]: "
|
||||
labels: ["new model"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
|
||||
#### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: The model to consider.
|
||||
description: >
|
||||
A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 .
|
||||
validations:
|
||||
required: true
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: The closest model vllm already supports.
|
||||
description: >
|
||||
Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for?
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: What's your difficulty of supporting the model you want?
|
||||
description: >
|
||||
For example, any new operators or new architecture?
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
51
.github/ISSUE_TEMPLATE/700-performance discussion.yml
vendored
Normal file
@ -0,0 +1,51 @@
|
||||
name: ⚡ Discussion on the performance of vllm
|
||||
description: Submit a proposal/discussion about the performance of vllm
|
||||
title: "[Performance]: "
|
||||
labels: ["performance"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Proposal to improve performance
|
||||
description: >
|
||||
How do you plan to improve vllm's performance?
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Report of performance regression
|
||||
description: >
|
||||
Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks .
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Misc discussion on performance
|
||||
description: >
|
||||
Anything about the performance.
|
||||
validations:
|
||||
required: false
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Your current environment (if you think it is necessary)
|
||||
description: |
|
||||
Please run the following and paste the output below.
|
||||
```sh
|
||||
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
|
||||
# For security purposes, please feel free to check the contents of collect_env.py before running it.
|
||||
python collect_env.py
|
||||
```
|
||||
value: |
|
||||
```text
|
||||
The output of `python collect_env.py`
|
||||
```
|
||||
validations:
|
||||
required: false
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
21
.github/ISSUE_TEMPLATE/800-misc discussion.yml
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
name: 🎲 Misc/random discussions that do not fit into the above categories.
|
||||
description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
|
||||
title: "[Misc]: "
|
||||
labels: ["misc"]
|
||||
|
||||
body:
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Anything you want to discuss about vllm.
|
||||
description: >
|
||||
Anything you want to discuss about vllm.
|
||||
validations:
|
||||
required: true
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: >
|
||||
Thanks for contributing 🎉!
|
.github/ISSUE_TEMPLATE/config.yml (new file, 1 line)
@@ -0,0 +1 @@
blank_issues_enabled: false
64
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@ -0,0 +1,64 @@
|
||||
FILL IN THE PR DESCRIPTION HERE
|
||||
|
||||
FIX #xxxx (*link existing issues this PR will resolve*)
|
||||
|
||||
**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**
|
||||
|
||||
---
|
||||
|
||||
<details>
|
||||
<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
|
||||
<summary><b> PR Checklist (Click to Expand) </b></summary>
|
||||
|
||||
<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
|
||||
|
||||
<h3>PR Title and Classification</h3>
|
||||
<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
|
||||
<ul>
|
||||
<li><code>[Bugfix]</code> for bug fixes.</li>
|
||||
<li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
|
||||
<li><code>[Doc]</code> for documentation fixes and improvements.</li>
|
||||
<li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
|
||||
<li><code>[Frontend]</code> For changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.) </li>
|
||||
<li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
|
||||
<li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
|
||||
<li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
|
||||
<li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
|
||||
</ul>
|
||||
<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>
|
||||
|
||||
<h3>Code Quality</h3>
|
||||
|
||||
<p>The PR needs to meet the following code quality standards:</p>
|
||||
|
||||
<ul>
|
||||
<li>We adhere to <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
|
||||
<li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
|
||||
<li>The code needs to be well-documented to ensure future contributors can easily understand the code.</li>
|
||||
<li>Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.</li>
|
||||
<li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
|
||||
</ul>
|
||||
|
||||
<h3>Notes for Large Changes</h3>
|
||||
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>
|
||||
|
||||
<h3>What to Expect for the Reviews</h3>
|
||||
|
||||
<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process: </p>
|
||||
|
||||
<ul>
|
||||
<li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
|
||||
<li> After the PR is assigned, the reviewer will provide status updates every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
|
||||
<li> After the review, the reviewer will put an <code> action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
|
||||
<li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<h3>Thank You</h3>
|
||||
|
||||
<p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>
|
||||
|
||||
|
||||
</details>
|
||||
|
||||
|
9
.github/workflows/ruff.yml
vendored
@ -25,10 +25,13 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
|
||||
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
|
||||
- name: Analysing the code with ruff
|
||||
run: |
|
||||
ruff vllm tests
|
||||
ruff .
|
||||
- name: Spelling check with codespell
|
||||
run: |
|
||||
codespell --toml pyproject.toml
|
||||
codespell --toml pyproject.toml
|
||||
- name: Run isort
|
||||
run: |
|
||||
isort . --check-only
|
||||
|
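The same checks can be reproduced locally before pushing; the versions are pinned to match the workflow, and format.sh wraps the formatters as well:

pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
ruff .
codespell --toml pyproject.toml
isort . --check-only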
.yapfignore (new file, 1 line)
@@ -0,0 +1 @@
collect_env.py
286
CMakeLists.txt
Normal file
@ -0,0 +1,286 @@
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
|
||||
project(vllm_extensions LANGUAGES CXX)
|
||||
|
||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||
|
||||
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
||||
|
||||
#
|
||||
# Supported python versions. These versions will be searched in order, the
|
||||
# first match will be selected. These should be kept in sync with setup.py.
|
||||
#
|
||||
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
|
||||
|
||||
# Supported NVIDIA architectures.
|
||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
|
||||
|
||||
# Supported AMD GPU architectures.
|
||||
set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
|
||||
|
||||
#
|
||||
# Supported/expected torch versions for CUDA/ROCm.
|
||||
#
|
||||
# Currently, having an incorrect pytorch version results in a warning
|
||||
# rather than an error.
|
||||
#
|
||||
# Note: the CUDA torch version is derived from pyproject.toml and various
|
||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||
# versions are derived from Dockerfile.rocm
|
||||
#
|
||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
|
||||
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
|
||||
|
||||
#
|
||||
# Try to find python package with an executable that exactly matches
|
||||
# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions.
|
||||
#
|
||||
if (VLLM_PYTHON_EXECUTABLE)
|
||||
find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
|
||||
else()
|
||||
message(FATAL_ERROR
|
||||
"Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
|
||||
" before running cmake configure.")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Update cmake's `CMAKE_PREFIX_PATH` with torch location.
|
||||
#
|
||||
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
|
||||
|
||||
# Ensure the 'nvcc' command is in the PATH
|
||||
find_program(NVCC_EXECUTABLE nvcc)
|
||||
if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
|
||||
message(FATAL_ERROR "nvcc not found")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Import torch cmake configuration.
|
||||
# Torch also imports CUDA (and partially HIP) languages with some customizations,
|
||||
# so there is no need to do this explicitly with check_language/enable_language,
|
||||
# etc.
|
||||
#
|
||||
find_package(Torch REQUIRED)
|
||||
|
||||
#
|
||||
# Normally `torch.utils.cpp_extension.CUDAExtension` would add
|
||||
# `libtorch_python.so` for linking against an extension. Torch's cmake
|
||||
# configuration does not include this library (presumably since the cmake
|
||||
# config is used for standalone C++ binaries that link against torch).
|
||||
# The `libtorch_python.so` library defines some of the glue code between
|
||||
# torch/python via pybind and is required by VLLM extensions for this
|
||||
# reason. So, add it manually with `find_library` using torch's
|
||||
# installed library path.
|
||||
#
|
||||
find_library(torch_python_LIBRARY torch_python PATHS
|
||||
"${TORCH_INSTALL_PREFIX}/lib")
|
||||
|
||||
#
|
||||
# Set up GPU language and check the torch version and warn if it isn't
|
||||
# what is expected.
|
||||
#
|
||||
if (NOT HIP_FOUND AND CUDA_FOUND)
|
||||
set(VLLM_GPU_LANG "CUDA")
|
||||
|
||||
if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
|
||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
|
||||
"expected for CUDA build, saw ${Torch_VERSION} instead.")
|
||||
endif()
|
||||
elseif(HIP_FOUND)
|
||||
set(VLLM_GPU_LANG "HIP")
|
||||
|
||||
# Importing torch recognizes and sets up some HIP/ROCm configuration but does
|
||||
# not let cmake recognize .hip files. In order to get cmake to understand the
|
||||
# .hip extension automatically, HIP must be enabled explicitly.
|
||||
enable_language(HIP)
|
||||
|
||||
# ROCm 5.x
|
||||
if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
|
||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
|
||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
|
||||
"expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
|
||||
endif()
|
||||
|
||||
# ROCm 6.x
|
||||
if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
|
||||
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
|
||||
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
|
||||
"expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
|
||||
endif()
|
||||
else()
|
||||
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Override the GPU architectures detected by cmake/torch and filter them by
|
||||
# the supported versions for the current language.
|
||||
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
|
||||
#
|
||||
override_gpu_arches(VLLM_GPU_ARCHES
|
||||
${VLLM_GPU_LANG}
|
||||
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
|
||||
|
||||
#
|
||||
# Query torch for additional GPU compilation flags for the given
|
||||
# `VLLM_GPU_LANG`.
|
||||
# The final set of arches is stored in `VLLM_GPU_FLAGS`.
|
||||
#
|
||||
get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
|
||||
|
||||
#
|
||||
# Set nvcc parallelism.
|
||||
#
|
||||
if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Define extension targets
|
||||
#
|
||||
|
||||
#
|
||||
# _C extension
|
||||
#
|
||||
|
||||
set(VLLM_EXT_SRC
|
||||
"csrc/cache_kernels.cu"
|
||||
"csrc/attention/attention_kernels.cu"
|
||||
"csrc/pos_encoding_kernels.cu"
|
||||
"csrc/activation_kernels.cu"
|
||||
"csrc/layernorm_kernels.cu"
|
||||
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
|
||||
"csrc/quantization/gptq/q_gemm.cu"
|
||||
"csrc/cuda_utils_kernels.cu"
|
||||
"csrc/moe_align_block_size_kernels.cu"
|
||||
"csrc/pybind.cpp")
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
list(APPEND VLLM_EXT_SRC
|
||||
"csrc/quantization/awq/gemm_kernels.cu"
|
||||
"csrc/quantization/marlin/marlin_cuda_kernel.cu"
|
||||
"csrc/custom_all_reduce.cu")
|
||||
endif()
|
||||
|
||||
define_gpu_extension_target(
|
||||
_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE ${VLLM_GPU_LANG}
|
||||
SOURCES ${VLLM_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
WITH_SOABI)
|
||||
|
||||
#
|
||||
# _moe_C extension
|
||||
#
|
||||
|
||||
set(VLLM_MOE_EXT_SRC
|
||||
"csrc/moe/moe_ops.cpp"
|
||||
"csrc/moe/topk_softmax_kernels.cu")
|
||||
|
||||
define_gpu_extension_target(
|
||||
_moe_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE ${VLLM_GPU_LANG}
|
||||
SOURCES ${VLLM_MOE_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_GPU_ARCHES}
|
||||
WITH_SOABI)
|
||||
|
||||
#
|
||||
# _punica_C extension
|
||||
#
|
||||
|
||||
set(VLLM_PUNICA_EXT_SRC
|
||||
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
|
||||
"csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
|
||||
"csrc/punica/punica_ops.cc")
|
||||
|
||||
#
|
||||
# Copy GPU compilation flags+update for punica
|
||||
#
|
||||
set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
|
||||
list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
|
||||
"-D__CUDA_NO_HALF_OPERATORS__"
|
||||
"-D__CUDA_NO_HALF_CONVERSIONS__"
|
||||
"-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
|
||||
"-D__CUDA_NO_HALF2_OPERATORS__")
|
||||
|
||||
#
|
||||
# Filter out CUDA architectures < 8.0 for punica.
|
||||
#
|
||||
if (${VLLM_GPU_LANG} STREQUAL "CUDA")
|
||||
set(VLLM_PUNICA_GPU_ARCHES)
|
||||
foreach(ARCH ${VLLM_GPU_ARCHES})
|
||||
string_to_ver(CODE_VER ${ARCH})
|
||||
if (CODE_VER GREATER_EQUAL 8.0)
|
||||
list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
|
||||
endif()
|
||||
endforeach()
|
||||
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
|
||||
endif()
|
||||
|
||||
if (VLLM_PUNICA_GPU_ARCHES)
|
||||
define_gpu_extension_target(
|
||||
_punica_C
|
||||
DESTINATION vllm
|
||||
LANGUAGE ${VLLM_GPU_LANG}
|
||||
SOURCES ${VLLM_PUNICA_EXT_SRC}
|
||||
COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
|
||||
ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
|
||||
WITH_SOABI)
|
||||
else()
|
||||
message(WARNING "Unable to create _punica_C target because none of the "
|
||||
"requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Add the `default` target which detects which extensions should be
|
||||
# built based on platform/architecture. This is the same logic that
|
||||
# setup.py uses to select which extensions should be built and should
|
||||
# be kept in sync.
|
||||
#
|
||||
# The `default` target makes direct use of cmake easier since knowledge
|
||||
# of which extensions are supported has been factored in, e.g.
|
||||
#
|
||||
# mkdir build && cd build
|
||||
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
|
||||
# cmake --build . --target default
|
||||
#
|
||||
add_custom_target(default)
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
|
||||
message(STATUS "Enabling C extension.")
|
||||
add_dependencies(default _C)
|
||||
endif()
|
||||
|
||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
message(STATUS "Enabling moe extension.")
|
||||
add_dependencies(default _moe_C)
|
||||
|
||||
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
|
||||
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
|
||||
# there are supported target arches.
|
||||
if (VLLM_PUNICA_GPU_ARCHES AND
|
||||
(ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
|
||||
message(STATUS "Enabling punica extension.")
|
||||
add_dependencies(default _punica_C)
|
||||
endif()
|
||||
endif()
|
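As the comments in CMakeLists.txt note, the `default` target makes a standalone cmake build straightforward; a sketch assuming Ninja and a CUDA or ROCm toolchain are available:

mkdir build && cd build
cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
cmake --build . --target default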
@@ -45,31 +45,9 @@ pytest tests/
If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
If not, please file a new issue, providing as much relevant information as possible.

### Coding Style Guide
### Pull Requests & Code Reviews

In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).

We include a formatting script [`format.sh`](./format.sh) to format the code.

### Pull Requests

When submitting a pull request:

1. Make sure your code has been rebased on top of the latest commit on the main branch.
2. Ensure code is properly formatted by running [`format.sh`](./format.sh).
3. Include a detailed description of the changes in the pull request.
Explain why you made the changes you did.
If your pull request fixes an open issue, please include a reference to it in the description.

### Code Reviews

All submissions, including submissions by project members, require a code review.
To make the review process as smooth as possible, please:

1. Keep your changes as concise as possible.
If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests.
2. Respond to all comments within a reasonable time frame.
If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for a detailed contribution guide.

### Thank You
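In practice the pre-PR loop referenced above boils down to formatting and running the test suite; a minimal sketch:

bash format.sh        # apply the project formatters
pytest tests/         # run the test suite referenced at the top of this section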
Dockerfile (39 lines changed)
@@ -35,9 +35,14 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

# copy input files
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py
@@ -54,9 +59,27 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

RUN python3 setup.py build_ext --inplace
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################

#################### FLASH_ATTENTION Build IMAGE ####################
FROM dev as flash-attn-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.6
ENV FLASH_ATTN_VERSION=${flash_attn_version}

WORKDIR /usr/src/flash-attention-v2

# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
    --no-build-isolation --no-deps --no-cache-dir

#################### FLASH_ATTENTION Build IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
@@ -68,6 +91,9 @@ WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
# ignore build dependencies installation because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
@@ -76,7 +102,7 @@ RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose

#################### RUNTIME BASE IMAGE ####################
# We used base cuda image because pytorch installs its own cuda libraries.
# However cupy depends on cuda libraries so we had to switch to the runtime image
# However pynccl depends on cuda libraries so we had to switch to the runtime image
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base

@@ -88,6 +114,11 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# Install flash attention (from pre-built wheel)
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir

#################### RUNTIME BASE IMAGE ####################


@@ -96,10 +127,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate
    pip install accelerate hf_transfer modelscope

COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
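Because the pip and ccache cache mounts above require BuildKit, a local build of the OpenAI server image might look like the following; the tag and job count are illustrative:

DOCKER_BUILDKIT=1 docker build . \
    --target vllm-openai --tag vllm-openai:dev \
    --build-arg max_jobs=8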
@@ -70,16 +70,16 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
    && cd ..; \
    fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps

# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps

RUN cd /app \
    && cd vllm \
    && pip install -U -r requirements-rocm.txt \
@@ -90,6 +90,6 @@ RUN cd /app \
    && cd ..

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all]
RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3

CMD ["/bin/bash"]
@@ -1,4 +1,6 @@
include LICENSE
include requirements.txt
include CMakeLists.txt

recursive-include cmake *
recursive-include csrc *
README.md (14 lines changed)
@@ -16,6 +16,15 @@ Easy, fast, and cheap LLM serving for everyone

---

**The Third vLLM Bay Area Meetup (April 2nd 6pm-8:30pm PT)**

We are thrilled to announce our third vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from Roblox coming up to the stage to discuss their experience in deploying LLMs with vLLM.
Please register [here](https://robloxandvllmmeetup2024.splashthat.com/) and join us!

---

*Latest News* 🔥
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
@@ -58,6 +67,8 @@ vLLM seamlessly supports many Hugging Face models, including the following architectures:
- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.)
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
@@ -67,6 +78,7 @@ vLLM seamlessly supports many Hugging Face models, including the following architectures:
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
@@ -77,8 +89,10 @@ vLLM seamlessly supports many Hugging Face models, including the following architectures:
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
- StableLM (`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Starcoder2 (`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)

Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
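For completeness, the pip route referred to here is the standard one-liner, with source builds documented at the link above:

pip install vllm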
@ -1,8 +1,10 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
import traceback
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
|
||||
import aiohttp
|
||||
from tqdm.asyncio import tqdm
|
||||
@ -26,8 +28,11 @@ class RequestFuncOutput:
|
||||
generated_text: str = ""
|
||||
success: bool = False
|
||||
latency: float = 0
|
||||
ttft: float = 0
|
||||
ttft: float = 0 # Time to first token
|
||||
itl: List[float] = field(
|
||||
default_factory=list) # List of inter-token latencies
|
||||
prompt_len: int = 0
|
||||
error: str = ""
|
||||
|
||||
|
||||
async def async_request_tgi(
|
||||
@ -55,71 +60,38 @@ async def async_request_tgi(
|
||||
|
||||
ttft = 0
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(url=api_url, json=payload) as response:
|
||||
if response.status == 200:
|
||||
async for data in response.content.iter_any():
|
||||
async for chunk in response.content:
|
||||
chunk = chunk.strip()
|
||||
if not chunk:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk.decode("utf-8"), "data:")
|
||||
|
||||
data = json.loads(chunk)
|
||||
timestamp = time.perf_counter()
|
||||
# First token
|
||||
if ttft == 0:
|
||||
ttft = time.perf_counter() - st
|
||||
output.ttft = ttft
|
||||
output.latency = time.perf_counter() - st
|
||||
|
||||
body = data.decode("utf-8").lstrip("data:")
|
||||
output.generated_text = json.loads(body)["generated_text"]
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
output.latency = most_recent_timestamp - st
|
||||
output.success = True
|
||||
else:
|
||||
output.success = False
|
||||
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
|
||||
output.success = False
|
||||
|
||||
if pbar:
|
||||
pbar.update(1)
|
||||
return output
|
||||
|
||||
|
||||
async def async_request_vllm(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("generate")
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
payload = {
|
||||
"prompt": request_func_input.prompt,
|
||||
"n": 1,
|
||||
"best_of": request_func_input.best_of,
|
||||
"use_beam_search": request_func_input.use_beam_search,
|
||||
"temperature": 0.0 if request_func_input.use_beam_search else 1.0,
|
||||
"top_p": 1.0,
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"ignore_eos": True,
|
||||
"stream": True,
|
||||
}
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
|
||||
ttft = 0
|
||||
st = time.perf_counter()
|
||||
try:
|
||||
async with session.post(url=api_url, json=payload) as response:
|
||||
if response.status == 200:
|
||||
async for data in response.content.iter_any():
|
||||
if ttft == 0:
|
||||
ttft = time.perf_counter() - st
|
||||
output.ttft = ttft
|
||||
output.latency = time.perf_counter() - st
|
||||
|
||||
# When streaming, '\0' is appended to the end of the response.
|
||||
body = data.decode("utf-8").strip("\0")
|
||||
output.generated_text = json.loads(
|
||||
body)["text"][0][len(request_func_input.prompt):]
|
||||
output.success = True
|
||||
|
||||
else:
|
||||
output.success = False
|
||||
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
|
||||
output.generated_text = data["generated_text"]
|
||||
except Exception:
|
||||
output.success = False
|
||||
exc_info = sys.exc_info()
|
||||
output.error = "".join(traceback.format_exception(*exc_info))
|
||||
|
||||
if pbar:
|
||||
pbar.update(1)
|
||||
@ -146,26 +118,45 @@ async def async_request_trt_llm(
|
||||
}
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
ttft = 0
|
||||
|
||||
ttft = 0
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(url=api_url, json=payload) as resp:
|
||||
if resp.status == 200:
|
||||
async for data in resp.content.iter_any():
|
||||
async with session.post(url=api_url, json=payload) as response:
|
||||
if response.status == 200:
|
||||
async for chunk in response.content:
|
||||
chunk = chunk.strip()
|
||||
if not chunk:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk.decode("utf-8"), "data:")
|
||||
|
||||
data = json.loads(chunk)
|
||||
timestamp = time.perf_counter()
|
||||
# First token
|
||||
if ttft == 0:
|
||||
ttft = time.perf_counter() - st
|
||||
output.ttft = ttft
|
||||
output.latency = time.perf_counter() - st
|
||||
|
||||
body = data.decode("utf-8").lstrip("data:")
|
||||
output.generated_text = json.loads(body)["text_output"]
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
output.latency = most_recent_timestamp - st
|
||||
output.generated_text = json.loads(data)["text_output"]
|
||||
output.success = True
|
||||
|
||||
else:
|
||||
output.error = response.reason
|
||||
output.success = False
|
||||
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
|
||||
except Exception:
|
||||
output.success = False
|
||||
exc_info = sys.exc_info()
|
||||
output.error = "".join(traceback.format_exception(*exc_info))
|
||||
|
||||
if pbar:
|
||||
pbar.update(1)
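The streaming loop above records time-to-first-token (TTFT) once and then appends one inter-token-latency (ITL) sample per later chunk, with the most recent timestamp carried forward. A minimal sketch of that bookkeeping in isolation, with sleeps standing in for awaited chunks (all values illustrative):

import time

itl, ttft = [], 0.0
st = most_recent_timestamp = time.perf_counter()
for _ in range(3):                       # pretend three streamed chunks arrive
    time.sleep(0.01)                     # stand-in for awaiting the next chunk
    timestamp = time.perf_counter()
    if ttft == 0.0:                      # first chunk: record TTFT only
        ttft = timestamp - st
    else:                                # later chunks: record inter-token latency
        itl.append(timestamp - most_recent_timestamp)
    most_recent_timestamp = timestamp
latency = most_recent_timestamp - st     # total latency, as in the hunk above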
|
||||
@ -181,34 +172,35 @@ async def async_request_deepspeed_mii(
|
||||
assert not request_func_input.use_beam_search
|
||||
|
||||
payload = {
|
||||
"prompts": request_func_input.prompt,
|
||||
"max_new_tokens": request_func_input.output_len,
|
||||
"ignore_eos": True,
|
||||
"do_sample": True,
|
||||
"temperature":
|
||||
0.01, # deepspeed-mii does not accept 0.0 temperature.
|
||||
"prompt": request_func_input.prompt,
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
||||
"top_p": 1.0,
|
||||
}
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
|
||||
# DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder.
|
||||
# https://github.com/microsoft/DeepSpeed-MII/pull/311
|
||||
# NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
|
||||
# will use 0 as placeholder.
|
||||
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
|
||||
output.ttft = 0
|
||||
|
||||
st = time.perf_counter()
|
||||
try:
|
||||
async with session.post(url=request_func_input.api_url,
|
||||
json=payload) as resp:
|
||||
if resp.status == 200:
|
||||
parsed_resp = await resp.json()
|
||||
json=payload) as response:
|
||||
if response.status == 200:
|
||||
parsed_resp = await response.json()
|
||||
output.latency = time.perf_counter() - st
|
||||
output.generated_text = parsed_resp[0]["generated_text"]
|
||||
output.generated_text = parsed_resp["text"][0]
|
||||
output.success = True
|
||||
else:
|
||||
output.error = response.reason
|
||||
output.success = False
|
||||
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
|
||||
except Exception:
|
||||
output.success = False
|
||||
exc_info = sys.exc_info()
|
||||
output.error = "".join(traceback.format_exception(*exc_info))
|
||||
|
||||
if pbar:
|
||||
pbar.update(1)
|
||||
@ -220,7 +212,9 @@ async def async_request_openai_completions(
|
||||
pbar: Optional[tqdm] = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith("v1/completions")
|
||||
assert api_url.endswith(
|
||||
"v1/completions"
|
||||
), "OpenAI Completions API URL must end with 'v1/completions'."
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
@ -242,43 +236,150 @@ async def async_request_openai_completions(
|
||||
generated_text = ""
|
||||
ttft = 0
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(url=api_url, json=payload,
|
||||
headers=headers) as response:
|
||||
if response.status == 200:
|
||||
async for chunk in response.content:
|
||||
if ttft == 0:
|
||||
ttft = time.perf_counter() - st
|
||||
output.ttft = ttft
|
||||
|
||||
chunk = chunk.strip()
|
||||
if not chunk:
|
||||
continue
|
||||
|
||||
chunk = chunk.decode("utf-8").lstrip("data: ")
|
||||
chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
|
||||
if chunk == "[DONE]":
|
||||
latency = time.perf_counter() - st
|
||||
else:
|
||||
body = json.loads(chunk)
|
||||
generated_text += body["choices"][0]["text"]
|
||||
data = json.loads(chunk)
|
||||
|
||||
if data["choices"][0]["text"]:
|
||||
timestamp = time.perf_counter()
|
||||
# First token
|
||||
if ttft == 0:
|
||||
ttft = time.perf_counter() - st
|
||||
output.ttft = ttft
|
||||
|
||||
# Decoding phase
|
||||
# NOTE: Some completion API might have a last
|
||||
# usage summary response without a token so we
|
||||
# do not want to include as inter-token-latency
|
||||
elif data.get("usage", None) is None:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
generated_text += data["choices"][0]["text"]
|
||||
|
||||
output.generated_text = generated_text
|
||||
output.success = True
|
||||
output.latency = latency
|
||||
else:
|
||||
output.success = False
|
||||
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
|
||||
except Exception:
|
||||
output.success = False
|
||||
exc_info = sys.exc_info()
|
||||
output.error = "".join(traceback.format_exception(*exc_info))
|
||||
|
||||
if pbar:
|
||||
pbar.update(1)
|
||||
return output
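The Completions handler above consumes server-sent events: each chunk is a "data: "-prefixed JSON line, and the stream ends with a literal "[DONE]". A self-contained sketch of that parsing for a single chunk (the payload contents are illustrative, not taken from a real server):

import json

chunk = b'data: {"choices": [{"text": " Hello"}]}\n\n'      # one illustrative SSE chunk
line = chunk.strip().decode("utf-8")
if line.startswith("data: "):
    line = line[len("data: "):]                              # what remove_prefix (below) does
if line != "[DONE]":
    token_text = json.loads(line)["choices"][0]["text"]      # " Hello"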
|
||||
|
||||
|
||||
async def async_request_openai_chat_completions(
|
||||
request_func_input: RequestFuncInput,
|
||||
pbar: Optional[tqdm] = None,
|
||||
) -> RequestFuncOutput:
|
||||
api_url = request_func_input.api_url
|
||||
assert api_url.endswith(
|
||||
"v1/chat/completions"
|
||||
), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
|
||||
|
||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
||||
assert not request_func_input.use_beam_search
|
||||
payload = {
|
||||
"model": request_func_input.model,
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": request_func_input.prompt,
|
||||
},
|
||||
],
|
||||
"temperature": 0.0,
|
||||
"max_tokens": request_func_input.output_len,
|
||||
"stream": True,
|
||||
}
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
|
||||
}
|
||||
|
||||
output = RequestFuncOutput()
|
||||
output.prompt_len = request_func_input.prompt_len
|
||||
|
||||
generated_text = ""
|
||||
ttft = 0
|
||||
st = time.perf_counter()
|
||||
most_recent_timestamp = st
|
||||
try:
|
||||
async with session.post(url=api_url, json=payload,
|
||||
headers=headers) as response:
|
||||
if response.status == 200:
|
||||
async for chunk in response.content:
|
||||
chunk = chunk.strip()
|
||||
if not chunk:
|
||||
continue
|
||||
|
||||
chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
|
||||
if chunk == "[DONE]":
|
||||
latency = time.perf_counter() - st
|
||||
else:
|
||||
timestamp = time.perf_counter()
|
||||
data = json.loads(chunk)
|
||||
|
||||
if "content" in data["choices"][0]["delta"]:
|
||||
# First token
|
||||
if ttft == 0:
|
||||
ttft = time.perf_counter() - st
|
||||
output.ttft = ttft
|
||||
|
||||
# Decoding phase
|
||||
else:
|
||||
output.itl.append(timestamp -
|
||||
most_recent_timestamp)
|
||||
|
||||
generated_text += data["choices"][0]["delta"][
|
||||
"content"]
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
|
||||
output.generated_text = generated_text
|
||||
output.success = True
|
||||
output.latency = latency
|
||||
else:
|
||||
output.error = response.reason
|
||||
output.success = False
|
||||
except Exception:
|
||||
output.success = False
|
||||
exc_info = sys.exc_info()
|
||||
output.error = "".join(traceback.format_exception(*exc_info))
|
||||
|
||||
if pbar:
|
||||
pbar.update(1)
|
||||
return output
|
||||
|
||||
|
||||
# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
|
||||
# introduced in Python 3.9
|
||||
def remove_prefix(text: str, prefix: str) -> str:
|
||||
if text.startswith(prefix):
|
||||
return text[len(prefix):]
|
||||
return text
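remove_prefix mirrors str.removeprefix (Python 3.9+) so the benchmark still runs on Python 3.8; unlike str.lstrip, which removes a set of characters, it strips the prefix at most once and only as a whole string. For example:

remove_prefix("data: [DONE]", "data: ")   # -> "[DONE]"
remove_prefix("[DONE]", "data: ")         # -> "[DONE]" (unchanged, no prefix present)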
|
||||
|
||||
|
||||
ASYNC_REQUEST_FUNCS = {
|
||||
"tgi": async_request_tgi,
|
||||
"vllm": async_request_vllm,
|
||||
"vllm": async_request_openai_completions,
|
||||
"lmdeploy": async_request_openai_completions,
|
||||
"deepspeed-mii": async_request_deepspeed_mii,
|
||||
"openai": async_request_openai_completions,
|
||||
"openai-chat": async_request_openai_chat_completions,
|
||||
"tensorrt-llm": async_request_trt_llm,
|
||||
}
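ASYNC_REQUEST_FUNCS is the registry the serving benchmark dispatches through; note that the "vllm" and "lmdeploy" entries now reuse the OpenAI-compatible Completions client. A minimal dispatch sketch, using the RequestFuncInput/RequestFuncOutput types from this file (the _send_one helper itself is hypothetical, not part of the diff):

async def _send_one(backend: str,
                    request_func_input: RequestFuncInput) -> RequestFuncOutput:
    # Hypothetical helper: send one request through the backend registry.
    if backend not in ASYNC_REQUEST_FUNCS:
        raise ValueError(f"Unknown backend: {backend}")
    request_func = ASYNC_REQUEST_FUNCS[backend]
    return await request_func(request_func_input=request_func_input, pbar=None)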
|
||||
|
@ -16,17 +16,19 @@ def main(args: argparse.Namespace):
|
||||
|
||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||
# the engine will automatically process the request in multiple batches.
|
||||
llm = LLM(
|
||||
model=args.model,
|
||||
tokenizer=args.tokenizer,
|
||||
quantization=args.quantization,
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
dtype=args.dtype,
|
||||
enforce_eager=args.enforce_eager,
|
||||
kv_cache_dtype=args.kv_cache_dtype,
|
||||
device=args.device,
|
||||
)
|
||||
llm = LLM(model=args.model,
|
||||
tokenizer=args.tokenizer,
|
||||
quantization=args.quantization,
|
||||
tensor_parallel_size=args.tensor_parallel_size,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
dtype=args.dtype,
|
||||
enforce_eager=args.enforce_eager,
|
||||
kv_cache_dtype=args.kv_cache_dtype,
|
||||
device=args.device,
|
||||
ray_workers_use_nsight=args.ray_workers_use_nsight,
|
||||
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||
download_dir=args.download_dir,
|
||||
block_size=args.block_size)
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
n=args.n,
|
||||
@ -145,5 +147,25 @@ if __name__ == '__main__':
|
||||
default="cuda",
|
||||
choices=["cuda"],
|
||||
help='device type for vLLM execution, supporting CUDA only currently.')
|
||||
parser.add_argument('--block-size',
|
||||
type=int,
|
||||
default=16,
|
||||
help='block size of key/value cache')
|
||||
parser.add_argument(
|
||||
'--enable-chunked-prefill',
|
||||
type=bool,
|
||||
default=False,
|
||||
help='If True, the prefill requests can be chunked based on the '
|
||||
'max_num_batched_tokens')
|
||||
parser.add_argument(
|
||||
"--ray-workers-use-nsight",
|
||||
action='store_true',
|
||||
help="If specified, use nsight to profile ray workers",
|
||||
)
|
||||
parser.add_argument('--download-dir',
|
||||
type=str,
|
||||
default=None,
|
||||
help='directory to download and load the weights, '
|
||||
'default to the default cache dir of huggingface')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
52
benchmarks/benchmark_prefix_caching.py
Normal file
@ -0,0 +1,52 @@
|
||||
import argparse
|
||||
import time
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
|
||||
|
||||
|
||||
def test_prefix(llm=None, sampling_params=None, prompts=None):
|
||||
start_time = time.time()
|
||||
|
||||
llm.generate(prompts, sampling_params=sampling_params)
|
||||
|
||||
end_time = time.time()
|
||||
print(f"cost time {end_time - start_time}")
|
||||
|
||||
|
||||
def main(args):
|
||||
llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
|
||||
tokenizer_mode='auto',
|
||||
trust_remote_code=True,
|
||||
enforce_eager=True,
|
||||
enable_prefix_caching=args.enable_prefix_caching)
|
||||
|
||||
num_prompts = 100
|
||||
prompts = [PROMPT] * num_prompts
|
||||
sampling_params = SamplingParams(temperature=0, max_tokens=100)
|
||||
|
||||
print("------warm up------")
|
||||
test_prefix(
|
||||
llm=llm,
|
||||
prompts=prompts[:1],
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
print("------start generating------")
|
||||
test_prefix(
|
||||
llm=llm,
|
||||
prompts=prompts,
|
||||
sampling_params=sampling_params,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Benchmark the performance with or without automatic '
|
||||
'prefix caching.')
|
||||
parser.add_argument('--enable-prefix-caching',
|
||||
action='store_true',
|
||||
help='enable prefix caching')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
@ -1,8 +1,8 @@
|
||||
"""Benchmark online serving throughput.
|
||||
|
||||
On the server side, run one of the following commands:
|
||||
(vLLM backend)
|
||||
python -m vllm.entrypoints.api_server \
|
||||
vLLM OpenAI API server
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model <your_model> --swap-space 16 \
|
||||
--disable-log-requests
|
||||
|
||||
@ -12,28 +12,30 @@ On the server side, run one of the following commands:
|
||||
On the client side, run:
|
||||
python benchmarks/benchmark_serving.py \
|
||||
--backend <backend> \
|
||||
--tokenizer <your_model> --dataset <target_dataset> \
|
||||
--request-rate <request_rate>
|
||||
--model <your_model> \
|
||||
--dataset-name sharegpt \
|
||||
--dataset-path <path to dataset> \
|
||||
--request-rate <request_rate> \ # By default <request_rate> is inf
|
||||
--num-prompts <num_prompts> # By default <num_prompts> is 1000
|
||||
"""
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import AsyncGenerator, List, Tuple
|
||||
|
||||
import numpy as np
|
||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
||||
RequestFuncOutput)
|
||||
from tqdm.asyncio import tqdm
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
from backend_request_func import (
|
||||
ASYNC_REQUEST_FUNCS,
|
||||
RequestFuncInput,
|
||||
RequestFuncOutput,
|
||||
)
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -52,7 +54,7 @@ class BenchmarkMetrics:
|
||||
p99_tpot_ms: float
|
||||
|
||||
|
||||
def sample_requests(
|
||||
def sample_sharegpt_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
@ -100,6 +102,73 @@ def sample_requests(
|
||||
return sampled_requests
|
||||
|
||||
|
||||
def sample_sonnet_requests(
|
||||
dataset_path: str,
|
||||
num_requests: int,
|
||||
input_len: int,
|
||||
output_len: int,
|
||||
prefix_len: int,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
) -> List[Tuple[str, str, int, int]]:
|
||||
assert input_len > prefix_len, "input_len must be greater than prefix_len."
|
||||
|
||||
# Load the dataset.
|
||||
with open(dataset_path) as f:
|
||||
poem_lines = f.readlines()
|
||||
|
||||
# Tokenize the poem lines.
|
||||
poem_token_ids = tokenizer(poem_lines).input_ids
|
||||
average_poem_len = sum(
|
||||
len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
|
||||
|
||||
# Base prefix for all requests.
|
||||
base_prompt = "Pick as many lines as you can from these poem lines:\n"
|
||||
base_message = [{
|
||||
"role": "user",
|
||||
"content": base_prompt,
|
||||
}]
|
||||
base_prompt_formatted = tokenizer.apply_chat_template(
|
||||
base_message, add_generation_prompt=True, tokenize=False)
|
||||
base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
|
||||
|
||||
assert (input_len > base_prompt_offset
|
||||
), f"Please set 'args.input-len' higher than {base_prompt_offset}."
|
||||
num_input_lines = round(
|
||||
(input_len - base_prompt_offset) / average_poem_len)
|
||||
|
||||
# First approximately `prefix_len` number of tokens in the
|
||||
# prompt are fixed poem lines.
|
||||
assert (
|
||||
prefix_len > base_prompt_offset
|
||||
), f"Please set 'args.prefix-len' higher than {base_prompt_offset}."
|
||||
|
||||
num_prefix_lines = round(
|
||||
(prefix_len - base_prompt_offset) / average_poem_len)
|
||||
prefix_lines = poem_lines[:num_prefix_lines]
|
||||
|
||||
# Sample the rest of lines per request.
|
||||
sampled_requests: List[Tuple[str, int, int]] = []
|
||||
for _ in range(num_requests):
|
||||
sampled_lines = "".join(
|
||||
prefix_lines +
|
||||
random.sample(poem_lines, num_input_lines - num_prefix_lines))
|
||||
|
||||
prompt = f"{base_prompt}{sampled_lines}"
|
||||
message = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
},
|
||||
]
|
||||
prompt_formatted = tokenizer.apply_chat_template(
|
||||
message, add_generation_prompt=True, tokenize=False)
|
||||
prompt_len = len(tokenizer(prompt_formatted).input_ids)
|
||||
sampled_requests.append(
|
||||
(prompt, prompt_formatted, prompt_len, output_len))
|
||||
|
||||
return sampled_requests
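The line-count arithmetic above decides how many poem lines form the shared prefix versus the randomly sampled remainder. A worked example with the sonnet defaults (--sonnet-input-len 550, --sonnet-prefix-len 200, see the argparse section below); the average line length and base-prompt offset are assumed values for illustration:

average_poem_len = 10.0           # assumed average tokens per poem line
base_prompt_offset = 30           # assumed tokens used by the formatted base prompt
input_len, prefix_len = 550, 200  # sonnet defaults

num_input_lines = round((input_len - base_prompt_offset) / average_poem_len)    # 52
num_prefix_lines = round((prefix_len - base_prompt_offset) / average_poem_len)  # 17
# Every request shares the same 17 prefix lines and samples 52 - 17 = 35 more at random.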
|
||||
|
||||
|
||||
async def get_request(
|
||||
input_requests: List[Tuple[str, int, int]],
|
||||
request_rate: float,
|
||||
@ -122,37 +191,42 @@ def calculate_metrics(
|
||||
outputs: List[RequestFuncOutput],
|
||||
dur_s: float,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
) -> BenchmarkMetrics:
|
||||
total_output = 0
|
||||
) -> Tuple[BenchmarkMetrics, List[int]]:
|
||||
actual_output_lens = []
|
||||
total_input = 0
|
||||
completed = 0
|
||||
per_token_latencies = []
|
||||
tpots = []
|
||||
ttfts = []
|
||||
for i in range(len(outputs)):
|
||||
if outputs[i].success:
|
||||
output_len = len(tokenizer.encode(outputs[i].generated_text))
|
||||
total_output += output_len
|
||||
output_len = len(tokenizer(outputs[i].generated_text).input_ids)
|
||||
actual_output_lens.append(output_len)
|
||||
total_input += input_requests[i][1]
|
||||
per_token_latencies.append(outputs[i].latency / output_len)
|
||||
if output_len > 1:
|
||||
tpots.append(
|
||||
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
|
||||
ttfts.append(outputs[i].ttft)
|
||||
completed += 1
|
||||
else:
|
||||
actual_output_lens.append(0)
|
||||
|
||||
metrics = BenchmarkMetrics(
|
||||
completed=completed,
|
||||
total_input=total_input,
|
||||
total_output=total_output,
|
||||
total_output=sum(actual_output_lens),
|
||||
request_throughput=completed / dur_s,
|
||||
input_throughput=total_input / dur_s,
|
||||
output_throughput=total_output / dur_s,
|
||||
mean_ttft_ms=np.mean(ttfts) * 1000,
|
||||
median_ttft_ms=np.median(ttfts) * 1000,
|
||||
p99_ttft_ms=np.percentile(ttfts, 99) * 1000,
|
||||
mean_tpot_ms=np.mean(per_token_latencies) * 1000,
|
||||
median_tpot_ms=np.median(per_token_latencies) * 1000,
|
||||
p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000,
|
||||
output_throughput=sum(actual_output_lens) / dur_s,
|
||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
||||
1000, # ttfts is empty if streaming is not supported by backend
|
||||
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
||||
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
|
||||
mean_tpot_ms=np.mean(tpots) * 1000,
|
||||
median_tpot_ms=np.median(tpots) * 1000,
|
||||
p99_tpot_ms=np.percentile(tpots, 99) * 1000,
|
||||
)
|
||||
|
||||
return metrics
|
||||
return metrics, actual_output_lens
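TPOT (time per output token) above deliberately excludes the first token: it divides the post-TTFT time by output_len - 1, and requests that produced a single token contribute no TPOT sample. A tiny worked example with illustrative numbers:

latency, ttft, output_len = 2.0, 0.25, 101       # seconds, seconds, tokens (illustrative)
tpot = (latency - ttft) / (output_len - 1)       # (2.0 - 0.25) / 100 = 0.0175 s
print(f"TPOT: {tpot * 1000:.2f} ms")             # TPOT: 17.50 ms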
|
||||
|
||||
|
||||
async def benchmark(
|
||||
@ -171,10 +245,10 @@ async def benchmark(
|
||||
else:
|
||||
raise ValueError(f"Unknown backend: {backend}")
|
||||
|
||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||
|
||||
print(f"Traffic request rate: {request_rate}")
|
||||
|
||||
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
|
||||
|
||||
benchmark_start_time = time.perf_counter()
|
||||
tasks = []
|
||||
async for request in get_request(input_requests, request_rate):
|
||||
@ -192,40 +266,53 @@ async def benchmark(
|
||||
asyncio.create_task(
|
||||
request_func(request_func_input=request_func_input,
|
||||
pbar=pbar)))
|
||||
outputs = await asyncio.gather(*tasks)
|
||||
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||
|
||||
if not disable_tqdm:
|
||||
pbar.close()
|
||||
|
||||
benchmark_duration = time.perf_counter() - benchmark_start_time
|
||||
|
||||
metrics = calculate_metrics(
|
||||
metrics, actual_output_lens = calculate_metrics(
|
||||
input_requests=input_requests,
|
||||
outputs=outputs,
|
||||
dur_s=benchmark_duration,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
print(f"Successful requests: {metrics.completed}")
|
||||
print(f"Benchmark duration: {benchmark_duration:2f} s")
|
||||
print(f"Total input tokens: {metrics.total_input}")
|
||||
print(f"Total generated tokens: {metrics.total_output}")
|
||||
print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
|
||||
print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
|
||||
print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
|
||||
print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
|
||||
print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
|
||||
print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
|
||||
print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
|
||||
print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
|
||||
print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")
|
||||
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
|
||||
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
|
||||
print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
|
||||
benchmark_duration))
|
||||
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
|
||||
print("{:<40} {:<10}".format("Total generated tokens:",
|
||||
metrics.total_output))
|
||||
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
|
||||
metrics.request_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
|
||||
metrics.input_throughput))
|
||||
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
|
||||
metrics.output_throughput))
|
||||
print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
|
||||
print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
|
||||
print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
|
||||
metrics.median_ttft_ms))
|
||||
print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
|
||||
print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
|
||||
n=50,
|
||||
c='-'))
|
||||
print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
|
||||
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
|
||||
metrics.median_tpot_ms))
|
||||
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
|
||||
print("=" * 50)
|
||||
|
||||
result = {
|
||||
"duration": benchmark_duration,
|
||||
"completed": metrics.completed,
|
||||
"total_input_tokens": metrics.total_input,
|
||||
"total_output_tokens": metrics.total_output,
|
||||
"request_inthroughput": metrics.request_throughput,
|
||||
"request_throughput": metrics.request_throughput,
|
||||
"input_throughput": metrics.input_throughput,
|
||||
"output_throughput": metrics.output_throughput,
|
||||
"mean_ttft_ms": metrics.mean_ttft_ms,
|
||||
@ -233,7 +320,13 @@ async def benchmark(
|
||||
"p99_ttft_ms": metrics.p99_ttft_ms,
|
||||
"mean_tpot_ms": metrics.mean_tpot_ms,
|
||||
"median_tpot_ms": metrics.median_tpot_ms,
|
||||
"p99_tpot_ms": metrics.p99_tpot_ms
|
||||
"p99_tpot_ms": metrics.p99_tpot_ms,
|
||||
"input_lens": [output.prompt_len for output in outputs],
|
||||
"output_lens": actual_output_lens,
|
||||
"ttfts": [output.ttft for output in outputs],
|
||||
"itls": [output.itl for output in outputs],
|
||||
"generated_texts": [output.generated_text for output in outputs],
|
||||
"errors": [output.error for output in outputs],
|
||||
}
|
||||
return result
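The table-style report printed by benchmark() above uses str.format's fill-and-align mini-language: "^" centers a title inside a rule of n fill characters, and ":<40" / ":<10.2f" produce the fixed-width label and value columns. In isolation, with an illustrative value:

print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
# ============ Serving Benchmark Result ============
print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", 123.456))
# label left-padded to 40 columns, value rendered as 123.46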
|
||||
|
||||
@ -254,7 +347,58 @@ def main(args: argparse.Namespace):
|
||||
|
||||
tokenizer = get_tokenizer(tokenizer_id,
|
||||
trust_remote_code=args.trust_remote_code)
|
||||
input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
|
||||
|
||||
if args.dataset is not None:
|
||||
warnings.warn(
|
||||
"The '--dataset' argument will be deprecated in the next "
|
||||
"release. Please use '--dataset-name' and "
|
||||
"'--dataset-path' in the future runs.",
|
||||
stacklevel=2)
|
||||
input_requests = sample_sharegpt_requests(
|
||||
dataset_path=args.dataset,
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
elif args.dataset_name == "sharegpt":
|
||||
input_requests = sample_sharegpt_requests(
|
||||
dataset_path=args.dataset_path,
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
elif args.dataset_name == "sonnet":
|
||||
# Do not format the prompt, pass to message directly
|
||||
if args.backend == "openai-chat":
|
||||
input_requests = sample_sonnet_requests(
|
||||
dataset_path=args.dataset_path,
|
||||
num_requests=args.num_prompts,
|
||||
input_len=args.input_len,
|
||||
output_len=args.output_len,
|
||||
prefix_len=args.prefix_len,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
input_requests = [(prompt, prompt_len, output_len)
|
||||
for prompt, prompt_formatted, prompt_len,
|
||||
output_len in input_requests]
|
||||
else:
|
||||
assert (
|
||||
tokenizer.chat_template or tokenizer.default_chat_template
|
||||
), "Tokenizer/model must have chat template for sonnet dataset."
|
||||
input_requests = sample_sonnet_requests(
|
||||
dataset_path=args.dataset_path,
|
||||
num_requests=args.num_prompts,
|
||||
input_len=args.input_len,
|
||||
output_len=args.output_len,
|
||||
prefix_len=args.prefix_len,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
input_requests = [(prompt_formatted, prompt_len, output_len)
|
||||
for prompt, prompt_formatted, prompt_len,
|
||||
output_len in input_requests]
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unknown dataset: {args.dataset_name}")
|
||||
|
||||
benchmark_result = asyncio.run(
|
||||
benchmark(
|
||||
@ -277,13 +421,23 @@ def main(args: argparse.Namespace):
|
||||
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
result_json["date"] = current_dt
|
||||
result_json["backend"] = backend
|
||||
result_json["version"] = args.version
|
||||
result_json["model_id"] = model_id
|
||||
result_json["tokenizer_id"] = tokenizer_id
|
||||
result_json["best_of"] = args.best_of
|
||||
result_json["use_beam_search"] = args.use_beam_search
|
||||
result_json["num_prompts"] = args.num_prompts
|
||||
|
||||
# Metadata
|
||||
if args.metadata:
|
||||
for item in args.metadata:
|
||||
if "=" in item:
|
||||
kvstring = item.split("=")
|
||||
result_json[kvstring[0].strip()] = kvstring[1].strip()
|
||||
else:
|
||||
raise ValueError(
|
||||
"Invalid metadata format. Please use KEY=VALUE format."
|
||||
)
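Each --metadata item must be a KEY=VALUE pair; the keys land directly in the result JSON. Following the example from the flag's own help text, a run with --metadata version=0.3.3 tp=1 yields:

result_json = {}
for item in ["version=0.3.3", "tp=1"]:          # what argparse yields for the flags above
    kvstring = item.split("=")
    result_json[kvstring[0].strip()] = kvstring[1].strip()
# result_json == {"version": "0.3.3", "tp": "1"}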
|
||||
|
||||
# Traffic
|
||||
result_json["request_rate"] = (
|
||||
args.request_rate if args.request_rate < float("inf") else "inf")
|
||||
@ -293,7 +447,9 @@ def main(args: argparse.Namespace):
|
||||
|
||||
# Save to file
|
||||
base_model_id = model_id.split("/")[-1]
|
||||
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
|
||||
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
|
||||
if args.result_dir:
|
||||
file_name = os.path.join(args.result_dir, file_name)
|
||||
with open(file_name, "w") as outfile:
|
||||
json.dump(result_json, outfile)
|
||||
|
||||
@ -307,12 +463,6 @@ if __name__ == "__main__":
|
||||
default="vllm",
|
||||
choices=list(ASYNC_REQUEST_FUNCS.keys()),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
type=str,
|
||||
default="N/A",
|
||||
help="Version of the serving backend/engine.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-url",
|
||||
type=str,
|
||||
@ -324,12 +474,26 @@ if __name__ == "__main__":
|
||||
parser.add_argument(
|
||||
"--endpoint",
|
||||
type=str,
|
||||
default="/generate",
|
||||
default="/v1/completions",
|
||||
help="API endpoint.",
|
||||
)
|
||||
parser.add_argument("--dataset",
|
||||
parser.add_argument(
|
||||
"--dataset",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Path to the ShareGPT dataset, will be deprecated in the "
|
||||
"next release.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
default="sharegpt",
|
||||
choices=["sharegpt", "sonnet"],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
)
|
||||
parser.add_argument("--dataset-path",
|
||||
type=str,
|
||||
required=True,
|
||||
default=None,
|
||||
help="Path to the dataset.")
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
@ -341,7 +505,7 @@ if __name__ == "__main__":
|
||||
"--tokenizer",
|
||||
type=str,
|
||||
help=
|
||||
"Name or path of the tokenizer, if not using the default model tokenizer.",
|
||||
"Name or path of the tokenizer, if not using the default tokenizer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--best-of",
|
||||
@ -357,6 +521,27 @@ if __name__ == "__main__":
|
||||
default=1000,
|
||||
help="Number of prompts to process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sonnet-input-len",
|
||||
type=int,
|
||||
default=550,
|
||||
help=
|
||||
"Number of input tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sonnet-output-len",
|
||||
type=int,
|
||||
default=150,
|
||||
help=
|
||||
"Number of output tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sonnet-prefix-len",
|
||||
type=int,
|
||||
default=200,
|
||||
help=
|
||||
"Number of prefix tokens per request, used only for sonnet dataset.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--request-rate",
|
||||
type=float,
|
||||
@ -382,6 +567,21 @@ if __name__ == "__main__":
|
||||
action="store_true",
|
||||
help="Specify to save benchmark results to a json file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--metadata",
|
||||
metavar="KEY=VALUE",
|
||||
nargs="*",
|
||||
help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
|
||||
"for metadata of this run to be saved in the result JSON file "
|
||||
"for record keeping purposes.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--result-dir",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Specify directory to save benchmark json results."
|
||||
"If not specified, results are saved in the current directory.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
@ -6,9 +6,9 @@ import time
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from transformers import (AutoModelForCausalLM, AutoTokenizer,
|
||||
PreTrainedTokenizerBase)
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def sample_requests(
|
||||
@ -73,21 +73,25 @@ def run_vllm(
|
||||
enforce_eager: bool,
|
||||
kv_cache_dtype: str,
|
||||
device: str,
|
||||
enable_prefix_caching: bool,
|
||||
gpu_memory_utilization: float = 0.9,
|
||||
download_dir: Optional[str] = None,
|
||||
) -> float:
|
||||
from vllm import LLM, SamplingParams
|
||||
llm = LLM(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
quantization=quantization,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
seed=seed,
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
device=device,
|
||||
)
|
||||
llm = LLM(model=model,
|
||||
tokenizer=tokenizer,
|
||||
quantization=quantization,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
seed=seed,
|
||||
trust_remote_code=trust_remote_code,
|
||||
dtype=dtype,
|
||||
max_model_len=max_model_len,
|
||||
gpu_memory_utilization=gpu_memory_utilization,
|
||||
enforce_eager=enforce_eager,
|
||||
kv_cache_dtype=kv_cache_dtype,
|
||||
device=device,
|
||||
enable_prefix_caching=enable_prefix_caching,
|
||||
download_dir=download_dir)
|
||||
|
||||
# Add the requests to the engine.
|
||||
for prompt, _, output_len in requests:
|
||||
@ -179,13 +183,15 @@ def run_mii(
|
||||
tensor_parallel_size: int,
|
||||
output_len: int,
|
||||
) -> float:
|
||||
from mii import pipeline
|
||||
llm = pipeline(model, tensor_parallel=tensor_parallel_size)
|
||||
from mii import client, serve
|
||||
llm = serve(model, tensor_parallel=tensor_parallel_size)
|
||||
prompts = [prompt for prompt, _, _ in requests]
|
||||
|
||||
start = time.perf_counter()
|
||||
llm(prompts, max_new_tokens=output_len)
|
||||
llm.generate(prompts, max_new_tokens=output_len)
|
||||
end = time.perf_counter()
|
||||
client = client(model)
|
||||
client.terminate_server()
|
||||
return end - start
|
||||
|
||||
|
||||
@ -211,7 +217,9 @@ def main(args: argparse.Namespace):
|
||||
args.seed, args.n, args.use_beam_search,
|
||||
args.trust_remote_code, args.dtype,
|
||||
args.max_model_len, args.enforce_eager,
|
||||
args.kv_cache_dtype, args.device)
|
||||
args.kv_cache_dtype, args.device,
|
||||
args.enable_prefix_caching,
|
||||
args.gpu_memory_utilization, args.download_dir)
|
||||
elif args.backend == "hf":
|
||||
assert args.tensor_parallel_size == 1
|
||||
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
|
||||
@ -286,6 +294,12 @@ if __name__ == "__main__":
|
||||
'The "auto" option will use FP16 precision '
|
||||
'for FP32 and FP16 models, and BF16 precision '
|
||||
'for BF16 models.')
|
||||
parser.add_argument('--gpu-memory-utilization',
|
||||
type=float,
|
||||
default=0.9,
|
||||
help='the fraction of GPU memory to be used for '
|
||||
'the model executor, which can range from 0 to 1.'
|
||||
'If unspecified, will use the default value of 0.9.')
|
||||
parser.add_argument("--enforce-eager",
|
||||
action="store_true",
|
||||
help="enforce eager execution")
|
||||
@ -302,6 +316,15 @@ if __name__ == "__main__":
|
||||
default="cuda",
|
||||
choices=["cuda"],
|
||||
help='device type for vLLM execution, supporting CUDA only currently.')
|
||||
parser.add_argument(
|
||||
"--enable-prefix-caching",
|
||||
action='store_true',
|
||||
help="enable automatic prefix caching for vLLM backend.")
|
||||
parser.add_argument('--download-dir',
|
||||
type=str,
|
||||
default=None,
|
||||
help='directory to download and load the weights, '
|
||||
'default to the default cache dir of huggingface')
|
||||
args = parser.parse_args()
|
||||
if args.tokenizer is None:
|
||||
args.tokenizer = args.model
|
||||
|
@ -2,13 +2,15 @@ import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import fused_moe
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import triton
|
||||
|
||||
from vllm.model_executor.layers.fused_moe import (fused_moe,
|
||||
get_config_file_name)
|
||||
|
||||
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
||||
|
||||
|
||||
def main():
|
||||
method = fused_moe
|
||||
@ -64,7 +66,7 @@ def run_grid(bs, method):
|
||||
print(f'{tp_size=} {bs=}')
|
||||
print(f'{config}')
|
||||
# warmup
|
||||
print(f'warming up')
|
||||
print('warming up')
|
||||
try:
|
||||
for _ in range(num_warmup_trials):
|
||||
run_timing(
|
||||
@ -82,7 +84,7 @@ def run_grid(bs, method):
|
||||
continue
|
||||
|
||||
# trial
|
||||
print(f'benchmarking')
|
||||
print('benchmarking')
|
||||
for _ in range(num_trials):
|
||||
kernel_dur_ms = run_timing(
|
||||
num_calls=num_calls,
|
||||
@ -103,17 +105,25 @@ def run_grid(bs, method):
|
||||
best_config = config
|
||||
best_time_us = kernel_dur_us
|
||||
|
||||
print(
|
||||
f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}'
|
||||
)
|
||||
print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
|
||||
f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
|
||||
f'{d_model=} {model_intermediate_size=} {num_layers=}')
|
||||
|
||||
print("best_time_us", best_time_us)
|
||||
print("best_config", best_config)
|
||||
|
||||
filename = "/tmp/config.jsonl"
|
||||
# holds Dict[str, Dict[str, int]]
|
||||
filename = get_config_file_name(num_total_experts,
|
||||
model_intermediate_size // tp_size)
|
||||
print(f"writing config to file {filename}")
|
||||
with open(filename, "a") as f:
|
||||
f.write(json.dumps({str(bs): best_config}) + "\n")
|
||||
existing_content = {}
|
||||
if os.path.exists(filename):
|
||||
with open(filename, "r") as f:
|
||||
existing_content = json.load(f)
|
||||
existing_content[str(bs)] = best_config
|
||||
with open(filename, "w") as f:
|
||||
json.dump(existing_content, f, indent=4)
|
||||
f.write("\n")
|
||||
|
||||
|
||||
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
|
||||
|
@ -1,12 +1,12 @@
|
||||
from typing import Optional
|
||||
import argparse
|
||||
import random
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
|
||||
from vllm._C import ops
|
||||
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
|
||||
|
||||
NUM_BLOCKS = 1024
|
||||
PARTITION_SIZE = 512
|
||||
|
121
benchmarks/kernels/benchmark_rope.py
Normal file
@ -0,0 +1,121 @@
|
||||
import argparse
|
||||
from itertools import accumulate
|
||||
from typing import Optional
|
||||
|
||||
import nvtx
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.rotary_embedding import get_rope
|
||||
|
||||
|
||||
def benchmark_rope_kernels_multi_lora(
|
||||
is_neox_style: bool,
|
||||
batch_size: int,
|
||||
seq_len: int,
|
||||
num_heads: int,
|
||||
head_size: int,
|
||||
rotary_dim: Optional[int],
|
||||
dtype: torch.dtype,
|
||||
seed: int,
|
||||
device: str,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
) -> None:
|
||||
torch.random.manual_seed(seed)
|
||||
if torch.cuda.is_available():
|
||||
torch.cuda.manual_seed(seed)
|
||||
torch.set_default_device(device)
|
||||
if rotary_dim is None:
|
||||
rotary_dim = head_size
|
||||
    # simulating serving 4 LoRAs
|
||||
scaling_factors = [1, 2, 4, 8]
|
||||
# batched RoPE can take multiple scaling factors
|
||||
batched_rope = get_rope(head_size, rotary_dim, max_position, base,
|
||||
is_neox_style, {
|
||||
"type": "linear",
|
||||
"factor": tuple(scaling_factors)
|
||||
})
|
||||
# non-batched RoPE takes only one scaling factor, we create multiple
|
||||
# instances to simulate the same behavior
|
||||
non_batched_ropes = []
|
||||
for scaling_factor in scaling_factors:
|
||||
non_batched_ropes.append(
|
||||
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
|
||||
{
|
||||
"type": "linear",
|
||||
"factor": (scaling_factor, )
|
||||
}))
|
||||
|
||||
positions = torch.randint(0, max_position, (batch_size, seq_len))
|
||||
query = torch.randn(batch_size,
|
||||
seq_len,
|
||||
num_heads * head_size,
|
||||
dtype=dtype)
|
||||
key = torch.randn_like(query)
|
||||
|
||||
# create query offsets for batched RoPE, we concat multiple kv cache
|
||||
# together and each query needs to find the right kv cache of its type
|
||||
offset_map = torch.tensor(
|
||||
list(
|
||||
accumulate([0] + [
|
||||
max_position * scaling_factor * 2
|
||||
for scaling_factor in scaling_factors[:-1]
|
||||
])))
|
||||
query_types = torch.randint(0,
|
||||
len(scaling_factors), (batch_size, seq_len),
|
||||
device=device)
|
||||
# map query types to offsets
|
||||
query_offsets = offset_map[query_types]
|
||||
# the kernel takes flattened offsets
|
||||
flatten_offsets = query_offsets.flatten()
|
||||
|
||||
# batched queries of the same type together for non-batched RoPE
|
||||
queries = [query[query_types == i] for i in range(len(scaling_factors))]
|
||||
keys = [key[query_types == i] for i in range(len(scaling_factors))]
|
||||
packed_qkr = zip(queries, keys, non_batched_ropes)
|
||||
# synchronize before start timing
|
||||
torch.cuda.synchronize()
|
||||
with nvtx.annotate("non-batched", color="yellow"):
|
||||
for q, k, r in packed_qkr:
|
||||
r.forward(positions, q, k)
|
||||
torch.cuda.synchronize()
|
||||
with nvtx.annotate("batched", color="green"):
|
||||
batched_rope.forward(positions, query, key, flatten_offsets)
|
||||
torch.cuda.synchronize()
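The offset map above lays the per-scaling-factor cache regions back to back, with each region assumed to span max_position * scaling_factor * 2 positions as in the expression in the code, so a query of type i adds offsets[i] before indexing the concatenated cache. Worked out for the defaults used here:

from itertools import accumulate

max_position = 8192
scaling_factors = [1, 2, 4, 8]
offsets = list(
    accumulate([0] + [max_position * s * 2 for s in scaling_factors[:-1]]))
# offsets == [0, 16384, 49152, 114688]
# A query assigned scaling_factors[i] is shifted by offsets[i] into its own region.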
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Benchmark the rotary embedding kernels.")
|
||||
parser.add_argument("--is-neox-style", type=bool, default=True)
|
||||
parser.add_argument("--batch-size", type=int, default=16)
|
||||
parser.add_argument("--seq-len", type=int, default=512)
|
||||
parser.add_argument("--num-heads", type=int, default=8)
|
||||
parser.add_argument("--head-size",
|
||||
type=int,
|
||||
choices=[64, 80, 96, 112, 128, 256],
|
||||
default=128)
|
||||
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
|
||||
parser.add_argument("--dtype",
|
||||
type=str,
|
||||
choices=["bfloat16", "float"],
|
||||
default="float")
|
||||
parser.add_argument("--seed", type=int, default=0)
|
||||
parser.add_argument("--device",
|
||||
type=str,
|
||||
choices=["cuda:0", "cuda:1"],
|
||||
default="cuda:0")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
benchmark_rope_kernels_multi_lora(
|
||||
is_neox_style=args.is_neox_style,
|
||||
batch_size=args.batch_size,
|
||||
seq_len=args.seq_len,
|
||||
num_heads=args.num_heads,
|
||||
head_size=args.head_size,
|
||||
rotary_dim=args.rotary_dim,
|
||||
dtype=getattr(torch, args.dtype),
|
||||
seed=args.seed,
|
||||
device=args.device,
|
||||
)
|
518
benchmarks/sonnet.txt
Normal file
@ -0,0 +1,518 @@
|
||||
FROM fairest creatures we desire increase,
|
||||
That thereby beauty's rose might never die,
|
||||
But as the riper should by time decease,
|
||||
His tender heir might bear his memory:
|
||||
But thou, contracted to thine own bright eyes,
|
||||
Feed'st thy light'st flame with self-substantial fuel,
|
||||
Making a famine where abundance lies,
|
||||
Thyself thy foe, to thy sweet self too cruel.
|
||||
Thou that art now the world's fresh ornament
|
||||
And only herald to the gaudy spring,
|
||||
Within thine own bud buriest thy content
|
||||
And, tender churl, makest waste in niggarding.
|
||||
Pity the world, or else this glutton be,
|
||||
To eat the world's due, by the grave and thee.
|
||||
When forty winters shall beseige thy brow,
|
||||
And dig deep trenches in thy beauty's field,
|
||||
Thy youth's proud livery, so gazed on now,
|
||||
Will be a tatter'd weed, of small worth held:
|
||||
Then being ask'd where all thy beauty lies,
|
||||
Where all the treasure of thy lusty days,
|
||||
To say, within thine own deep-sunken eyes,
|
||||
Were an all-eating shame and thriftless praise.
|
||||
How much more praise deserved thy beauty's use,
|
||||
If thou couldst answer 'This fair child of mine
|
||||
Shall sum my count and make my old excuse,'
|
||||
Proving his beauty by succession thine!
|
||||
This were to be new made when thou art old,
|
||||
And see thy blood warm when thou feel'st it cold.
|
||||
Look in thy glass, and tell the face thou viewest
|
||||
Now is the time that face should form another;
|
||||
Whose fresh repair if now thou not renewest,
|
||||
Thou dost beguile the world, unbless some mother.
|
||||
For where is she so fair whose unear'd womb
|
||||
Disdains the tillage of thy husbandry?
|
||||
Or who is he so fond will be the tomb
|
||||
Of his self-love, to stop posterity?
|
||||
Thou art thy mother's glass, and she in thee
|
||||
Calls back the lovely April of her prime:
|
||||
So thou through windows of thine age shall see
|
||||
Despite of wrinkles this thy golden time.
|
||||
But if thou live, remember'd not to be,
|
||||
Die single, and thine image dies with thee.
|
||||
Unthrifty loveliness, why dost thou spend
|
||||
Upon thyself thy beauty's legacy?
|
||||
Nature's bequest gives nothing but doth lend,
|
||||
And being frank she lends to those are free.
|
||||
Then, beauteous niggard, why dost thou abuse
|
||||
The bounteous largess given thee to give?
|
||||
Profitless usurer, why dost thou use
|
||||
So great a sum of sums, yet canst not live?
|
||||
For having traffic with thyself alone,
|
||||
Thou of thyself thy sweet self dost deceive.
|
||||
Then how, when nature calls thee to be gone,
|
||||
What acceptable audit canst thou leave?
|
||||
Thy unused beauty must be tomb'd with thee,
|
||||
Which, used, lives th' executor to be.
|
||||
Those hours, that with gentle work did frame
|
||||
The lovely gaze where every eye doth dwell,
|
||||
Will play the tyrants to the very same
|
||||
And that unfair which fairly doth excel:
|
||||
For never-resting time leads summer on
|
||||
To hideous winter and confounds him there;
|
||||
Sap cheque'd with frost and lusty leaves quite gone,
|
||||
Beauty o'ersnow'd and bareness every where:
|
||||
Then, were not summer's distillation left,
|
||||
A liquid prisoner pent in walls of glass,
|
||||
Beauty's effect with beauty were bereft,
|
||||
Nor it nor no remembrance what it was:
|
||||
But flowers distill'd though they with winter meet,
|
||||
Leese but their show; their substance still lives sweet.
|
||||
Then let not winter's ragged hand deface
|
||||
In thee thy summer, ere thou be distill'd:
|
||||
Make sweet some vial; treasure thou some place
|
||||
With beauty's treasure, ere it be self-kill'd.
|
||||
That use is not forbidden usury,
|
||||
Which happies those that pay the willing loan;
|
||||
That's for thyself to breed another thee,
|
||||
Or ten times happier, be it ten for one;
|
||||
Ten times thyself were happier than thou art,
|
||||
If ten of thine ten times refigured thee:
|
||||
Then what could death do, if thou shouldst depart,
|
||||
Leaving thee living in posterity?
|
||||
Be not self-will'd, for thou art much too fair
|
||||
To be death's conquest and make worms thine heir.
|
||||
Lo! in the orient when the gracious light
|
||||
Lifts up his burning head, each under eye
|
||||
Doth homage to his new-appearing sight,
|
||||
Serving with looks his sacred majesty;
|
||||
And having climb'd the steep-up heavenly hill,
|
||||
Resembling strong youth in his middle age,
|
||||
yet mortal looks adore his beauty still,
|
||||
Attending on his golden pilgrimage;
|
||||
But when from highmost pitch, with weary car,
|
||||
Like feeble age, he reeleth from the day,
|
||||
The eyes, 'fore duteous, now converted are
|
||||
From his low tract and look another way:
|
||||
So thou, thyself out-going in thy noon,
|
||||
Unlook'd on diest, unless thou get a son.
|
||||
Music to hear, why hear'st thou music sadly?
|
||||
Sweets with sweets war not, joy delights in joy.
|
||||
Why lovest thou that which thou receivest not gladly,
|
||||
Or else receivest with pleasure thine annoy?
|
||||
If the true concord of well-tuned sounds,
|
||||
By unions married, do offend thine ear,
|
||||
They do but sweetly chide thee, who confounds
|
||||
In singleness the parts that thou shouldst bear.
|
||||
Mark how one string, sweet husband to another,
|
||||
Strikes each in each by mutual ordering,
|
||||
Resembling sire and child and happy mother
|
||||
Who all in one, one pleasing note do sing:
|
||||
Whose speechless song, being many, seeming one,
|
||||
Sings this to thee: 'thou single wilt prove none.'
|
||||
Is it for fear to wet a widow's eye
|
||||
That thou consumest thyself in single life?
|
||||
Ah! if thou issueless shalt hap to die.
|
||||
The world will wail thee, like a makeless wife;
|
||||
The world will be thy widow and still weep
|
||||
That thou no form of thee hast left behind,
|
||||
When every private widow well may keep
|
||||
By children's eyes her husband's shape in mind.
|
||||
Look, what an unthrift in the world doth spend
|
||||
Shifts but his place, for still the world enjoys it;
|
||||
But beauty's waste hath in the world an end,
|
||||
And kept unused, the user so destroys it.
|
||||
No love toward others in that bosom sits
|
||||
That on himself such murderous shame commits.
|
||||
For shame! deny that thou bear'st love to any,
|
||||
Who for thyself art so unprovident.
|
||||
Grant, if thou wilt, thou art beloved of many,
|
||||
But that thou none lovest is most evident;
|
||||
For thou art so possess'd with murderous hate
|
||||
That 'gainst thyself thou stick'st not to conspire.
|
||||
Seeking that beauteous roof to ruinate
|
||||
Which to repair should be thy chief desire.
|
||||
O, change thy thought, that I may change my mind!
|
||||
Shall hate be fairer lodged than gentle love?
|
||||
Be, as thy presence is, gracious and kind,
|
||||
Or to thyself at least kind-hearted prove:
|
||||
Make thee another self, for love of me,
|
||||
That beauty still may live in thine or thee.
|
||||
As fast as thou shalt wane, so fast thou growest
|
||||
In one of thine, from that which thou departest;
|
||||
And that fresh blood which youngly thou bestowest
|
||||
Thou mayst call thine when thou from youth convertest.
|
||||
Herein lives wisdom, beauty and increase:
|
||||
Without this, folly, age and cold decay:
|
||||
If all were minded so, the times should cease
|
||||
And threescore year would make the world away.
|
||||
Let those whom Nature hath not made for store,
|
||||
Harsh featureless and rude, barrenly perish:
|
||||
Look, whom she best endow'd she gave the more;
|
||||
Which bounteous gift thou shouldst in bounty cherish:
|
||||
She carved thee for her seal, and meant thereby
|
||||
Thou shouldst print more, not let that copy die.
|
||||
When I do count the clock that tells the time,
|
||||
And see the brave day sunk in hideous night;
|
||||
When I behold the violet past prime,
|
||||
And sable curls all silver'd o'er with white;
|
||||
When lofty trees I see barren of leaves
|
||||
Which erst from heat did canopy the herd,
|
||||
And summer's green all girded up in sheaves
|
||||
Borne on the bier with white and bristly beard,
|
||||
Then of thy beauty do I question make,
|
||||
That thou among the wastes of time must go,
|
||||
Since sweets and beauties do themselves forsake
|
||||
And die as fast as they see others grow;
|
||||
And nothing 'gainst Time's scythe can make defence
|
||||
Save breed, to brave him when he takes thee hence.
|
||||
O, that you were yourself! but, love, you are
|
||||
No longer yours than you yourself here live:
|
||||
Against this coming end you should prepare,
|
||||
And your sweet semblance to some other give.
|
||||
So should that beauty which you hold in lease
|
||||
Find no determination: then you were
|
||||
Yourself again after yourself's decease,
|
||||
When your sweet issue your sweet form should bear.
|
||||
Who lets so fair a house fall to decay,
|
||||
Which husbandry in honour might uphold
|
||||
Against the stormy gusts of winter's day
|
||||
And barren rage of death's eternal cold?
|
||||
O, none but unthrifts! Dear my love, you know
|
||||
You had a father: let your son say so.
|
||||
Not from the stars do I my judgment pluck;
|
||||
And yet methinks I have astronomy,
|
||||
But not to tell of good or evil luck,
|
||||
Of plagues, of dearths, or seasons' quality;
|
||||
Nor can I fortune to brief minutes tell,
|
||||
Pointing to each his thunder, rain and wind,
|
||||
Or say with princes if it shall go well,
|
||||
By oft predict that I in heaven find:
|
||||
But from thine eyes my knowledge I derive,
|
||||
And, constant stars, in them I read such art
|
||||
As truth and beauty shall together thrive,
|
||||
If from thyself to store thou wouldst convert;
|
||||
Or else of thee this I prognosticate:
|
||||
Thy end is truth's and beauty's doom and date.
|
||||
When I consider every thing that grows
|
||||
Holds in perfection but a little moment,
|
||||
That this huge stage presenteth nought but shows
|
||||
Whereon the stars in secret influence comment;
|
||||
When I perceive that men as plants increase,
|
||||
Cheered and cheque'd even by the self-same sky,
|
||||
Vaunt in their youthful sap, at height decrease,
|
||||
And wear their brave state out of memory;
|
||||
Then the conceit of this inconstant stay
|
||||
Sets you most rich in youth before my sight,
|
||||
Where wasteful Time debateth with Decay,
|
||||
To change your day of youth to sullied night;
|
||||
And all in war with Time for love of you,
|
||||
As he takes from you, I engraft you new.
|
||||
But wherefore do not you a mightier way
|
||||
Make war upon this bloody tyrant, Time?
|
||||
And fortify yourself in your decay
|
||||
With means more blessed than my barren rhyme?
|
||||
Now stand you on the top of happy hours,
|
||||
And many maiden gardens yet unset
|
||||
With virtuous wish would bear your living flowers,
|
||||
Much liker than your painted counterfeit:
|
||||
So should the lines of life that life repair,
|
||||
Which this, Time's pencil, or my pupil pen,
|
||||
Neither in inward worth nor outward fair,
|
||||
Can make you live yourself in eyes of men.
|
||||
To give away yourself keeps yourself still,
|
||||
And you must live, drawn by your own sweet skill.
|
||||
Who will believe my verse in time to come,
|
||||
If it were fill'd with your most high deserts?
|
||||
Though yet, heaven knows, it is but as a tomb
|
||||
Which hides your life and shows not half your parts.
|
||||
If I could write the beauty of your eyes
|
||||
And in fresh numbers number all your graces,
|
||||
The age to come would say 'This poet lies:
|
||||
Such heavenly touches ne'er touch'd earthly faces.'
|
||||
So should my papers yellow'd with their age
|
||||
Be scorn'd like old men of less truth than tongue,
|
||||
And your true rights be term'd a poet's rage
|
||||
And stretched metre of an antique song:
|
||||
But were some child of yours alive that time,
|
||||
You should live twice; in it and in my rhyme.
|
||||
Shall I compare thee to a summer's day?
|
||||
Thou art more lovely and more temperate:
|
||||
Rough winds do shake the darling buds of May,
|
||||
And summer's lease hath all too short a date:
|
||||
Sometime too hot the eye of heaven shines,
|
||||
And often is his gold complexion dimm'd;
|
||||
And every fair from fair sometime declines,
|
||||
By chance or nature's changing course untrimm'd;
|
||||
But thy eternal summer shall not fade
|
||||
Nor lose possession of that fair thou owest;
|
||||
Nor shall Death brag thou wander'st in his shade,
|
||||
When in eternal lines to time thou growest:
|
||||
So long as men can breathe or eyes can see,
|
||||
So long lives this and this gives life to thee.
|
||||
Devouring Time, blunt thou the lion's paws,
|
||||
And make the earth devour her own sweet brood;
|
||||
Pluck the keen teeth from the fierce tiger's jaws,
|
||||
And burn the long-lived phoenix in her blood;
|
||||
Make glad and sorry seasons as thou fleets,
|
||||
And do whate'er thou wilt, swift-footed Time,
|
||||
To the wide world and all her fading sweets;
|
||||
But I forbid thee one most heinous crime:
|
||||
O, carve not with thy hours my love's fair brow,
|
||||
Nor draw no lines there with thine antique pen;
|
||||
Him in thy course untainted do allow
|
||||
For beauty's pattern to succeeding men.
|
||||
Yet, do thy worst, old Time: despite thy wrong,
|
||||
My love shall in my verse ever live young.
|
||||
A woman's face with Nature's own hand painted
|
||||
Hast thou, the master-mistress of my passion;
|
||||
A woman's gentle heart, but not acquainted
|
||||
With shifting change, as is false women's fashion;
|
||||
An eye more bright than theirs, less false in rolling,
|
||||
Gilding the object whereupon it gazeth;
|
||||
A man in hue, all 'hues' in his controlling,
|
||||
Much steals men's eyes and women's souls amazeth.
|
||||
And for a woman wert thou first created;
|
||||
Till Nature, as she wrought thee, fell a-doting,
|
||||
And by addition me of thee defeated,
|
||||
By adding one thing to my purpose nothing.
|
||||
But since she prick'd thee out for women's pleasure,
|
||||
Mine be thy love and thy love's use their treasure.
|
||||
So is it not with me as with that Muse
|
||||
Stirr'd by a painted beauty to his verse,
|
||||
Who heaven itself for ornament doth use
|
||||
And every fair with his fair doth rehearse
|
||||
Making a couplement of proud compare,
|
||||
With sun and moon, with earth and sea's rich gems,
|
||||
With April's first-born flowers, and all things rare
|
||||
That heaven's air in this huge rondure hems.
|
||||
O' let me, true in love, but truly write,
|
||||
And then believe me, my love is as fair
|
||||
As any mother's child, though not so bright
|
||||
As those gold candles fix'd in heaven's air:
|
||||
Let them say more than like of hearsay well;
|
||||
I will not praise that purpose not to sell.
|
||||
My glass shall not persuade me I am old,
|
||||
So long as youth and thou are of one date;
|
||||
But when in thee time's furrows I behold,
|
||||
Then look I death my days should expiate.
|
||||
For all that beauty that doth cover thee
|
||||
Is but the seemly raiment of my heart,
|
||||
Which in thy breast doth live, as thine in me:
|
||||
How can I then be elder than thou art?
|
||||
O, therefore, love, be of thyself so wary
|
||||
As I, not for myself, but for thee will;
|
||||
Bearing thy heart, which I will keep so chary
|
||||
As tender nurse her babe from faring ill.
|
||||
Presume not on thy heart when mine is slain;
|
||||
Thou gavest me thine, not to give back again.
|
||||
As an unperfect actor on the stage
|
||||
Who with his fear is put besides his part,
|
||||
Or some fierce thing replete with too much rage,
|
||||
Whose strength's abundance weakens his own heart.
|
||||
So I, for fear of trust, forget to say
|
||||
The perfect ceremony of love's rite,
|
||||
And in mine own love's strength seem to decay,
|
||||
O'ercharged with burden of mine own love's might.
|
||||
O, let my books be then the eloquence
|
||||
And dumb presagers of my speaking breast,
|
||||
Who plead for love and look for recompense
|
||||
More than that tongue that more hath more express'd.
|
||||
O, learn to read what silent love hath writ:
|
||||
To hear with eyes belongs to love's fine wit.
|
||||
Mine eye hath play'd the painter and hath stell'd
|
||||
Thy beauty's form in table of my heart;
|
||||
My body is the frame wherein 'tis held,
|
||||
And perspective it is the painter's art.
|
||||
For through the painter must you see his skill,
|
||||
To find where your true image pictured lies;
|
||||
Which in my bosom's shop is hanging still,
|
||||
That hath his windows glazed with thine eyes.
|
||||
Now see what good turns eyes for eyes have done:
|
||||
Mine eyes have drawn thy shape, and thine for me
|
||||
Are windows to my breast, where-through the sun
|
||||
Delights to peep, to gaze therein on thee;
|
||||
Yet eyes this cunning want to grace their art;
|
||||
They draw but what they see, know not the heart.
|
||||
Let those who are in favour with their stars
|
||||
Of public honour and proud titles boast,
|
||||
Whilst I, whom fortune of such triumph bars,
|
||||
Unlook'd for joy in that I honour most.
|
||||
Great princes' favourites their fair leaves spread
|
||||
But as the marigold at the sun's eye,
|
||||
And in themselves their pride lies buried,
|
||||
For at a frown they in their glory die.
|
||||
The painful warrior famoused for fight,
|
||||
After a thousand victories once foil'd,
|
||||
Is from the book of honour razed quite,
|
||||
And all the rest forgot for which he toil'd:
|
||||
Then happy I, that love and am beloved
|
||||
Where I may not remove nor be removed.
|
||||
Lord of my love, to whom in vassalage
|
||||
Thy merit hath my duty strongly knit,
|
||||
To thee I send this written embassage,
|
||||
To witness duty, not to show my wit:
|
||||
Duty so great, which wit so poor as mine
|
||||
May make seem bare, in wanting words to show it,
|
||||
But that I hope some good conceit of thine
|
||||
In thy soul's thought, all naked, will bestow it;
|
||||
Till whatsoever star that guides my moving
|
||||
Points on me graciously with fair aspect
|
||||
And puts apparel on my tatter'd loving,
|
||||
To show me worthy of thy sweet respect:
|
||||
Then may I dare to boast how I do love thee;
|
||||
Till then not show my head where thou mayst prove me.
|
||||
Weary with toil, I haste me to my bed,
|
||||
The dear repose for limbs with travel tired;
|
||||
But then begins a journey in my head,
|
||||
To work my mind, when body's work's expired:
|
||||
For then my thoughts, from far where I abide,
|
||||
Intend a zealous pilgrimage to thee,
|
||||
And keep my drooping eyelids open wide,
|
||||
Looking on darkness which the blind do see
|
||||
Save that my soul's imaginary sight
|
||||
Presents thy shadow to my sightless view,
|
||||
Which, like a jewel hung in ghastly night,
|
||||
Makes black night beauteous and her old face new.
|
||||
Lo! thus, by day my limbs, by night my mind,
|
||||
For thee and for myself no quiet find.
|
||||
How can I then return in happy plight,
|
||||
That am debarr'd the benefit of rest?
|
||||
When day's oppression is not eased by night,
|
||||
But day by night, and night by day, oppress'd?
|
||||
And each, though enemies to either's reign,
|
||||
Do in consent shake hands to torture me;
|
||||
The one by toil, the other to complain
|
||||
How far I toil, still farther off from thee.
|
||||
I tell the day, to please them thou art bright
|
||||
And dost him grace when clouds do blot the heaven:
|
||||
So flatter I the swart-complexion'd night,
|
||||
When sparkling stars twire not thou gild'st the even.
|
||||
But day doth daily draw my sorrows longer
|
||||
And night doth nightly make grief's strength seem stronger.
|
||||
When, in disgrace with fortune and men's eyes,
|
||||
I all alone beweep my outcast state
|
||||
And trouble deal heaven with my bootless cries
|
||||
And look upon myself and curse my fate,
|
||||
Wishing me like to one more rich in hope,
|
||||
Featured like him, like him with friends possess'd,
|
||||
Desiring this man's art and that man's scope,
|
||||
With what I most enjoy contented least;
|
||||
Yet in these thoughts myself almost despising,
|
||||
Haply I think on thee, and then my state,
|
||||
Like to the lark at break of day arising
|
||||
From sullen earth, sings hymns at heaven's gate;
|
||||
For thy sweet love remember'd such wealth brings
|
||||
That then I scorn to change my state with kings.
|
||||
When to the sessions of sweet silent thought
|
||||
I summon up remembrance of things past,
|
||||
I sigh the lack of many a thing I sought,
|
||||
And with old woes new wail my dear time's waste:
|
||||
Then can I drown an eye, unused to flow,
|
||||
For precious friends hid in death's dateless night,
|
||||
And weep afresh love's long since cancell'd woe,
|
||||
And moan the expense of many a vanish'd sight:
|
||||
Then can I grieve at grievances foregone,
|
||||
And heavily from woe to woe tell o'er
|
||||
The sad account of fore-bemoaned moan,
|
||||
Which I new pay as if not paid before.
|
||||
But if the while I think on thee, dear friend,
|
||||
All losses are restored and sorrows end.
|
||||
Thy bosom is endeared with all hearts,
|
||||
Which I by lacking have supposed dead,
|
||||
And there reigns love and all love's loving parts,
|
||||
And all those friends which I thought buried.
|
||||
How many a holy and obsequious tear
|
||||
Hath dear religious love stol'n from mine eye
|
||||
As interest of the dead, which now appear
|
||||
But things removed that hidden in thee lie!
|
||||
Thou art the grave where buried love doth live,
|
||||
Hung with the trophies of my lovers gone,
|
||||
Who all their parts of me to thee did give;
|
||||
That due of many now is thine alone:
|
||||
Their images I loved I view in thee,
|
||||
And thou, all they, hast all the all of me.
|
||||
If thou survive my well-contented day,
|
||||
When that churl Death my bones with dust shall cover,
|
||||
And shalt by fortune once more re-survey
|
||||
These poor rude lines of thy deceased lover,
|
||||
Compare them with the bettering of the time,
|
||||
And though they be outstripp'd by every pen,
|
||||
Reserve them for my love, not for their rhyme,
|
||||
Exceeded by the height of happier men.
|
||||
O, then vouchsafe me but this loving thought:
|
||||
'Had my friend's Muse grown with this growing age,
|
||||
A dearer birth than this his love had brought,
|
||||
To march in ranks of better equipage:
|
||||
But since he died and poets better prove,
|
||||
Theirs for their style I'll read, his for his love.'
|
||||
Full many a glorious morning have I seen
|
||||
Flatter the mountain-tops with sovereign eye,
|
||||
Kissing with golden face the meadows green,
|
||||
Gilding pale streams with heavenly alchemy;
|
||||
Anon permit the basest clouds to ride
|
||||
With ugly rack on his celestial face,
|
||||
And from the forlorn world his visage hide,
|
||||
Stealing unseen to west with this disgrace:
|
||||
Even so my sun one early morn did shine
|
||||
With all triumphant splendor on my brow;
|
||||
But out, alack! he was but one hour mine;
|
||||
The region cloud hath mask'd him from me now.
|
||||
Yet him for this my love no whit disdaineth;
|
||||
Suns of the world may stain when heaven's sun staineth.
|
||||
Why didst thou promise such a beauteous day,
|
||||
And make me travel forth without my cloak,
|
||||
To let base clouds o'ertake me in my way,
|
||||
Hiding thy bravery in their rotten smoke?
|
||||
'Tis not enough that through the cloud thou break,
|
||||
To dry the rain on my storm-beaten face,
|
||||
For no man well of such a salve can speak
|
||||
That heals the wound and cures not the disgrace:
|
||||
Nor can thy shame give physic to my grief;
|
||||
Though thou repent, yet I have still the loss:
|
||||
The offender's sorrow lends but weak relief
|
||||
To him that bears the strong offence's cross.
|
||||
Ah! but those tears are pearl which thy love sheds,
|
||||
And they are rich and ransom all ill deeds.
|
||||
No more be grieved at that which thou hast done:
|
||||
Roses have thorns, and silver fountains mud;
|
||||
Clouds and eclipses stain both moon and sun,
|
||||
And loathsome canker lives in sweetest bud.
|
||||
All men make faults, and even I in this,
|
||||
Authorizing thy trespass with compare,
|
||||
Myself corrupting, salving thy amiss,
|
||||
Excusing thy sins more than thy sins are;
|
||||
For to thy sensual fault I bring in sense--
|
||||
Thy adverse party is thy advocate--
|
||||
And 'gainst myself a lawful plea commence:
|
||||
Such civil war is in my love and hate
|
||||
That I an accessary needs must be
|
||||
To that sweet thief which sourly robs from me.
|
||||
Let me confess that we two must be twain,
|
||||
Although our undivided loves are one:
|
||||
So shall those blots that do with me remain
|
||||
Without thy help by me be borne alone.
|
||||
In our two loves there is but one respect,
|
||||
Though in our lives a separable spite,
|
||||
Which though it alter not love's sole effect,
|
||||
Yet doth it steal sweet hours from love's delight.
|
||||
I may not evermore acknowledge thee,
|
||||
Lest my bewailed guilt should do thee shame,
|
||||
Nor thou with public kindness honour me,
|
||||
Unless thou take that honour from thy name:
|
||||
But do not so; I love thee in such sort
|
||||
As, thou being mine, mine is thy good report.
|
||||
As a decrepit father takes delight
|
||||
To see his active child do deeds of youth,
|
||||
So I, made lame by fortune's dearest spite,
|
||||
Take all my comfort of thy worth and truth.
|
||||
For whether beauty, birth, or wealth, or wit,
|
||||
Or any of these all, or all, or more,
|
||||
Entitled in thy parts do crowned sit,
|
||||
I make my love engrafted to this store:
|
||||
So then I am not lame, poor, nor despised,
|
||||
Whilst that this shadow doth such substance give
|
||||
That I in thy abundance am sufficed
|
||||
And by a part of all thy glory live.
|
||||
Look, what is best, that best I wish in thee:
|
||||
This wish I have; then ten times happy me!
|
73
cmake/hipify.py
Executable file
@@ -0,0 +1,73 @@
#!/usr/bin/env python3

#
# A command line tool for running pytorch's hipify preprocessor on CUDA
# source files.
#
# See https://github.com/ROCm/hipify_torch
# and <torch install dir>/utils/hipify/hipify_python.py
#

import argparse
import os
import shutil

from torch.utils.hipify.hipify_python import hipify

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Project directory where all the source + include files live.
    parser.add_argument(
        "-p",
        "--project_dir",
        help="The project directory.",
    )

    # Directory where hipified files are written.
    parser.add_argument(
        "-o",
        "--output_dir",
        help="The output directory.",
    )

    # Source files to convert.
    parser.add_argument("sources",
                        help="Source files to hipify.",
                        nargs="*",
                        default=[])

    args = parser.parse_args()

    # Limit include scope to project_dir only
    includes = [os.path.join(args.project_dir, '*')]

    # Get absolute path for all source files.
    extra_files = [os.path.abspath(s) for s in args.sources]

    # Copy sources from project directory to output directory.
    # The directory might already exist to hold object files so we ignore that.
    shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)

    hipify_result = hipify(project_directory=args.project_dir,
                           output_directory=args.output_dir,
                           header_include_dirs=[],
                           includes=includes,
                           extra_files=extra_files,
                           show_detailed=True,
                           is_pytorch_extension=True,
                           hipify_extra_files_only=True)

    hipified_sources = []
    for source in args.sources:
        s_abs = os.path.abspath(source)
        hipified_s_abs = (hipify_result[s_abs].hipified_path if
                          (s_abs in hipify_result
                           and hipify_result[s_abs].hipified_path is not None)
                          else s_abs)
        hipified_sources.append(hipified_s_abs)

    assert (len(hipified_sources) == len(args.sources))

    # Print hipified source files.
    print("\n".join(hipified_sources))
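The script is normally driven by the build system rather than run by hand, but its contract is simple to exercise directly: pass a project directory, an output directory, and a list of .cu sources, then read the hipified paths from stdout. A minimal sketch (hypothetical paths; assumes a ROCm-enabled PyTorch with hipify support is installed):

    import subprocess
    import sys

    # Run cmake/hipify.py the same way the hipify custom target does and collect
    # the rewritten source paths it prints, one per line.
    def hipify_paths(project_dir, output_dir, sources):
        proc = subprocess.run(
            [sys.executable, "cmake/hipify.py", "-p", project_dir, "-o", output_dir, *sources],
            check=True, capture_output=True, text=True)
        return [line for line in proc.stdout.splitlines() if line]

    # Example (hypothetical source file):
    # hipify_paths("csrc", "build/csrc", ["csrc/cache_kernels.cu"])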
346
cmake/utils.cmake
Normal file
@@ -0,0 +1,346 @@
#
# Attempt to find the python package that uses the same python executable as
# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
#
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
  set(Python_EXECUTABLE ${EXECUTABLE})
  find_package(Python COMPONENTS Interpreter Development.Module)
  if (NOT Python_FOUND)
    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
  endif()
  set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
  set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
  if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
    message(FATAL_ERROR
      "Python version (${_VER}) is not one of the supported versions: "
      "${_SUPPORTED_VERSIONS_LIST}.")
  endif()
  message(STATUS "Found python matching: ${EXECUTABLE}.")
endmacro()

#
# Run `EXPR` in python. The standard output of python is stored in `OUT` and
# has trailing whitespace stripped. If an error is encountered when running
# python, a fatal message `ERR_MSG` is issued.
#
function (run_python OUT EXPR ERR_MSG)
  execute_process(
    COMMAND
    "${Python_EXECUTABLE}" "-c" "${EXPR}"
    OUTPUT_VARIABLE PYTHON_OUT
    RESULT_VARIABLE PYTHON_ERROR_CODE
    ERROR_VARIABLE PYTHON_STDERR
    OUTPUT_STRIP_TRAILING_WHITESPACE)

  if(NOT PYTHON_ERROR_CODE EQUAL 0)
    message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
  endif()
  set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
endfunction()

# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
macro (append_cmake_prefix_path PKG EXPR)
  run_python(_PREFIX_PATH
    "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
  list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
endmacro()
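For context, the probe that `append_cmake_prefix_path` performs amounts to running a one-line expression through the configured interpreter and appending the printed directory to `CMAKE_PREFIX_PATH` so `find_package(Torch)` can succeed. A rough Python sketch of the same probe, assuming only that torch is importable and that the macro is invoked with something like `torch.utils.cmake_prefix_path` (an assumption, since the call site is not part of this file):

    import subprocess
    import sys

    # Equivalent of append_cmake_prefix_path(torch "torch.utils.cmake_prefix_path")
    prefix = subprocess.run(
        [sys.executable, "-c", "import torch; print(torch.utils.cmake_prefix_path)"],
        check=True, capture_output=True, text=True).stdout.strip()
    print(prefix)  # this directory is what gets appended to CMAKE_PREFIX_PATH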
#
# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
# of CUDA source files. The names of the corresponding "hipified" sources are
# stored in `OUT_SRCS`.
#
function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
  #
  # Split into C++ and non-C++ (i.e. CUDA) sources.
  #
  set(SRCS ${ORIG_SRCS})
  set(CXX_SRCS ${ORIG_SRCS})
  list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$")
  list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$")

  #
  # Generate ROCm/HIP source file names from CUDA file names.
  # Since HIP files are generated code, they will appear in the build area
  # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir.
  #
  set(HIP_SRCS)
  foreach (SRC ${SRCS})
    string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC})
    string(REGEX REPLACE "cuda" "hip" SRC ${SRC})
    list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
  endforeach()

  set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
  add_custom_target(
    hipify${NAME}
    COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
    DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
    BYPRODUCTS ${HIP_SRCS}
    COMMENT "Running hipify on ${NAME} extension source files.")

  # Swap out original extension sources with hipified sources.
  list(APPEND HIP_SRCS ${CXX_SRCS})
  set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
endfunction()

#
# Get additional GPU compiler flags from torch.
#
function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
  if (${GPU_LANG} STREQUAL "CUDA")
    #
    # Get common NVCC flags from torch.
    #
    run_python(GPU_FLAGS
      "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))"
      "Failed to determine torch nvcc compiler flags")

    if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
      list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
    endif()

  elseif(${GPU_LANG} STREQUAL "HIP")
    #
    # Get common HIP/HIPCC flags from torch.
    #
    run_python(GPU_FLAGS
      "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
      "Failed to determine torch nvcc compiler flags")

    list(APPEND GPU_FLAGS
      "-DUSE_ROCM"
      "-U__HIP_NO_HALF_CONVERSIONS__"
      "-U__HIP_NO_HALF_OPERATORS__"
      "-fno-gpu-rdc")

  endif()
  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
endfunction()

# Macro for converting a `gencode` version number to a cmake version number.
macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
endmacro()

#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
# `GPU_ARCHES`.
#
# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
#
macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
  set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN})
  message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}")

  if (${GPU_LANG} STREQUAL "HIP")
    #
    # `GPU_ARCHES` controls the `--offload-arch` flags.
    # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
    # via the `PYTORCH_ROCM_ARCH` env variable.
    #

    #
    # Find the intersection of the supported + detected architectures to
    # set the module architecture flags.
    #
    set(${GPU_ARCHES})
    foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
      if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        list(APPEND ${GPU_ARCHES} ${_ARCH})
      endif()
    endforeach()

    if(NOT ${GPU_ARCHES})
      message(FATAL_ERROR
        "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()

  elseif(${GPU_LANG} STREQUAL "CUDA")
    #
    # Setup/process CUDA arch flags.
    #
    # The torch cmake setup hardcodes the detected architecture flags in
    # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
    # can't be modified on a per-target basis, e.g. for the `punica` extension.
    # So, all the `-gencode` flags need to be extracted and removed from
    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
    # Since it's not possible to use `target_compile_options` for adding target
    # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
    # must be used instead. This requires repackaging the architecture flags
    # into a format that cmake expects for `CUDA_ARCHITECTURES`.
    #
    # This is a bit fragile in that it depends on torch using `-gencode` as opposed
    # to one of the other nvcc options to specify architectures.
    #
    # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
    # detected architectures.
    #
    message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
    string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
      ${CMAKE_CUDA_FLAGS})

    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
    # and passed back via the `CUDA_ARCHITECTURES` property.
    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
      ${CMAKE_CUDA_FLAGS})

    # If this error is triggered, it might mean that torch has changed how it sets
    # up nvcc architecture code generation flags.
    if (NOT _CUDA_ARCH_FLAGS)
      message(FATAL_ERROR
        "Could not find any architecture related code generation flags in "
        "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
    endif()

    message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
    message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")

    # Initialize the architecture lists to empty.
    set(${GPU_ARCHES})

    # Process each `gencode` flag.
    foreach(_ARCH ${_CUDA_ARCH_FLAGS})
      # For each flag, extract the version number and whether it refers to PTX
      # or native code.
      # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
      # for that match.

      string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
      if (_COMPUTE)
        set(_COMPUTE ${CMAKE_MATCH_1})
      endif()

      string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
      if (_SM)
        set(_SM ${CMAKE_MATCH_1})
      endif()

      string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
      if (_CODE)
        set(_CODE ${CMAKE_MATCH_1})
      endif()

      # Make sure the virtual architecture can be matched.
      if (NOT _COMPUTE)
        message(FATAL_ERROR
          "Could not determine virtual architecture from: ${_ARCH}.")
      endif()

      # One of sm_ or compute_ must exist.
      if ((NOT _SM) AND (NOT _CODE))
        message(FATAL_ERROR
          "Could not determine a codegen architecture from: ${_ARCH}.")
      endif()

      if (_SM)
        # The -real suffix tells CMake to generate only ELF (SASS) code for the kernels.
        # We want this; otherwise the PTX that is added by default would increase binary size.
        set(_VIRT "-real")
        set(_CODE_ARCH ${_SM})
      else()
        # The -virtual suffix tells CMake to generate PTX code for the kernels.
        set(_VIRT "-virtual")
        set(_CODE_ARCH ${_CODE})
      endif()

      # Check if the current version is in the supported arch list.
      string_to_ver(_CODE_VER ${_CODE_ARCH})
      if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        message(STATUS "discarding unsupported CUDA arch ${_CODE_VER}.")
        continue()
      endif()

      # Add it to the arch list.
      list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
    endforeach()
  endif()
  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
endmacro()
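To make the `-gencode` handling above concrete, here is a small Python sketch (not part of the build) of the same repackaging rule: `code=sm_XX` becomes the CMake entry `XX-real` (SASS only) and `code=compute_XX` becomes `XX-virtual` (PTX), with versions outside the supported list dropped:

    import re

    # Sketch of the gencode -> CUDA_ARCHITECTURES repackaging done by override_gpu_arches.
    def repackage_gencode_flags(cmake_cuda_flags, supported_versions):
        arches = []
        for flag in re.findall(r"-gencode arch=[^ ]+", cmake_cuda_flags):
            sm = re.search(r"code=sm_([0-9]+a?)", flag)
            code = re.search(r"code=compute_([0-9]+a?)", flag)
            if not (sm or code):
                raise ValueError(f"cannot determine codegen architecture from: {flag}")
            # sm_XX -> native code only ("-real"); compute_XX -> PTX only ("-virtual")
            arch, suffix = (sm.group(1), "-real") if sm else (code.group(1), "-virtual")
            ver = re.sub(r"([0-9]+)([0-9])", r"\1.\2", arch)  # mirrors string_to_ver: "80" -> "8.0"
            if ver in supported_versions:
                arches.append(arch + suffix)
        return arches

    # repackage_gencode_flags("-gencode arch=compute_80,code=sm_80", {"8.0", "9.0"}) == ["80-real"]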
#
# Define a target named `GPU_MOD_NAME` for a single extension. The
# arguments are:
#
# DESTINATION <dest>         - Module destination directory.
# LANGUAGE <lang>            - The GPU language for this module, e.g. CUDA, HIP,
#                              etc.
# SOURCES <sources>          - List of source files relative to CMakeLists.txt
#                              directory.
#
# Optional arguments:
#
# ARCHITECTURES <arches>     - A list of target GPU architectures in cmake
#                              format.
#                              Refer to the `CMAKE_CUDA_ARCHITECTURES` documentation
#                              and `CMAKE_HIP_ARCHITECTURES` for more info.
#                              ARCHITECTURES will use cmake's defaults if
#                              not provided.
# COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
# LIBRARIES <libraries>      - Extra link libraries.
# WITH_SOABI                 - Generate library with python SOABI suffix name.
#
# Note: optimization level/debug info is set via cmake build type.
#
function (define_gpu_extension_target GPU_MOD_NAME)
  cmake_parse_arguments(PARSE_ARGV 1
    GPU
    "WITH_SOABI"
    "DESTINATION;LANGUAGE"
    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")

  # Add hipify preprocessing step when building with HIP/ROCm.
  if (GPU_LANGUAGE STREQUAL "HIP")
    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
  endif()

  if (GPU_WITH_SOABI)
    set(GPU_WITH_SOABI WITH_SOABI)
  else()
    set(GPU_WITH_SOABI)
  endif()

  Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})

  if (GPU_LANGUAGE STREQUAL "HIP")
    # Make this target dependent on the hipify preprocessor step.
    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
  endif()

  if (GPU_ARCHITECTURES)
    set_target_properties(${GPU_MOD_NAME} PROPERTIES
      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
  endif()

  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)

  target_compile_options(${GPU_MOD_NAME} PRIVATE
    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)

  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")

  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
    ${GPU_INCLUDE_DIRECTORIES})

  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
    ${GPU_LIBRARIES})

  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
  if (GPU_LANGUAGE STREQUAL "CUDA")
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
      ${CUDA_LIBRARIES})
  else()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()

  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
endfunction()
719
collect_env.py
Normal file
@@ -0,0 +1,719 @@
|
||||
# ruff: noqa
|
||||
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
|
||||
|
||||
# Unlike the rest of PyTorch, this file must be Python 2 compliant.
|
||||
# This script outputs relevant system environment info
|
||||
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
|
||||
import datetime
|
||||
import locale
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import namedtuple
|
||||
|
||||
try:
|
||||
import torch
|
||||
TORCH_AVAILABLE = True
|
||||
except (ImportError, NameError, AttributeError, OSError):
|
||||
TORCH_AVAILABLE = False
|
||||
|
||||
# System Environment Information
|
||||
SystemEnv = namedtuple(
|
||||
'SystemEnv',
|
||||
[
|
||||
'torch_version',
|
||||
'is_debug_build',
|
||||
'cuda_compiled_version',
|
||||
'gcc_version',
|
||||
'clang_version',
|
||||
'cmake_version',
|
||||
'os',
|
||||
'libc_version',
|
||||
'python_version',
|
||||
'python_platform',
|
||||
'is_cuda_available',
|
||||
'cuda_runtime_version',
|
||||
'cuda_module_loading',
|
||||
'nvidia_driver_version',
|
||||
'nvidia_gpu_models',
|
||||
'cudnn_version',
|
||||
'pip_version', # 'pip' or 'pip3'
|
||||
'pip_packages',
|
||||
'conda_packages',
|
||||
'hip_compiled_version',
|
||||
'hip_runtime_version',
|
||||
'miopen_runtime_version',
|
||||
'caching_allocator_config',
|
||||
'is_xnnpack_available',
|
||||
'cpu_info',
|
||||
'rocm_version', # vllm specific field
|
||||
'neuron_sdk_version', # vllm specific field
|
||||
'vllm_version', # vllm specific field
|
||||
'vllm_build_flags', # vllm specific field
|
||||
'gpu_topo', # vllm specific field
|
||||
])
|
||||
|
||||
DEFAULT_CONDA_PATTERNS = {
|
||||
"torch",
|
||||
"numpy",
|
||||
"cudatoolkit",
|
||||
"soumith",
|
||||
"mkl",
|
||||
"magma",
|
||||
"triton",
|
||||
"optree",
|
||||
}
|
||||
|
||||
DEFAULT_PIP_PATTERNS = {
|
||||
"torch",
|
||||
"numpy",
|
||||
"mypy",
|
||||
"flake8",
|
||||
"triton",
|
||||
"optree",
|
||||
"onnx",
|
||||
}
|
||||
|
||||
|
||||
def run(command):
|
||||
"""Return (return-code, stdout, stderr)."""
|
||||
shell = True if type(command) is str else False
|
||||
p = subprocess.Popen(command,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
shell=shell)
|
||||
raw_output, raw_err = p.communicate()
|
||||
rc = p.returncode
|
||||
if get_platform() == 'win32':
|
||||
enc = 'oem'
|
||||
else:
|
||||
enc = locale.getpreferredencoding()
|
||||
output = raw_output.decode(enc)
|
||||
err = raw_err.decode(enc)
|
||||
return rc, output.strip(), err.strip()
|
||||
|
||||
|
||||
def run_and_read_all(run_lambda, command):
|
||||
"""Run command using run_lambda; reads and returns entire output if rc is 0."""
|
||||
rc, out, _ = run_lambda(command)
|
||||
if rc != 0:
|
||||
return None
|
||||
return out
|
||||
|
||||
|
||||
def run_and_parse_first_match(run_lambda, command, regex):
|
||||
"""Run command using run_lambda, returns the first regex match if it exists."""
|
||||
rc, out, _ = run_lambda(command)
|
||||
if rc != 0:
|
||||
return None
|
||||
match = re.search(regex, out)
|
||||
if match is None:
|
||||
return None
|
||||
return match.group(1)
|
||||
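A quick illustration of how these helpers are used throughout the rest of the file (assumes the `run` and `run_and_parse_first_match` definitions above are in scope):

    # Grab the first line of `gcc --version` and keep everything after "gcc "
    # (returns None if gcc is unavailable or the pattern does not match).
    gcc_ver = run_and_parse_first_match(run, 'gcc --version', r'gcc (.*)')
    print(gcc_ver)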
|
||||
|
||||
def run_and_return_first_line(run_lambda, command):
|
||||
"""Run command using run_lambda and returns first line if output is not empty."""
|
||||
rc, out, _ = run_lambda(command)
|
||||
if rc != 0:
|
||||
return None
|
||||
return out.split('\n')[0]
|
||||
|
||||
|
||||
def get_conda_packages(run_lambda, patterns=None):
|
||||
if patterns is None:
|
||||
patterns = DEFAULT_CONDA_PATTERNS
|
||||
conda = os.environ.get('CONDA_EXE', 'conda')
|
||||
out = run_and_read_all(run_lambda, "{} list".format(conda))
|
||||
if out is None:
|
||||
return out
|
||||
|
||||
return "\n".join(line for line in out.splitlines()
|
||||
if not line.startswith("#") and any(name in line
|
||||
for name in patterns))
|
||||
|
||||
|
||||
def get_gcc_version(run_lambda):
|
||||
return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)')
|
||||
|
||||
|
||||
def get_clang_version(run_lambda):
|
||||
return run_and_parse_first_match(run_lambda, 'clang --version',
|
||||
r'clang version (.*)')
|
||||
|
||||
|
||||
def get_cmake_version(run_lambda):
|
||||
return run_and_parse_first_match(run_lambda, 'cmake --version',
|
||||
r'cmake (.*)')
|
||||
|
||||
|
||||
def get_nvidia_driver_version(run_lambda):
|
||||
if get_platform() == 'darwin':
|
||||
cmd = 'kextstat | grep -i cuda'
|
||||
return run_and_parse_first_match(run_lambda, cmd,
|
||||
r'com[.]nvidia[.]CUDA [(](.*?)[)]')
|
||||
smi = get_nvidia_smi()
|
||||
return run_and_parse_first_match(run_lambda, smi,
|
||||
r'Driver Version: (.*?) ')
|
||||
|
||||
|
||||
def get_gpu_info(run_lambda):
|
||||
if get_platform() == 'darwin' or (TORCH_AVAILABLE and hasattr(
|
||||
torch.version, 'hip') and torch.version.hip is not None):
|
||||
if TORCH_AVAILABLE and torch.cuda.is_available():
|
||||
if torch.version.hip is not None:
|
||||
prop = torch.cuda.get_device_properties(0)
|
||||
if hasattr(prop, "gcnArchName"):
|
||||
gcnArch = " ({})".format(prop.gcnArchName)
|
||||
else:
|
||||
gcnArch = "NoGCNArchNameOnOldPyTorch"
|
||||
else:
|
||||
gcnArch = ""
|
||||
return torch.cuda.get_device_name(None) + gcnArch
|
||||
return None
|
||||
smi = get_nvidia_smi()
|
||||
uuid_regex = re.compile(r' \(UUID: .+?\)')
|
||||
rc, out, _ = run_lambda(smi + ' -L')
|
||||
if rc != 0:
|
||||
return None
|
||||
# Anonymize GPUs by removing their UUID
|
||||
return re.sub(uuid_regex, '', out)
|
||||
|
||||
|
||||
def get_running_cuda_version(run_lambda):
|
||||
return run_and_parse_first_match(run_lambda, 'nvcc --version',
|
||||
r'release .+ V(.*)')
|
||||
|
||||
|
||||
def get_cudnn_version(run_lambda):
|
||||
"""Return a list of libcudnn.so; it's hard to tell which one is being used."""
|
||||
if get_platform() == 'win32':
|
||||
system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
|
||||
cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%")
|
||||
where_cmd = os.path.join(system_root, 'System32', 'where')
|
||||
cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
|
||||
elif get_platform() == 'darwin':
|
||||
# CUDA libraries and drivers can be found in /usr/local/cuda/. See
|
||||
# https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
|
||||
# https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
|
||||
# Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
|
||||
cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*'
|
||||
else:
|
||||
cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
|
||||
rc, out, _ = run_lambda(cudnn_cmd)
|
||||
# find will return 1 if there are permission errors or if not found
|
||||
if len(out) == 0 or (rc != 1 and rc != 0):
|
||||
l = os.environ.get('CUDNN_LIBRARY')
|
||||
if l is not None and os.path.isfile(l):
|
||||
return os.path.realpath(l)
|
||||
return None
|
||||
files_set = set()
|
||||
for fn in out.split('\n'):
|
||||
fn = os.path.realpath(fn) # eliminate symbolic links
|
||||
if os.path.isfile(fn):
|
||||
files_set.add(fn)
|
||||
if not files_set:
|
||||
return None
|
||||
# Alphabetize the result because the order is non-deterministic otherwise
|
||||
files = sorted(files_set)
|
||||
if len(files) == 1:
|
||||
return files[0]
|
||||
result = '\n'.join(files)
|
||||
return 'Probably one of the following:\n{}'.format(result)
|
||||
|
||||
|
||||
def get_nvidia_smi():
|
||||
# Note: nvidia-smi is currently available only on Windows and Linux
|
||||
smi = 'nvidia-smi'
|
||||
if get_platform() == 'win32':
|
||||
system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
|
||||
program_files_root = os.environ.get('PROGRAMFILES',
|
||||
'C:\\Program Files')
|
||||
legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation',
|
||||
'NVSMI', smi)
|
||||
new_path = os.path.join(system_root, 'System32', smi)
|
||||
smis = [new_path, legacy_path]
|
||||
for candidate_smi in smis:
|
||||
if os.path.exists(candidate_smi):
|
||||
smi = '"{}"'.format(candidate_smi)
|
||||
break
|
||||
return smi
|
||||
|
||||
|
||||
def get_rocm_version(run_lambda):
|
||||
"""Returns the ROCm version if available, otherwise 'N/A'."""
|
||||
return run_and_parse_first_match(run_lambda, 'hipcc --version',
|
||||
r'HIP version: (\S+)')
|
||||
|
||||
|
||||
def get_neuron_sdk_version(run_lambda):
|
||||
# Adapted from your install script
|
||||
try:
|
||||
result = run_lambda(["neuron-ls"])
|
||||
return result if result[0] == 0 else 'N/A'
|
||||
except Exception:
|
||||
return 'N/A'
|
||||
|
||||
|
||||
def get_vllm_version():
|
||||
try:
|
||||
import vllm
|
||||
return vllm.__version__
|
||||
except ImportError:
|
||||
return 'N/A'
|
||||
|
||||
|
||||
def summarize_vllm_build_flags():
|
||||
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
|
||||
return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format(
|
||||
os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'),
|
||||
'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled',
|
||||
'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled',
|
||||
)
|
||||
|
||||
|
||||
def get_gpu_topo(run_lambda):
|
||||
if get_platform() == 'linux':
|
||||
return run_and_read_all(run_lambda, 'nvidia-smi topo -m')
|
||||
return None
|
||||
|
||||
|
||||
# example outputs of CPU infos
|
||||
# * linux
|
||||
# Architecture: x86_64
|
||||
# CPU op-mode(s): 32-bit, 64-bit
|
||||
# Address sizes: 46 bits physical, 48 bits virtual
|
||||
# Byte Order: Little Endian
|
||||
# CPU(s): 128
|
||||
# On-line CPU(s) list: 0-127
|
||||
# Vendor ID: GenuineIntel
|
||||
# Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
|
||||
# CPU family: 6
|
||||
# Model: 106
|
||||
# Thread(s) per core: 2
|
||||
# Core(s) per socket: 32
|
||||
# Socket(s): 2
|
||||
# Stepping: 6
|
||||
# BogoMIPS: 5799.78
|
||||
# Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr
|
||||
# sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl
|
||||
# xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16
|
||||
# pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
|
||||
# hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced
|
||||
# fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap
|
||||
# avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1
|
||||
# xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq
|
||||
# avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities
|
||||
# Virtualization features:
|
||||
# Hypervisor vendor: KVM
|
||||
# Virtualization type: full
|
||||
# Caches (sum of all):
|
||||
# L1d: 3 MiB (64 instances)
|
||||
# L1i: 2 MiB (64 instances)
|
||||
# L2: 80 MiB (64 instances)
|
||||
# L3: 108 MiB (2 instances)
|
||||
# NUMA:
|
||||
# NUMA node(s): 2
|
||||
# NUMA node0 CPU(s): 0-31,64-95
|
||||
# NUMA node1 CPU(s): 32-63,96-127
|
||||
# Vulnerabilities:
|
||||
# Itlb multihit: Not affected
|
||||
# L1tf: Not affected
|
||||
# Mds: Not affected
|
||||
# Meltdown: Not affected
|
||||
# Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
|
||||
# Retbleed: Not affected
|
||||
# Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
|
||||
# Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
|
||||
# Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence
|
||||
# Srbds: Not affected
|
||||
# Tsx async abort: Not affected
|
||||
# * win32
|
||||
# Architecture=9
|
||||
# CurrentClockSpeed=2900
|
||||
# DeviceID=CPU0
|
||||
# Family=179
|
||||
# L2CacheSize=40960
|
||||
# L2CacheSpeed=
|
||||
# Manufacturer=GenuineIntel
|
||||
# MaxClockSpeed=2900
|
||||
# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
|
||||
# ProcessorType=3
|
||||
# Revision=27142
|
||||
#
|
||||
# Architecture=9
|
||||
# CurrentClockSpeed=2900
|
||||
# DeviceID=CPU1
|
||||
# Family=179
|
||||
# L2CacheSize=40960
|
||||
# L2CacheSpeed=
|
||||
# Manufacturer=GenuineIntel
|
||||
# MaxClockSpeed=2900
|
||||
# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
|
||||
# ProcessorType=3
|
||||
# Revision=27142
|
||||
|
||||
|
||||
def get_cpu_info(run_lambda):
|
||||
rc, out, err = 0, '', ''
|
||||
if get_platform() == 'linux':
|
||||
rc, out, err = run_lambda('lscpu')
|
||||
elif get_platform() == 'win32':
|
||||
rc, out, err = run_lambda(
|
||||
'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
|
||||
CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE'
|
||||
)
|
||||
elif get_platform() == 'darwin':
|
||||
rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
|
||||
cpu_info = 'None'
|
||||
if rc == 0:
|
||||
cpu_info = out
|
||||
else:
|
||||
cpu_info = err
|
||||
return cpu_info
|
||||
|
||||
|
||||
def get_platform():
|
||||
if sys.platform.startswith('linux'):
|
||||
return 'linux'
|
||||
elif sys.platform.startswith('win32'):
|
||||
return 'win32'
|
||||
elif sys.platform.startswith('cygwin'):
|
||||
return 'cygwin'
|
||||
elif sys.platform.startswith('darwin'):
|
||||
return 'darwin'
|
||||
else:
|
||||
return sys.platform
|
||||
|
||||
|
||||
def get_mac_version(run_lambda):
|
||||
return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion',
|
||||
r'(.*)')
|
||||
|
||||
|
||||
def get_windows_version(run_lambda):
|
||||
system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
|
||||
wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic')
|
||||
findstr_cmd = os.path.join(system_root, 'System32', 'findstr')
|
||||
return run_and_read_all(
|
||||
run_lambda,
|
||||
'{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd))
|
||||
|
||||
|
||||
def get_lsb_version(run_lambda):
|
||||
return run_and_parse_first_match(run_lambda, 'lsb_release -a',
|
||||
r'Description:\t(.*)')
|
||||
|
||||
|
||||
def check_release_file(run_lambda):
|
||||
return run_and_parse_first_match(run_lambda, 'cat /etc/*-release',
|
||||
r'PRETTY_NAME="(.*)"')
|
||||
|
||||
|
||||
def get_os(run_lambda):
|
||||
from platform import machine
|
||||
platform = get_platform()
|
||||
|
||||
if platform == 'win32' or platform == 'cygwin':
|
||||
return get_windows_version(run_lambda)
|
||||
|
||||
if platform == 'darwin':
|
||||
version = get_mac_version(run_lambda)
|
||||
if version is None:
|
||||
return None
|
||||
return 'macOS {} ({})'.format(version, machine())
|
||||
|
||||
if platform == 'linux':
|
||||
# Ubuntu/Debian based
|
||||
desc = get_lsb_version(run_lambda)
|
||||
if desc is not None:
|
||||
return '{} ({})'.format(desc, machine())
|
||||
|
||||
# Try reading /etc/*-release
|
||||
desc = check_release_file(run_lambda)
|
||||
if desc is not None:
|
||||
return '{} ({})'.format(desc, machine())
|
||||
|
||||
return '{} ({})'.format(platform, machine())
|
||||
|
||||
# Unknown platform
|
||||
return platform
|
||||
|
||||
|
||||
def get_python_platform():
|
||||
import platform
|
||||
return platform.platform()
|
||||
|
||||
|
||||
def get_libc_version():
|
||||
import platform
|
||||
if get_platform() != 'linux':
|
||||
return 'N/A'
|
||||
return '-'.join(platform.libc_ver())
|
||||
|
||||
|
||||
def get_pip_packages(run_lambda, patterns=None):
|
||||
"""Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
|
||||
if patterns is None:
|
||||
patterns = DEFAULT_PIP_PATTERNS
|
||||
|
||||
# People generally have `pip` as `pip` or `pip3`
|
||||
# But here it is invoked as `python -mpip`
|
||||
def run_with_pip(pip):
|
||||
out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
|
||||
return "\n".join(line for line in out.splitlines()
|
||||
if any(name in line for name in patterns))
|
||||
|
||||
pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
|
||||
out = run_with_pip([sys.executable, '-mpip'])
|
||||
|
||||
return pip_version, out
|
||||
|
||||
|
||||
def get_cachingallocator_config():
|
||||
ca_config = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '')
|
||||
return ca_config
|
||||
|
||||
|
||||
def get_cuda_module_loading_config():
|
||||
if TORCH_AVAILABLE and torch.cuda.is_available():
|
||||
torch.cuda.init()
|
||||
config = os.environ.get('CUDA_MODULE_LOADING', '')
|
||||
return config
|
||||
else:
|
||||
return "N/A"
|
||||
|
||||
|
||||
def is_xnnpack_available():
|
||||
if TORCH_AVAILABLE:
|
||||
import torch.backends.xnnpack
|
||||
return str(
|
||||
torch.backends.xnnpack.enabled) # type: ignore[attr-defined]
|
||||
else:
|
||||
return "N/A"
|
||||
|
||||
|
||||
def get_env_info():
|
||||
run_lambda = run
|
||||
pip_version, pip_list_output = get_pip_packages(run_lambda)
|
||||
|
||||
if TORCH_AVAILABLE:
|
||||
version_str = torch.__version__
|
||||
debug_mode_str = str(torch.version.debug)
|
||||
cuda_available_str = str(torch.cuda.is_available())
|
||||
cuda_version_str = torch.version.cuda
|
||||
if not hasattr(torch.version,
|
||||
'hip') or torch.version.hip is None: # cuda version
|
||||
hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
|
||||
else: # HIP version
|
||||
|
||||
def get_version_or_na(cfg, prefix):
|
||||
_lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
|
||||
return _lst[0] if _lst else 'N/A'
|
||||
|
||||
cfg = torch._C._show_config().split('\n')
|
||||
hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime')
|
||||
miopen_runtime_version = get_version_or_na(cfg, 'MIOpen')
|
||||
cuda_version_str = 'N/A'
|
||||
hip_compiled_version = torch.version.hip
|
||||
else:
|
||||
version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A'
|
||||
hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
|
||||
|
||||
sys_version = sys.version.replace("\n", " ")
|
||||
|
||||
conda_packages = get_conda_packages(run_lambda)
|
||||
|
||||
rocm_version = get_rocm_version(run_lambda)
|
||||
neuron_sdk_version = get_neuron_sdk_version(run_lambda)
|
||||
vllm_version = get_vllm_version()
|
||||
vllm_build_flags = summarize_vllm_build_flags()
|
||||
gpu_topo = get_gpu_topo(run_lambda)
|
||||
|
||||
return SystemEnv(
|
||||
torch_version=version_str,
|
||||
is_debug_build=debug_mode_str,
|
||||
python_version='{} ({}-bit runtime)'.format(
|
||||
sys_version,
|
||||
sys.maxsize.bit_length() + 1),
|
||||
python_platform=get_python_platform(),
|
||||
is_cuda_available=cuda_available_str,
|
||||
cuda_compiled_version=cuda_version_str,
|
||||
cuda_runtime_version=get_running_cuda_version(run_lambda),
|
||||
cuda_module_loading=get_cuda_module_loading_config(),
|
||||
nvidia_gpu_models=get_gpu_info(run_lambda),
|
||||
nvidia_driver_version=get_nvidia_driver_version(run_lambda),
|
||||
cudnn_version=get_cudnn_version(run_lambda),
|
||||
hip_compiled_version=hip_compiled_version,
|
||||
hip_runtime_version=hip_runtime_version,
|
||||
miopen_runtime_version=miopen_runtime_version,
|
||||
pip_version=pip_version,
|
||||
pip_packages=pip_list_output,
|
||||
conda_packages=conda_packages,
|
||||
os=get_os(run_lambda),
|
||||
libc_version=get_libc_version(),
|
||||
gcc_version=get_gcc_version(run_lambda),
|
||||
clang_version=get_clang_version(run_lambda),
|
||||
cmake_version=get_cmake_version(run_lambda),
|
||||
caching_allocator_config=get_cachingallocator_config(),
|
||||
is_xnnpack_available=is_xnnpack_available(),
|
||||
cpu_info=get_cpu_info(run_lambda),
|
||||
rocm_version=rocm_version,
|
||||
neuron_sdk_version=neuron_sdk_version,
|
||||
vllm_version=vllm_version,
|
||||
vllm_build_flags=vllm_build_flags,
|
||||
gpu_topo=gpu_topo,
|
||||
)
|
||||
|
||||
|
||||
env_info_fmt = """
|
||||
PyTorch version: {torch_version}
|
||||
Is debug build: {is_debug_build}
|
||||
CUDA used to build PyTorch: {cuda_compiled_version}
|
||||
ROCM used to build PyTorch: {hip_compiled_version}
|
||||
|
||||
OS: {os}
|
||||
GCC version: {gcc_version}
|
||||
Clang version: {clang_version}
|
||||
CMake version: {cmake_version}
|
||||
Libc version: {libc_version}
|
||||
|
||||
Python version: {python_version}
|
||||
Python platform: {python_platform}
|
||||
Is CUDA available: {is_cuda_available}
|
||||
CUDA runtime version: {cuda_runtime_version}
|
||||
CUDA_MODULE_LOADING set to: {cuda_module_loading}
|
||||
GPU models and configuration: {nvidia_gpu_models}
|
||||
Nvidia driver version: {nvidia_driver_version}
|
||||
cuDNN version: {cudnn_version}
|
||||
HIP runtime version: {hip_runtime_version}
|
||||
MIOpen runtime version: {miopen_runtime_version}
|
||||
Is XNNPACK available: {is_xnnpack_available}
|
||||
|
||||
CPU:
|
||||
{cpu_info}
|
||||
|
||||
Versions of relevant libraries:
|
||||
{pip_packages}
|
||||
{conda_packages}
|
||||
""".strip()
|
||||
|
||||
env_info_fmt += """
|
||||
ROCM Version: {rocm_version}
|
||||
Neuron SDK Version: {neuron_sdk_version}
|
||||
vLLM Version: {vllm_version}
|
||||
vLLM Build Flags:
|
||||
{vllm_build_flags}
|
||||
GPU Topology:
|
||||
{gpu_topo}
|
||||
""".strip()
|
||||
|
||||
|
||||
def pretty_str(envinfo):
|
||||
|
||||
def replace_nones(dct, replacement='Could not collect'):
|
||||
for key in dct.keys():
|
||||
if dct[key] is not None:
|
||||
continue
|
||||
dct[key] = replacement
|
||||
return dct
|
||||
|
||||
def replace_bools(dct, true='Yes', false='No'):
|
||||
for key in dct.keys():
|
||||
if dct[key] is True:
|
||||
dct[key] = true
|
||||
elif dct[key] is False:
|
||||
dct[key] = false
|
||||
return dct
|
||||
|
||||
def prepend(text, tag='[prepend]'):
|
||||
lines = text.split('\n')
|
||||
updated_lines = [tag + line for line in lines]
|
||||
return '\n'.join(updated_lines)
|
||||
|
||||
def replace_if_empty(text, replacement='No relevant packages'):
|
||||
if text is not None and len(text) == 0:
|
||||
return replacement
|
||||
return text
|
||||
|
||||
def maybe_start_on_next_line(string):
|
||||
# If `string` is multiline, prepend a \n to it.
|
||||
if string is not None and len(string.split('\n')) > 1:
|
||||
return '\n{}\n'.format(string)
|
||||
return string
|
||||
|
||||
mutable_dict = envinfo._asdict()
|
||||
|
||||
# If nvidia_gpu_models is multiline, start on the next line
|
||||
mutable_dict['nvidia_gpu_models'] = \
|
||||
maybe_start_on_next_line(envinfo.nvidia_gpu_models)
|
||||
|
||||
# If the machine doesn't have CUDA, report some fields as 'No CUDA'
|
||||
dynamic_cuda_fields = [
|
||||
'cuda_runtime_version',
|
||||
'nvidia_gpu_models',
|
||||
'nvidia_driver_version',
|
||||
]
|
||||
all_cuda_fields = dynamic_cuda_fields + ['cudnn_version']
|
||||
all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None
|
||||
for field in dynamic_cuda_fields)
|
||||
if TORCH_AVAILABLE and not torch.cuda.is_available(
|
||||
) and all_dynamic_cuda_fields_missing:
|
||||
for field in all_cuda_fields:
|
||||
mutable_dict[field] = 'No CUDA'
|
||||
if envinfo.cuda_compiled_version is None:
|
||||
mutable_dict['cuda_compiled_version'] = 'None'
|
||||
|
||||
# Replace True with Yes, False with No
|
||||
mutable_dict = replace_bools(mutable_dict)
|
||||
|
||||
# Replace all None objects with 'Could not collect'
|
||||
mutable_dict = replace_nones(mutable_dict)
|
||||
|
||||
# If either of these are '', replace with 'No relevant packages'
|
||||
mutable_dict['pip_packages'] = replace_if_empty(
|
||||
mutable_dict['pip_packages'])
|
||||
mutable_dict['conda_packages'] = replace_if_empty(
|
||||
mutable_dict['conda_packages'])
|
||||
|
||||
# Tag conda and pip packages with a prefix
|
||||
# If they were previously None, they'll show up as ie '[conda] Could not collect'
|
||||
if mutable_dict['pip_packages']:
|
||||
mutable_dict['pip_packages'] = prepend(
|
||||
mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version))
|
||||
if mutable_dict['conda_packages']:
|
||||
mutable_dict['conda_packages'] = prepend(
|
||||
mutable_dict['conda_packages'], '[conda] ')
|
||||
mutable_dict['cpu_info'] = envinfo.cpu_info
|
||||
return env_info_fmt.format(**mutable_dict)
|
||||
|
||||
|
||||
def get_pretty_env_info():
|
||||
return pretty_str(get_env_info())
|
||||
|
||||
|
||||
def main():
|
||||
print("Collecting environment information...")
|
||||
output = get_pretty_env_info()
|
||||
print(output)
|
||||
|
||||
if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(
|
||||
torch.utils, '_crash_handler'):
|
||||
minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
|
||||
if sys.platform == "linux" and os.path.exists(minidump_dir):
|
||||
dumps = [
|
||||
os.path.join(minidump_dir, dump)
|
||||
for dump in os.listdir(minidump_dir)
|
||||
]
|
||||
latest = max(dumps, key=os.path.getctime)
|
||||
ctime = os.path.getctime(latest)
|
||||
creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
|
||||
'%Y-%m-%d %H:%M:%S')
|
||||
msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \
|
||||
"if this is related to your bug please include it when you file a report ***"
|
||||
print(msg, file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
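Beyond the CLI entry point, the module can also be used programmatically in bug-report tooling. A small sketch, assuming the file is importable as collect_env on the path:

    import collect_env

    # Gather everything once and pick out the vLLM-specific fields.
    env = collect_env.get_env_info()       # returns the SystemEnv namedtuple
    print(env.vllm_version, env.rocm_version, env.neuron_sdk_version)
    print(collect_env.pretty_str(env))     # same report that main() prints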
@@ -33,12 +33,25 @@ template<typename T>
__device__ __forceinline__ T gelu_kernel(const T& x) {
  // Equivalent to PyTorch GELU with 'none' approximation.
  // Refer to:
  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38
  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
  const float f = (float) x;
  constexpr float ALPHA = M_SQRT1_2;
  return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
}

template<typename T>
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
  // Equivalent to PyTorch GELU with 'tanh' approximation.
  // Refer to:
  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
  const float f = (float) x;
  constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
  constexpr float KAPPA = 0.044715;
  float x_cube = f * f * f;
  float inner = BETA * (f + KAPPA * x_cube);
  return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
}

} // namespace vllm

// Launch activation and gating kernel.
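For reference, the two kernels above correspond to torch's 'none' and 'tanh' GELU approximations; a short PyTorch sketch (not part of the kernel code) that checks both formulas numerically:

    import math
    import torch
    import torch.nn.functional as F

    x = torch.randn(8, dtype=torch.float32)

    # gelu_kernel: 0.5 * x * (1 + erf(x / sqrt(2)))
    exact = 0.5 * x * (1.0 + torch.erf(x * math.sqrt(0.5)))
    # gelu_tanh_kernel: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    tanh_approx = 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x ** 3)))

    assert torch.allclose(exact, F.gelu(x), atol=1e-6)
    assert torch.allclose(tanh_approx, F.gelu(x, approximate='tanh'), atol=1e-6)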
@@ -73,6 +86,13 @@ void gelu_and_mul(
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
|
||||
}
|
||||
|
||||
void gelu_tanh_and_mul(
|
||||
torch::Tensor& out, // [..., d]
|
||||
torch::Tensor& input) // [..., 2 * d]
|
||||
{
|
||||
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
|
||||
}
|
||||
|
||||
namespace vllm {
|
||||
|
||||
// Element-wise activation kernel template.
|
||||
|
@@ -15,9 +15,6 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#ifdef USE_ROCM
|
||||
#include <hip/hip_runtime.h>
|
||||
#endif
|
||||
|
||||
#include <torch/extension.h>
|
||||
#include <ATen/cuda/CUDAContext.h>
|
||||
@@ -31,11 +28,6 @@
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define WARP_SIZE 32
|
||||
#else
|
||||
#define WARP_SIZE warpSize
|
||||
#endif
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
|
||||
|
@@ -1,5 +1,15 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef USE_ROCM
|
||||
#include <hip/hip_runtime.h>
|
||||
#endif
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define WARP_SIZE 32
|
||||
#else
|
||||
#define WARP_SIZE warpSize
|
||||
#endif
|
||||
|
||||
#ifndef USE_ROCM
|
||||
#define VLLM_LDG(arg) __ldg(arg)
|
||||
#else
|
||||
|
@@ -29,7 +29,7 @@ fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data,
|
||||
std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t));
|
||||
}
|
||||
return (fptr_t) new vllm::CustomAllreduce(
|
||||
reinterpret_cast<vllm::Metadata *>(meta.data_ptr()), rank_data.data_ptr(),
|
||||
reinterpret_cast<vllm::Signal *>(meta.data_ptr()), rank_data.data_ptr(),
|
||||
rank_data.numel(), ipc_handles, offsets, rank, full_nvlink);
|
||||
}
|
||||
|
||||
@@ -62,9 +62,9 @@ bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size,
  if (inp_size % 16 != 0) return false;
  if (!_is_weak_contiguous(inp)) return false;
  if (world_size == 2 || full_nvlink) return inp_size <= max_size;
  // 4 PCIE GPUs use 2 stage allreduce, and is only faster than NCCL when size
  // <= 512k
  return world_size <= 4 && inp_size <= 512 * 1024;
  // for 4 or more non NVLink-capable GPUs, custom allreduce provides little
  // performance improvement over NCCL.
  return false;
}

void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out,
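The net effect of this hunk on should_custom_ar can be summarized with a small Python sketch of the decision rule (names are illustrative, not the real API):

    # Post-change rule: custom all-reduce is only considered for 2 ranks, or for
    # any rank count when every GPU pair is connected with NVLink. (The real
    # check also requires the tensor to be "weakly contiguous".)
    def use_custom_allreduce(inp_size_bytes, max_size_bytes, world_size, full_nvlink):
        if inp_size_bytes % 16 != 0:
            return False
        if world_size == 2 or full_nvlink:
            return inp_size_bytes <= max_size_bytes
        return False  # 4+ GPUs without full NVLink now always fall back to NCCL

    # use_custom_allreduce(256 * 1024, 8 * 1024 * 1024, 4, full_nvlink=False) -> False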
@@ -126,7 +126,7 @@ void dispose(fptr_t _fa) {
|
||||
delete fa;
|
||||
}
|
||||
|
||||
int meta_size() { return sizeof(vllm::Metadata); }
|
||||
int meta_size() { return sizeof(vllm::Signal); }
|
||||
|
||||
void register_buffer(fptr_t _fa, torch::Tensor &t,
|
||||
const std::vector<std::string> &handles,
|
||||
|
@ -23,29 +23,17 @@
|
||||
|
||||
namespace vllm {
|
||||
|
||||
constexpr int kMaxBlocks = 64;
|
||||
// note: we don't want to use atomics for signals because peer atomics are not
// supported on PCIe links
|
||||
struct Signal {
|
||||
alignas(64) union {
|
||||
uint64_t flag;
|
||||
unsigned char data[8];
|
||||
} start;
|
||||
alignas(64) union {
|
||||
uint64_t flag;
|
||||
unsigned char data[8];
|
||||
} end;
|
||||
alignas(128) uint32_t start[kMaxBlocks][8];
|
||||
alignas(128) uint32_t end[kMaxBlocks][8];
|
||||
};
|
||||
|
||||
struct Metadata {
|
||||
alignas(128) Signal sg;
|
||||
alignas(128) int counter;
|
||||
};
|
||||
static_assert(offsetof(Metadata, counter) == 128);
|
||||
static_assert(sizeof(Metadata) == 256);
|
||||
|
||||
struct __align__(16) RankData { const void *__restrict__ ptrs[8]; };
|
||||
|
||||
struct RankSignals {
|
||||
volatile Signal *signals[8];
|
||||
};
|
||||
struct __align__(16) RankSignals { volatile Signal *signals[8]; };
|
||||
|
||||
// like std::array, but aligned
|
||||
template <typename T, int sz>
|
||||
@ -135,70 +123,49 @@ DINLINE O downcast(array_t<float, O::size> val) {
|
||||
}
|
||||
}
|
||||
|
||||
// compute flag at compile time
|
||||
__host__ __device__ constexpr uint64_t compute_flag(int ngpus) {
|
||||
auto m = std::numeric_limits<uint64_t>::max();
|
||||
return m >> ((8 - ngpus) * 8);
|
||||
}
|
||||
|
||||
// This function is meant to be used as the first synchronization in the all
|
||||
// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
|
||||
// prior memory accesses. Note: volatile writes will not be reordered against
|
||||
// other volatile writes.
|
||||
template <int ngpus>
|
||||
DINLINE void start_sync(const RankSignals &sg, volatile Metadata *meta,
|
||||
DINLINE void start_sync(const RankSignals &sg, volatile Signal *self_sg,
|
||||
int rank) {
|
||||
constexpr auto FLAG = compute_flag(ngpus);
|
||||
if (blockIdx.x == 0) {
|
||||
if (threadIdx.x < ngpus)
|
||||
// simultaneously write to the corresponding byte to all other ranks.
|
||||
// Latency = 1 p2p write
|
||||
sg.signals[threadIdx.x]->start.data[rank] = 255;
|
||||
else if (threadIdx.x == 32)
|
||||
// reset
|
||||
meta->sg.end.flag = 0;
|
||||
}
|
||||
if (threadIdx.x == 0) {
|
||||
while (meta->sg.start.flag != FLAG)
|
||||
if (threadIdx.x < ngpus) {
|
||||
// reset flag for next time
|
||||
self_sg->end[blockIdx.x][threadIdx.x] = 0;
|
||||
// simultaneously write to the corresponding flag of all ranks.
|
||||
// Latency = 1 p2p write
|
||||
sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1;
|
||||
// wait until we got true from all ranks
|
||||
while (!self_sg->start[blockIdx.x][threadIdx.x])
|
||||
;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// This function is meant to be used as the second or the final synchronization
|
||||
// barrier in the all reduce kernel. If it's the final synchronization barrier,
|
||||
// we don't need to make any visibility guarantees for prior memory accesses.
|
||||
template <int ngpus, bool final_sync = false>
|
||||
DINLINE void end_sync(const RankSignals &sg, volatile Metadata *meta,
|
||||
DINLINE void end_sync(const RankSignals &sg, volatile Signal *self_sg,
|
||||
int rank) {
|
||||
constexpr auto FLAG = compute_flag(ngpus);
|
||||
__syncthreads();
|
||||
__shared__ int num;
|
||||
if (threadIdx.x == 0) num = atomicAdd((int *)&meta->counter, 1);
|
||||
__syncthreads();
|
||||
|
||||
// Only the last completing block can perform the end synchronization
|
||||
// This can ensures when the final busy wait ends, all ranks must have
|
||||
// finished reading each other's buffer.
|
||||
if (num == gridDim.x - 1) {
|
||||
if (threadIdx.x == 32) {
|
||||
// reset in a different warp
|
||||
meta->counter = 0;
|
||||
meta->sg.start.flag = 0;
|
||||
} else if (threadIdx.x < ngpus) {
|
||||
// simultaneously write to the corresponding byte to all other ranks.
|
||||
// Latency = 1 p2p write
|
||||
sg.signals[threadIdx.x]->end.data[rank] = 255;
|
||||
}
|
||||
// if this is the final sync, only one block needs it
|
||||
// because kernel exit can serve as sync
|
||||
if constexpr (final_sync) {
|
||||
if (threadIdx.x == 0) {
|
||||
while (meta->sg.end.flag != FLAG)
|
||||
;
|
||||
}
|
||||
}
|
||||
}
|
||||
if constexpr (!final_sync) {
|
||||
if (threadIdx.x == 0) {
|
||||
while (meta->sg.end.flag != FLAG)
|
||||
;
|
||||
}
|
||||
__syncthreads();
|
||||
// eliminate the case where prior writes are not visible after signals become
// visible. Note that I have not managed to make this happen through a lot of
// testing. It might be the case that the hardware provides a stronger
// guarantee than the memory model.
|
||||
if constexpr (!final_sync) __threadfence_system();
|
||||
if (threadIdx.x < ngpus) {
|
||||
// reset flag for next time
|
||||
self_sg->start[blockIdx.x][threadIdx.x] = 0;
|
||||
// simultaneously write to the corresponding flag of all ranks.
|
||||
// Latency = 1 p2p write
|
||||
sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1;
|
||||
// wait until we got true from all ranks
|
||||
while (!self_sg->end[blockIdx.x][threadIdx.x])
|
||||
;
|
||||
}
|
||||
if constexpr (!final_sync) __syncthreads();
|
||||
}
|
||||
|
||||
template <typename P, int ngpus, typename A>
|
||||
@ -214,32 +181,32 @@ DINLINE P packed_reduce(const P *ptrs[], int idx) {
|
||||
template <typename T, int ngpus>
|
||||
__global__ void __launch_bounds__(512, 1)
|
||||
cross_device_reduce_1stage(RankData *_dp, RankSignals sg,
|
||||
volatile Metadata *meta, T *__restrict__ result,
|
||||
volatile Signal *self_sg, T *__restrict__ result,
|
||||
int rank, int size) {
|
||||
using P = typename packed_t<T>::P;
|
||||
using A = typename packed_t<T>::A;
|
||||
// note: we don't reorder the address so the accumulation order is the same
|
||||
// for all ranks, ensuring bitwise identical results
|
||||
auto dp = *_dp;
|
||||
start_sync<ngpus>(sg, meta, rank);
|
||||
start_sync<ngpus>(sg, self_sg, rank);
|
||||
// do the actual reduction
|
||||
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
|
||||
idx += gridDim.x * blockDim.x) {
|
||||
((P *)result)[idx] =
|
||||
packed_reduce<P, ngpus, A>((const P **)&dp.ptrs[0], idx);
|
||||
}
|
||||
end_sync<ngpus, true>(sg, meta, rank);
|
||||
end_sync<ngpus, true>(sg, self_sg, rank);
|
||||
}
|
||||
|
||||
template <typename P>
|
||||
DINLINE P *get_tmp_buf(volatile Signal *sg) {
|
||||
return (P *)(((Metadata *)sg) + 1);
|
||||
return (P *)(((Signal *)sg) + 1);
|
||||
}
|
||||
|
||||
template <typename T, int ngpus>
|
||||
__global__ void __launch_bounds__(512, 1)
|
||||
cross_device_reduce_2stage(RankData *_dp, RankSignals sg,
|
||||
volatile Metadata *meta, T *__restrict__ result,
|
||||
volatile Signal *self_sg, T *__restrict__ result,
|
||||
int rank, int size) {
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int stride = gridDim.x * blockDim.x;
|
||||
@ -248,6 +215,7 @@ __global__ void __launch_bounds__(512, 1)
|
||||
int part = size / ngpus;
|
||||
int start = rank * part;
|
||||
int end = rank == ngpus - 1 ? size : start + part;
|
||||
int largest_part = part + size % ngpus;
|
||||
const P *ptrs[ngpus];
|
||||
P *tmps[ngpus];
|
||||
#pragma unroll
|
||||
@ -257,75 +225,28 @@ __global__ void __launch_bounds__(512, 1)
|
||||
tmps[i] = get_tmp_buf<P>(sg.signals[target]);
|
||||
}
|
||||
auto tmp_out = tmps[0];
|
||||
start_sync<ngpus>(sg, meta, rank);
|
||||
start_sync<ngpus>(sg, self_sg, rank);
|
||||
// stage 1: reduce scatter
|
||||
for (int idx = start + tid; idx < end; idx += stride) {
|
||||
tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
|
||||
}
|
||||
// Maybe TODO: replace this with per-block release-acquire
|
||||
// can save about 1-2us (not a lot though)
|
||||
end_sync<ngpus>(sg, meta, rank);
|
||||
end_sync<ngpus>(sg, self_sg, rank);
|
||||
|
||||
// stage 2: allgather
|
||||
for (int idx = tid; idx < part; idx += stride) {
|
||||
// stage 2: allgather. Note: it's important to match the tid between
|
||||
// the two stages, because visibility across devices is only guaranteed
|
||||
// between threads that have the same tid. If thread i computes the sum of
|
||||
// start + i in the first stage, then thread i also gathers start + i from all
|
||||
// ranks.
|
||||
for (int idx = tid; idx < largest_part; idx += stride) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < ngpus; i++) {
|
||||
int dst_idx = ((rank + i) % ngpus) * part + idx;
|
||||
((P *)result)[dst_idx] = tmps[i][idx];
|
||||
int gather_from_rank = ((rank + i) % ngpus);
|
||||
if (gather_from_rank == ngpus - 1 || idx < part) {
|
||||
int dst_idx = gather_from_rank * part + idx;
|
||||
((P *)result)[dst_idx] = tmps[i][idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
// process the last larger partition
|
||||
int remaining = size - part * ngpus;
|
||||
if (tid < remaining) {
|
||||
int dst_idx = tid + part * ngpus;
|
||||
((P *)result)[dst_idx] = get_tmp_buf<P>(sg.signals[ngpus - 1])[part + tid];
|
||||
}
|
||||
|
||||
// faster than this
|
||||
// for (int idx = tid; idx < size; idx += stride) {
|
||||
// int target_rank = idx / part;
|
||||
// if (target_rank == ngpus) target_rank -= 1;
|
||||
// ((P *)result)[idx] = tmps[target_rank][idx - target_rank * part];
|
||||
// }
|
||||
}
|
||||
|
||||
template <typename T, int ngpus>
|
||||
__global__ void __launch_bounds__(512, 1)
|
||||
cross_device_reduce_half_butterfly(RankData *_dp, RankSignals sg,
|
||||
volatile Metadata *meta,
|
||||
T *__restrict__ result, int rank,
|
||||
int size) {
|
||||
int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int stride = gridDim.x * blockDim.x;
|
||||
using P = typename packed_t<T>::P;
|
||||
using A = typename packed_t<T>::A;
|
||||
auto tmp_out = get_tmp_buf<P>(sg.signals[rank]);
|
||||
constexpr int hg = ngpus / 2;
|
||||
// Actually not quite half butterfly.
|
||||
// This is an all-to-all within each group containing half of the ranks
|
||||
// followed by cross-group add. Equivalent to half butterfly when there
|
||||
// are 4 GPUs, a common case for PCIe cards like T4 and A10.
|
||||
const P *ptrs[hg];
|
||||
{
|
||||
int start = rank - rank % hg;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < hg; i++) {
|
||||
ptrs[i] = (const P *)_dp->ptrs[i + start];
|
||||
}
|
||||
}
|
||||
start_sync<ngpus>(sg, meta, rank);
|
||||
for (int idx = tid; idx < size; idx += stride) {
|
||||
tmp_out[idx] = packed_reduce<P, hg, A>(ptrs, idx);
|
||||
}
|
||||
end_sync<ngpus>(sg, meta, rank);
|
||||
|
||||
auto src = get_tmp_buf<P>(sg.signals[(ngpus - 1) - rank % ngpus]);
|
||||
// do the cross group reduction
|
||||
for (int idx = tid; idx < size; idx += stride) {
|
||||
auto tmp = tmp_out[idx];
|
||||
packed_assign_add(tmp, src[idx]);
|
||||
((P *)result)[idx] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
using IPC_KEY = std::array<uint8_t, sizeof(cudaIpcMemHandle_t)>;
|
||||
@ -341,7 +262,7 @@ class CustomAllreduce {
|
||||
// below are device pointers
|
||||
RankSignals sg_;
|
||||
std::unordered_map<void *, RankData *> buffers_;
|
||||
Metadata *meta_;
|
||||
Signal *self_sg_;
|
||||
|
||||
// stores the registered device pointers from all ranks
|
||||
RankData *d_rank_data_base_, *d_rank_data_end_;
|
||||
@ -352,32 +273,32 @@ class CustomAllreduce {
|
||||
/**
|
||||
* meta is a pointer to device metadata and temporary buffer for allreduce.
|
||||
*
|
||||
* There's a total of sizeof(Metadata) of prefix before the actual data,
|
||||
* There's a total of sizeof(Signal) of prefix before the actual data,
|
||||
* so meta + 1 points to actual temporary buffer.
|
||||
*
|
||||
* note: this class does not own any device memory. Any required buffers
|
||||
* are passed in from the constructor
|
||||
*/
|
||||
CustomAllreduce(Metadata *meta, void *rank_data, size_t rank_data_sz,
|
||||
CustomAllreduce(Signal *meta, void *rank_data, size_t rank_data_sz,
|
||||
const cudaIpcMemHandle_t *handles,
|
||||
const std::vector<int64_t> &offsets, int rank,
|
||||
bool full_nvlink = true)
|
||||
: rank_(rank),
|
||||
world_size_(offsets.size()),
|
||||
full_nvlink_(full_nvlink),
|
||||
meta_(meta),
|
||||
self_sg_(meta),
|
||||
d_rank_data_base_(reinterpret_cast<RankData *>(rank_data)),
|
||||
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
|
||||
for (int i = 0; i < world_size_; i++) {
|
||||
Metadata *rank_meta;
|
||||
Signal *rank_sg;
|
||||
if (i != rank_) {
|
||||
char *handle = open_ipc_handle(&handles[i]);
|
||||
handle += offsets[i];
|
||||
rank_meta = (Metadata *)handle;
|
||||
rank_sg = (Signal *)handle;
|
||||
} else {
|
||||
rank_meta = meta_;
|
||||
rank_sg = self_sg_;
|
||||
}
|
||||
sg_.signals[i] = &rank_meta->sg;
|
||||
sg_.signals[i] = rank_sg;
|
||||
}
|
||||
}
|
||||
|
||||
@ -492,6 +413,10 @@ class CustomAllreduce {
|
||||
"custom allreduce currently requires input length to be multiple "
|
||||
"of " +
|
||||
std::to_string(d));
|
||||
if (block_limit > kMaxBlocks)
|
||||
throw std::runtime_error("max supported block limit is " +
|
||||
std::to_string(kMaxBlocks) + ". Got " +
|
||||
std::to_string(block_limit));
|
||||
|
||||
RankData *ptrs;
|
||||
cudaStreamCaptureStatus status;
|
||||
@ -512,9 +437,9 @@ class CustomAllreduce {
|
||||
size /= d;
|
||||
auto bytes = size * sizeof(typename packed_t<T>::P);
|
||||
int blocks = std::min(block_limit, (size + threads - 1) / threads);
|
||||
#define KL(ngpus, name) \
|
||||
name<T, ngpus> \
|
||||
<<<blocks, threads, 0, stream>>>(ptrs, sg_, meta_, output, rank_, size);
|
||||
#define KL(ngpus, name) \
|
||||
name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
|
||||
rank_, size);
|
||||
#define REDUCE_CASE(ngpus) \
|
||||
case ngpus: { \
|
||||
if (world_size_ == 2) { \
|
||||
@ -526,8 +451,6 @@ class CustomAllreduce {
|
||||
} else { \
|
||||
KL(ngpus, cross_device_reduce_2stage); \
|
||||
} \
|
||||
} else { \
|
||||
KL(ngpus, cross_device_reduce_half_butterfly); \
|
||||
} \
|
||||
break; \
|
||||
}
|
||||
@ -556,7 +479,7 @@ class CustomAllreduce {
|
||||
/**
|
||||
* To inspect PTX/SASS, copy paste this header file to compiler explorer and add
|
||||
a template instantiation:
|
||||
* template void CustomAllreduce::allreduce<half>(cudaStream_t, half *, half *,
|
||||
int, int, int);
|
||||
* template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
|
||||
half *, int, int, int);
|
||||
*/
|
||||
} // namespace vllm
|
||||
|
@ -92,7 +92,7 @@ __global__ void gen_data(curandState_t *state, T *data, double *ground_truth,
|
||||
|
||||
template <typename T>
|
||||
void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
|
||||
int data_size) {
|
||||
int data_size, bool performance_test) {
|
||||
T *result;
|
||||
cudaStream_t stream;
|
||||
CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
@ -101,7 +101,7 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
|
||||
|
||||
cudaIpcMemHandle_t self_data_handle;
|
||||
cudaIpcMemHandle_t data_handles[8];
|
||||
vllm::Metadata *buffer;
|
||||
vllm::Signal *buffer;
|
||||
T *self_data_copy;
|
||||
/**
|
||||
* Allocate IPC buffer
|
||||
@ -115,9 +115,9 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
|
||||
* convenience.
|
||||
*/
|
||||
CUDACHECK(
|
||||
cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Metadata)));
|
||||
CUDACHECK(cudaMemset(buffer, 0,
|
||||
2 * data_size * sizeof(T) + sizeof(vllm::Metadata)));
|
||||
cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
|
||||
CUDACHECK(
|
||||
cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
|
||||
CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
|
||||
CUDACHECK(cudaIpcGetMemHandle(&self_data_handle, buffer));
|
||||
|
||||
@ -133,7 +133,7 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
|
||||
offsets, myRank);
|
||||
auto *self_data =
|
||||
reinterpret_cast<T *>(reinterpret_cast<char *>(buffer) +
|
||||
sizeof(vllm::Metadata) + data_size * sizeof(T));
|
||||
sizeof(vllm::Signal) + data_size * sizeof(T));
|
||||
// hack buffer registration
|
||||
{
|
||||
std::vector<std::string> handles;
|
||||
@ -143,8 +143,8 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
|
||||
char *end = (char *)&data_handles[i + 1];
|
||||
handles.emplace_back(begin, end);
|
||||
}
|
||||
std::vector<int64_t> offsets(
|
||||
nRanks, sizeof(vllm::Metadata) + data_size * sizeof(T));
|
||||
std::vector<int64_t> offsets(nRanks,
|
||||
sizeof(vllm::Signal) + data_size * sizeof(T));
|
||||
fa.register_buffer(handles, offsets, self_data);
|
||||
}
|
||||
|
||||
@ -169,81 +169,112 @@ void run(int myRank, int nRanks, ncclComm_t &comm, int threads, int block_limit,
|
||||
} else {
|
||||
ncclDtype = ncclFloat;
|
||||
}
|
||||
|
||||
dummy_kernel<<<1, 1, 0, stream>>>();
|
||||
constexpr int warmup_iters = 5;
|
||||
constexpr int num_iters = 25;
|
||||
// warmup
|
||||
for (int i = 0; i < warmup_iters; i++) {
|
||||
NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, comm,
|
||||
stream));
|
||||
}
|
||||
CUDACHECK(cudaEventRecord(start, stream));
|
||||
for (int i = 0; i < num_iters; i++) {
|
||||
NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum, comm,
|
||||
stream));
|
||||
}
|
||||
CUDACHECK(cudaEventRecord(stop, stream));
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
float allreduce_ms = 0;
|
||||
cudaEventElapsedTime(&allreduce_ms, start, stop);
|
||||
|
||||
// if (myRank == 1) dummy_kernel<<<1, 1, 0, stream>>>();
|
||||
// set_data<T><<<16, 1024, 0, stream>>>(self_data, data_size, myRank);
|
||||
|
||||
dummy_kernel<<<1, 1, 0, stream>>>();
|
||||
// warm up
|
||||
for (int i = 0; i < warmup_iters; i++) {
|
||||
fa.allreduce<T>(stream, self_data, result, data_size, threads, block_limit);
|
||||
}
|
||||
CUDACHECK(cudaEventRecord(start, stream));
|
||||
for (int i = 0; i < num_iters; i++) {
|
||||
fa.allreduce<T>(stream, self_data, result, data_size, threads, block_limit);
|
||||
}
|
||||
CUDACHECK(cudaEventRecord(stop, stream));
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
float duration_ms = 0;
|
||||
cudaEventElapsedTime(&duration_ms, start, stop);
|
||||
if (myRank == 0)
|
||||
printf(
|
||||
"Rank %d done, nGPUs:%d, sz (kb): %d, %d, %d, my time:%.2fus, nccl "
|
||||
"time:%.2fus\n",
|
||||
myRank, nRanks, data_size * sizeof(T) / 1024, threads, block_limit,
|
||||
duration_ms * 1e3 / num_iters, allreduce_ms * 1e3 / num_iters);
|
||||
|
||||
// And wait for all the queued up work to complete
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
NCCLCHECK(ncclAllReduce(self_data_copy, self_data, data_size, ncclDtype,
|
||||
ncclSum, comm, stream));
|
||||
|
||||
double *nccl_result, *my_result;
|
||||
CUDACHECK(cudaMallocHost(&nccl_result, data_size * sizeof(double)));
|
||||
CUDACHECK(cudaMallocHost(&my_result, data_size * sizeof(double)));
|
||||
|
||||
convert_data<T><<<108, 1024, 0, stream>>>(self_data, result, nccl_result,
|
||||
my_result, data_size);
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
for (unsigned long j = 0; j < data_size; j++) {
|
||||
auto diff = abs(nccl_result[j] - my_result[j]);
|
||||
if (diff >= 1e-2) {
|
||||
printf("Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n",
|
||||
myRank, j, nccl_result[j], my_result[j], ground_truth[j]);
|
||||
break;
|
||||
if (performance_test) {
|
||||
dummy_kernel<<<1, 1, 0, stream>>>();
|
||||
constexpr int warmup_iters = 5;
|
||||
constexpr int num_iters = 100;
|
||||
// warmup
|
||||
for (int i = 0; i < warmup_iters; i++) {
|
||||
NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum,
|
||||
comm, stream));
|
||||
}
|
||||
}
|
||||
CUDACHECK(cudaEventRecord(start, stream));
|
||||
for (int i = 0; i < num_iters; i++) {
|
||||
NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum,
|
||||
comm, stream));
|
||||
}
|
||||
CUDACHECK(cudaEventRecord(stop, stream));
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
float allreduce_ms = 0;
|
||||
cudaEventElapsedTime(&allreduce_ms, start, stop);
|
||||
|
||||
long double nccl_diffs = 0.0;
|
||||
long double my_diffs = 0.0;
|
||||
for (int j = 0; j < data_size; j++) {
|
||||
nccl_diffs += abs(nccl_result[j] - ground_truth[j]);
|
||||
my_diffs += abs(my_result[j] - ground_truth[j]);
|
||||
dummy_kernel<<<1, 1, 0, stream>>>();
|
||||
// warm up
|
||||
for (int i = 0; i < warmup_iters; i++) {
|
||||
fa.allreduce<T>(stream, self_data, result, data_size, threads,
|
||||
block_limit);
|
||||
}
|
||||
CUDACHECK(cudaEventRecord(start, stream));
|
||||
for (int i = 0; i < num_iters; i++) {
|
||||
fa.allreduce<T>(stream, self_data, result, data_size, threads,
|
||||
block_limit);
|
||||
}
|
||||
CUDACHECK(cudaEventRecord(stop, stream));
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
float duration_ms = 0;
|
||||
cudaEventElapsedTime(&duration_ms, start, stop);
|
||||
if (myRank == 0)
|
||||
printf(
|
||||
"Rank %d done, nGPUs:%d, sz (kb): %d, %d, %d, my time:%.2fus, nccl "
|
||||
"time:%.2fus\n",
|
||||
myRank, nRanks, data_size * sizeof(T) / 1024, threads, block_limit,
|
||||
duration_ms * 1e3 / num_iters, allreduce_ms * 1e3 / num_iters);
|
||||
|
||||
// And wait for all the queued up work to complete
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
NCCLCHECK(ncclAllReduce(self_data_copy, self_data, data_size, ncclDtype,
|
||||
ncclSum, comm, stream));
|
||||
|
||||
convert_data<T><<<108, 1024, 0, stream>>>(self_data, result, nccl_result,
|
||||
my_result, data_size);
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
for (unsigned long j = 0; j < data_size; j++) {
|
||||
auto diff = abs(nccl_result[j] - my_result[j]);
|
||||
if (diff >= 4e-2) {
|
||||
printf("Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n",
|
||||
myRank, j, nccl_result[j], my_result[j], ground_truth[j]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
long double nccl_diffs = 0.0;
|
||||
long double my_diffs = 0.0;
|
||||
for (int j = 0; j < data_size; j++) {
|
||||
nccl_diffs += abs(nccl_result[j] - ground_truth[j]);
|
||||
my_diffs += abs(my_result[j] - ground_truth[j]);
|
||||
}
|
||||
if (myRank == 0)
|
||||
std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size
|
||||
<< " me: " << my_diffs / data_size << std::endl;
|
||||
} else {
|
||||
for (int i = 0; i < 100; i++) {
|
||||
fa.allreduce<T>(stream, self_data, result, data_size, threads,
|
||||
block_limit);
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
NCCLCHECK(ncclAllReduce(self_data, self_data_copy, data_size, ncclDtype,
|
||||
ncclSum, comm, stream));
|
||||
convert_data<T><<<108, 1024, 0, stream>>>(
|
||||
self_data_copy, result, nccl_result, my_result, data_size);
|
||||
CUDACHECK(cudaStreamSynchronize(stream));
|
||||
|
||||
for (unsigned long j = 0; j < data_size; j++) {
|
||||
auto diff = abs(nccl_result[j] - my_result[j]);
|
||||
if (diff >= 4e-2) {
|
||||
printf(
|
||||
"Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n",
|
||||
myRank, j, nccl_result[j], my_result[j], ground_truth[j]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (myRank == 0)
|
||||
printf("Test passed: nGPUs:%d, sz (kb): %d, %d, %d\n", nRanks,
|
||||
data_size * sizeof(T) / 1024, threads, block_limit);
|
||||
// long double nccl_diffs = 0.0;
|
||||
// long double my_diffs = 0.0;
|
||||
// for (int j = 0; j < data_size; j++) {
|
||||
// nccl_diffs += abs(nccl_result[j] - ground_truth[j]);
|
||||
// my_diffs += abs(my_result[j] - ground_truth[j]);
|
||||
// }
|
||||
// if (myRank == 0)
|
||||
// std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size
|
||||
// << " me: " << my_diffs / data_size << std::endl;
|
||||
}
|
||||
if (myRank == 0)
|
||||
std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size
|
||||
<< " me: " << my_diffs / data_size << std::endl;
|
||||
|
||||
CUDACHECK(cudaFree(result));
|
||||
CUDACHECK(cudaFree(self_data_copy));
|
||||
@ -269,14 +300,15 @@ int main(int argc, char **argv) {
|
||||
MPI_COMM_WORLD));
|
||||
NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));
|
||||
|
||||
bool performance_test = true;
|
||||
cudaProfilerStart();
|
||||
// for (int threads : {256, 512}) {
|
||||
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
|
||||
// run<half>(myRank, nRanks, comm, threads, block_limit, 4096 * 1024);
|
||||
// }
|
||||
// }
|
||||
for (int sz = 512; sz <= (32 << 20); sz *= 2) {
|
||||
run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 50);
|
||||
for (int sz = 512; sz <= (8 << 20); sz *= 2) {
|
||||
run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
|
||||
}
|
||||
|
||||
cudaProfilerStop();
|
||||
|
@ -7,10 +7,17 @@
|
||||
#include "cuda_compat.h"
|
||||
#include "dispatch_utils.h"
|
||||
|
||||
const static size_t NUM_MAX_EXPERTS = 64;
|
||||
#define CEILDIV(x,y) (((x) + (y) - 1) / (y))
|
||||
|
||||
namespace vllm {
|
||||
|
||||
namespace {
|
||||
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, int32_t col) {
|
||||
// don't worry about overflow because num_experts is relatively small
|
||||
return row * total_col + col;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
|
||||
int32_t *sorted_token_ids,
|
||||
@ -21,10 +28,14 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
|
||||
size_t numel) {
|
||||
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
|
||||
const size_t start_idx = threadIdx.x * tokens_per_thread;
|
||||
__shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS];
|
||||
__shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1];
|
||||
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
|
||||
int32_t* tokens_cnts = shared_mem; // 2d tensor with shape (num_experts + 1, num_experts)
|
||||
int32_t* cumsum = shared_mem + (num_experts + 1) * num_experts; // 1d tensor with shape (num_experts + 1)
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[threadIdx.x + 1][i] = 0;
|
||||
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -33,15 +44,15 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
|
||||
* to expert expert_index.
|
||||
*/
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
++tokens_cnts[threadIdx.x + 1][topk_ids[i]];
|
||||
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// For each expert we accumulate the token counts from the different threads.
|
||||
tokens_cnts[0][threadIdx.x] = 0;
|
||||
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x];
|
||||
tokens_cnts[index(num_experts, i, threadIdx.x)] += tokens_cnts[index(num_experts, i-1, threadIdx.x)];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
@ -50,7 +61,7 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[blockDim.x][i - 1], block_size) * block_size;
|
||||
cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], block_size) * block_size;
|
||||
}
|
||||
*total_tokens_post_pad = cumsum[num_experts];
|
||||
}
|
||||
@ -78,9 +89,9 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
|
||||
* stores the indices of the tokens processed by the expert with expert_id within
|
||||
* the current thread's token shard.
|
||||
*/
|
||||
int32_t rank_post_pad = tokens_cnts[threadIdx.x][expert_id] + cumsum[expert_id];
|
||||
int32_t rank_post_pad = tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + cumsum[expert_id];
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
++tokens_cnts[threadIdx.x][expert_id];
|
||||
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -93,11 +104,17 @@ void moe_align_block_size(
|
||||
torch::Tensor experts_ids,
|
||||
torch::Tensor num_tokens_post_pad) {
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
assert(num_experts <= NUM_MAX_EXPERTS);
|
||||
VLLM_DISPATCH_INTEGRAL_TYPES(
|
||||
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
|
||||
vllm::moe_align_block_size_kernel<scalar_t><<<1, num_experts, 0, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
// calc needed amount of shared mem for `tokens_cnts` and `cumsum` tensors
|
||||
const int32_t shared_mem = ((num_experts + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
|
||||
|
||||
// set dynamic shared mem
|
||||
auto kernel = vllm::moe_align_block_size_kernel<scalar_t>;
|
||||
AT_CUDA_CHECK(
|
||||
VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize((void *)kernel, shared_mem));
|
||||
kernel<<<1, num_experts, shared_mem, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
experts_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>(),
|
||||
|
14
csrc/ops.h
@ -53,6 +53,16 @@ void rotary_embedding(
|
||||
torch::Tensor& cos_sin_cache,
|
||||
bool is_neox);
|
||||
|
||||
void batched_rotary_embedding(
|
||||
torch::Tensor& positions,
|
||||
torch::Tensor& query,
|
||||
torch::Tensor& key,
|
||||
int head_size,
|
||||
torch::Tensor& cos_sin_cache,
|
||||
bool is_neox,
|
||||
int rot_dim,
|
||||
torch::Tensor& cos_sin_cache_offsets);
|
||||
|
||||
void silu_and_mul(
|
||||
torch::Tensor& out,
|
||||
torch::Tensor& input);
|
||||
@ -61,6 +71,10 @@ void gelu_and_mul(
|
||||
torch::Tensor& out,
|
||||
torch::Tensor& input);
|
||||
|
||||
void gelu_tanh_and_mul(
|
||||
torch::Tensor& out,
|
||||
torch::Tensor& input);
|
||||
|
||||
void gelu_new(
|
||||
torch::Tensor& out,
|
||||
torch::Tensor& input);
|
||||
|
@ -8,7 +8,7 @@
|
||||
namespace vllm {
|
||||
|
||||
template<typename scalar_t, bool IS_NEOX>
|
||||
inline __device__ void apply_rotary_embedding(
|
||||
inline __device__ void apply_token_rotary_embedding(
|
||||
scalar_t* __restrict__ arr,
|
||||
const scalar_t* __restrict__ cos_ptr,
|
||||
const scalar_t* __restrict__ sin_ptr,
|
||||
@ -37,6 +37,42 @@ inline __device__ void apply_rotary_embedding(
|
||||
arr[y_index] = y * cos + x * sin;
|
||||
}
|
||||
|
||||
template<typename scalar_t, bool IS_NEOX>
|
||||
inline __device__ void apply_rotary_embedding(
|
||||
scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
|
||||
scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
|
||||
const scalar_t* cache_ptr,
|
||||
const int head_size,
|
||||
const int num_heads,
|
||||
const int num_kv_heads,
|
||||
const int rot_dim,
|
||||
const int token_idx,
|
||||
const int64_t query_stride,
|
||||
const int64_t key_stride)
|
||||
{
|
||||
const int embed_dim = rot_dim / 2;
|
||||
const scalar_t* cos_ptr = cache_ptr;
|
||||
const scalar_t* sin_ptr = cache_ptr + embed_dim;
|
||||
|
||||
const int nq = num_heads * embed_dim;
|
||||
for (int i = threadIdx.x; i < nq; i += blockDim.x) {
|
||||
const int head_idx = i / embed_dim;
|
||||
const int64_t token_head = token_idx * query_stride + head_idx * head_size;
|
||||
const int rot_offset = i % embed_dim;
|
||||
apply_token_rotary_embedding<scalar_t, IS_NEOX>(query + token_head, cos_ptr,
|
||||
sin_ptr, rot_offset, embed_dim);
|
||||
}
|
||||
|
||||
const int nk = num_kv_heads * embed_dim;
|
||||
for (int i = threadIdx.x; i < nk; i += blockDim.x) {
|
||||
const int head_idx = i / embed_dim;
|
||||
const int64_t token_head = token_idx * key_stride + head_idx * head_size;
|
||||
const int rot_offset = i % embed_dim;
|
||||
apply_token_rotary_embedding<scalar_t, IS_NEOX>(key + token_head, cos_ptr,
|
||||
sin_ptr, rot_offset, embed_dim);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename scalar_t, bool IS_NEOX>
|
||||
__global__ void rotary_embedding_kernel(
|
||||
const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens]
|
||||
@ -54,27 +90,29 @@ __global__ void rotary_embedding_kernel(
|
||||
int64_t pos = positions[token_idx];
|
||||
const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
|
||||
|
||||
const int embed_dim = rot_dim / 2;
|
||||
const scalar_t* cos_ptr = cache_ptr;
|
||||
const scalar_t* sin_ptr = cache_ptr + embed_dim;
|
||||
apply_rotary_embedding<scalar_t, IS_NEOX>(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride);
|
||||
}
|
||||
|
||||
const int nq = num_heads * embed_dim;
|
||||
for (int i = threadIdx.x; i < nq; i += blockDim.x) {
|
||||
const int head_idx = i / embed_dim;
|
||||
const int64_t token_head = token_idx * query_stride + head_idx * head_size;
|
||||
const int rot_offset = i % embed_dim;
|
||||
apply_rotary_embedding<scalar_t, IS_NEOX>(query + token_head, cos_ptr,
|
||||
sin_ptr, rot_offset, embed_dim);
|
||||
}
|
||||
template<typename scalar_t, bool IS_NEOX>
|
||||
__global__ void batched_rotary_embedding_kernel(
|
||||
const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens]
|
||||
scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
|
||||
scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
|
||||
const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2]
|
||||
const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] or [num_tokens]
|
||||
const int rot_dim,
|
||||
const int64_t query_stride,
|
||||
const int64_t key_stride,
|
||||
const int num_heads,
|
||||
const int num_kv_heads,
|
||||
const int head_size) {
|
||||
// Each thread block is responsible for one token.
|
||||
const int token_idx = blockIdx.x;
|
||||
int64_t pos = positions[token_idx];
|
||||
int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx];
|
||||
const scalar_t* cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim;
|
||||
|
||||
const int nk = num_kv_heads * embed_dim;
|
||||
for (int i = threadIdx.x; i < nk; i += blockDim.x) {
|
||||
const int head_idx = i / embed_dim;
|
||||
const int64_t token_head = token_idx * key_stride + head_idx * head_size;
|
||||
const int rot_offset = i % embed_dim;
|
||||
apply_rotary_embedding<scalar_t, IS_NEOX>(key + token_head, cos_ptr,
|
||||
sin_ptr, rot_offset, embed_dim);
|
||||
}
|
||||
apply_rotary_embedding<scalar_t, IS_NEOX>(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride);
|
||||
}
|
||||
|
||||
} // namespace vllm
|
||||
@ -128,3 +166,61 @@ void rotary_embedding(
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/*
|
||||
Batched version of rotary embedding, pack multiple LoRAs together
|
||||
and process in batched manner.
|
||||
*/
|
||||
void batched_rotary_embedding(
|
||||
torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens]
|
||||
torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size]
|
||||
torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size]
|
||||
int head_size,
|
||||
torch::Tensor& cos_sin_cache, // [max_position, rot_dim]
|
||||
bool is_neox,
|
||||
int rot_dim,
|
||||
torch::Tensor& cos_sin_cache_offsets // [num_tokens]
|
||||
) {
|
||||
int64_t num_tokens = cos_sin_cache_offsets.size(0);
|
||||
int num_heads = query.size(-1) / head_size;
|
||||
int num_kv_heads = key.size(-1) / head_size;
|
||||
int64_t query_stride = query.stride(-2);
|
||||
int64_t key_stride = key.stride(-2);
|
||||
|
||||
dim3 grid(num_tokens);
|
||||
dim3 block(std::min(num_heads * rot_dim / 2, 512));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
VLLM_DISPATCH_FLOATING_TYPES(
|
||||
query.scalar_type(),
|
||||
"rotary_embedding",
|
||||
[&] {
|
||||
if (is_neox) {
|
||||
vllm::batched_rotary_embedding_kernel<scalar_t, true><<<grid, block, 0, stream>>>(
|
||||
positions.data_ptr<int64_t>(),
|
||||
query.data_ptr<scalar_t>(),
|
||||
key.data_ptr<scalar_t>(),
|
||||
cos_sin_cache.data_ptr<scalar_t>(),
|
||||
cos_sin_cache_offsets.data_ptr<int64_t>(),
|
||||
rot_dim,
|
||||
query_stride,
|
||||
key_stride,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_size);
|
||||
} else {
|
||||
vllm::batched_rotary_embedding_kernel<scalar_t, false><<<grid, block, 0, stream>>>(
|
||||
positions.data_ptr<int64_t>(),
|
||||
query.data_ptr<scalar_t>(),
|
||||
key.data_ptr<scalar_t>(),
|
||||
cos_sin_cache.data_ptr<scalar_t>(),
|
||||
cos_sin_cache_offsets.data_ptr<int64_t>(),
|
||||
rot_dim,
|
||||
query_stride,
|
||||
key_stride,
|
||||
num_heads,
|
||||
num_kv_heads,
|
||||
head_size);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -14,21 +14,28 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
||||
f(in_T, out_T, W_T, narrow, 128) \
|
||||
f(in_T, out_T, W_T, narrow, 256) \
|
||||
f(in_T, out_T, W_T, narrow, 512) \
|
||||
f(in_T, out_T, W_T, narrow, 768) \
|
||||
f(in_T, out_T, W_T, narrow, 1024) \
|
||||
f(in_T, out_T, W_T, narrow, 1152) \
|
||||
f(in_T, out_T, W_T, narrow, 1280) \
|
||||
f(in_T, out_T, W_T, narrow, 1536) \
|
||||
f(in_T, out_T, W_T, narrow, 1728) \
|
||||
f(in_T, out_T, W_T, narrow, 1792) \
|
||||
f(in_T, out_T, W_T, narrow, 2048) \
|
||||
f(in_T, out_T, W_T, narrow, 2304) \
|
||||
f(in_T, out_T, W_T, narrow, 2560) \
|
||||
f(in_T, out_T, W_T, narrow, 2752) \
|
||||
f(in_T, out_T, W_T, narrow, 2816) \
|
||||
f(in_T, out_T, W_T, narrow, 3072) \
|
||||
f(in_T, out_T, W_T, narrow, 3456) \
|
||||
f(in_T, out_T, W_T, narrow, 3584) \
|
||||
f(in_T, out_T, W_T, narrow, 4096) \
|
||||
f(in_T, out_T, W_T, narrow, 4608) \
|
||||
f(in_T, out_T, W_T, narrow, 5120) \
|
||||
f(in_T, out_T, W_T, narrow, 5504) \
|
||||
f(in_T, out_T, W_T, narrow, 5632) \
|
||||
f(in_T, out_T, W_T, narrow, 6144) \
|
||||
f(in_T, out_T, W_T, narrow, 6848) \
|
||||
f(in_T, out_T, W_T, narrow, 6912) \
|
||||
f(in_T, out_T, W_T, narrow, 7168) \
|
||||
f(in_T, out_T, W_T, narrow, 8192) \
|
||||
@ -36,11 +43,14 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
|
||||
f(in_T, out_T, W_T, narrow, 10240) \
|
||||
f(in_T, out_T, W_T, narrow, 11008) \
|
||||
f(in_T, out_T, W_T, narrow, 12288) \
|
||||
f(in_T, out_T, W_T, narrow, 13696) \
|
||||
f(in_T, out_T, W_T, narrow, 13824) \
|
||||
f(in_T, out_T, W_T, narrow, 14336) \
|
||||
f(in_T, out_T, W_T, narrow, 16384) \
|
||||
f(in_T, out_T, W_T, narrow, 20480) \
|
||||
f(in_T, out_T, W_T, narrow, 22016) \
|
||||
f(in_T, out_T, W_T, narrow, 24576) \
|
||||
f(in_T, out_T, W_T, narrow, 27392) \
|
||||
f(in_T, out_T, W_T, narrow, 28672) \
|
||||
f(in_T, out_T, W_T, narrow, 32000) \
|
||||
f(in_T, out_T, W_T, narrow, 32256) \
|
||||
|
@ -10,7 +10,7 @@ TEMPLATE = """
|
||||
#include "bgmv_impl.cuh"
|
||||
|
||||
FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, {input_dtype}, {output_dtype}, {weight_dtype})
|
||||
""".lstrip()
|
||||
""".lstrip() # noqa: E501
|
||||
|
||||
for input_dtype in DTYPES:
|
||||
for output_dtype in DTYPES:
|
||||
|
@ -1,7 +1,7 @@
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <torch/extension.h>
|
||||
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <cstdint>
|
||||
|
||||
#include "bgmv/bgmv_config.h"
|
||||
@ -91,6 +91,7 @@ void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
||||
CHECK_EQ(w.size(2), h_out);
|
||||
CHECK_EQ(indicies.size(0), x.size(0));
|
||||
CHECK_EQ(y.size(0), x.size(0));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
|
||||
bool ok = false;
|
||||
if (h_in < 65536 && h_out < 65536) {
|
||||
// TODO: See if we can get rid of this massive nested switch
|
||||
@ -322,6 +323,7 @@ void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
|
||||
CHECK_EQ(w.size(2), h_out);
|
||||
CHECK_EQ(indicies.size(0), x.size(0));
|
||||
CHECK_EQ(y.size(0), x.size(0));
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
|
||||
bool ok = false;
|
||||
if (h_in < 65536 && h_out < 65536) {
|
||||
// TODO: See if we can get rid of this massive nested switch
|
||||
|
@ -25,7 +25,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
||||
ops.def(
|
||||
"gelu_and_mul",
|
||||
&gelu_and_mul,
|
||||
"Activation function used in GeGLU.");
|
||||
"Activation function used in GeGLU with `none` approximation.");
|
||||
ops.def(
|
||||
"gelu_tanh_and_mul",
|
||||
&gelu_tanh_and_mul,
|
||||
"Activation function used in GeGLU with `tanh` approximation.");
|
||||
ops.def(
|
||||
"gelu_new",
|
||||
&gelu_new,
|
||||
@ -52,6 +56,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
||||
&rotary_embedding,
|
||||
"Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
|
||||
|
||||
ops.def(
|
||||
"batched_rotary_embedding",
|
||||
&batched_rotary_embedding,
|
||||
"Apply GPT-NeoX or GPT-J style rotary embedding to query and key (supports multiple loras)");
|
||||
|
||||
// Quantization ops
|
||||
#ifndef USE_ROCM
|
||||
ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
|
||||
|
@ -24,17 +24,27 @@ namespace vllm {
|
||||
template<typename T>
|
||||
__inline__ __device__ T warpReduceSum(T val) {
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1)
|
||||
for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1)
|
||||
val += VLLM_SHFL_XOR_SYNC(val, mask);
|
||||
return val;
|
||||
}
|
||||
|
||||
__inline__ __device__ constexpr int _calculateLaneMask(int warp_size) {
|
||||
return warp_size - 1;
|
||||
}
|
||||
|
||||
__inline__ __device__ constexpr int _calculateWidShift(int warp_size) {
|
||||
return 5 + (warp_size >> 6);
|
||||
}
|
||||
|
||||
/* Calculate the sum of all elements in a block */
|
||||
template<typename T>
|
||||
__inline__ __device__ T blockReduceSum(T val) {
|
||||
static __shared__ T shared[32];
|
||||
int lane = threadIdx.x & 0x1f;
|
||||
int wid = threadIdx.x >> 5;
|
||||
static __shared__ T shared[WARP_SIZE];
|
||||
constexpr auto LANE_MASK = _calculateLaneMask(WARP_SIZE);
|
||||
constexpr auto WID_SHIFT = _calculateWidShift(WARP_SIZE);
|
||||
int lane = threadIdx.x & LANE_MASK;
|
||||
int wid = threadIdx.x >> WID_SHIFT;
|
||||
|
||||
val = warpReduceSum<T>(val);
|
||||
|
||||
@ -45,7 +55,7 @@ __inline__ __device__ T blockReduceSum(T val) {
|
||||
|
||||
// Use blockDim.x / 32.f (instead of a right shift by 5) so that this also
// works when blockDim.x is not a multiple of 32.
|
||||
val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f);
|
||||
val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f);
|
||||
val = warpReduceSum<T>(val);
|
||||
return val;
|
||||
}
|
||||
|
@ -1,3 +1,10 @@
sphinx == 6.2.1
sphinx-book-theme == 1.0.1
sphinx-copybutton == 0.5.2
myst-parser == 2.0.0
sphinx-argparse

# packages to install to build the documentation
pydantic
-f https://download.pytorch.org/whl/cpu
torch
BIN
docs/source/assets/kernel/k_vecs.png
Normal file
After Width: | Height: | Size: 27 KiB |
BIN
docs/source/assets/kernel/key.png
Normal file
After Width: | Height: | Size: 109 KiB |
BIN
docs/source/assets/kernel/logits_vec.png
Normal file
After Width: | Height: | Size: 17 KiB |
BIN
docs/source/assets/kernel/q_vecs.png
Normal file
After Width: | Height: | Size: 41 KiB |
BIN
docs/source/assets/kernel/query.png
Normal file
After Width: | Height: | Size: 32 KiB |
BIN
docs/source/assets/kernel/v_vec.png
Normal file
After Width: | Height: | Size: 42 KiB |
BIN
docs/source/assets/kernel/value.png
Normal file
After Width: | Height: | Size: 167 KiB |
@ -10,10 +10,11 @@
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
from sphinx.ext import autodoc
|
||||
import logging
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))
|
||||
|
||||
@ -22,7 +23,7 @@ logger = logging.getLogger(__name__)
|
||||
# -- Project information -----------------------------------------------------
|
||||
|
||||
project = 'vLLM'
|
||||
copyright = '2023, vLLM Team'
|
||||
copyright = '2024, vLLM Team'
|
||||
author = 'the vLLM Team'
|
||||
|
||||
# -- General configuration ---------------------------------------------------
|
||||
@ -37,6 +38,8 @@ extensions = [
|
||||
"sphinx_copybutton",
|
||||
"sphinx.ext.autodoc",
|
||||
"sphinx.ext.autosummary",
|
||||
"myst_parser",
|
||||
"sphinxarg.ext",
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
@ -72,8 +75,15 @@ html_theme_options = {
|
||||
|
||||
# Mock out external dependencies here.
|
||||
autodoc_mock_imports = [
|
||||
"torch", "transformers", "psutil", "prometheus_client", "sentencepiece",
|
||||
"vllm.cuda_utils", "vllm._C"
|
||||
"torch",
|
||||
"transformers",
|
||||
"psutil",
|
||||
"prometheus_client",
|
||||
"sentencepiece",
|
||||
"vllm.cuda_utils",
|
||||
"vllm._C",
|
||||
"numpy",
|
||||
"tqdm",
|
||||
]
|
||||
|
||||
for mock_target in autodoc_mock_imports:
|
||||
|
@ -2,5 +2,5 @@ LLMEngine
|
||||
=================================
|
||||
|
||||
.. autoclass:: vllm.engine.llm_engine.LLMEngine
|
||||
:members: add_request, abort_request, step, _init_cache
|
||||
:members: add_request, abort_request, step
|
||||
:show-inheritance:
|
525
docs/source/dev/kernel/paged_attention.rst
Normal file
@ -0,0 +1,525 @@
|
||||
vLLM Paged Attention
|
||||
====================
|
||||
|
||||
- Currently, vLLM utilizes its own implementation of a multi-head query
  attention kernel (``csrc/attention/attention_kernels.cu``).
  This kernel is designed to be compatible with
  vLLM's paged KV caches, where the key and value cache are stored in
  separate blocks (note that this block concept differs from the GPU
  thread block. So in a later document, I will refer to the vLLM paged
  attention block as "block", while referring to the GPU thread block as
  "thread block").
- To achieve high performance, this kernel relies on a specially
  designed memory layout and access method, specifically when threads
  read data from global memory to shared memory. The purpose of this
  document is to provide a high-level explanation of the kernel
  implementation step by step, aiding those who wish to learn about the
  vLLM multi-head query attention kernel. After going through this
  document, users will likely have a better understanding and find it
  easier to follow the actual implementation.
- Please note that this document may not cover all details, such as how
  to calculate the correct index for the corresponding data or the dot
  multiplication implementation. However, after reading this document
  and becoming familiar with the high-level logic flow, it should be
  easier for you to read the actual code and understand the details.
|
||||
|
||||
Inputs
|
||||
------
|
||||
|
||||
- The kernel function takes a list of arguments for the current thread
|
||||
to perform its assigned work. The three most important arguments are
|
||||
the input pointers ``q``, ``k_cache``, and ``v_cache``, which point
|
||||
to query, key, and value data on global memory that need to be read
|
||||
and processed. The output pointer ``out`` points to global memory
|
||||
where the result should be written. These four pointers actually
|
||||
refer to multi-dimensional arrays, but each thread only accesses the
|
||||
portion of data assigned to it. I have omitted all other runtime
|
||||
parameters here for simplicity.
|
||||
|
||||
.. code:: cpp
|
||||
|
||||
template<
|
||||
typename scalar_t,
|
||||
int HEAD_SIZE,
|
||||
int BLOCK_SIZE,
|
||||
int NUM_THREADS,
|
||||
int PARTITION_SIZE = 0>
|
||||
__device__ void paged_attention_kernel(
|
||||
... // Other side args.
|
||||
const scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size]
|
||||
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
|
||||
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
|
||||
... // Other side args.
|
||||
)
|
||||
|
||||
- There is also a list of template arguments above the function
  signature that are determined at compile time. ``scalar_t``
  represents the data type of the query, key, and value data elements,
  such as FP16. ``HEAD_SIZE`` indicates the number of elements in each
  head. ``BLOCK_SIZE`` refers to the number of tokens in each block.
  ``NUM_THREADS`` denotes the number of threads in each thread block.
  ``PARTITION_SIZE`` represents the number of tensor parallel GPUs (for
  simplicity, we assume this is 0 and tensor parallelism is disabled).
- With these arguments, we need to perform a sequence of preparations.
  This includes calculating the current head index, block index, and
  other necessary variables. However, for now, we can ignore these
  preparations and proceed directly to the actual calculations. It will
  be easier to understand them once we grasp the entire flow.
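- As a rough illustration only (this is a sketch for this document, not the
  exact vLLM source, and ``partition_idx``/``warp_idx``/``lane`` are assumed
  names), the preparations boil down to recovering each thread block's
  assignment from the CUDA built-in indices, since, as described in the
  "Grid" concept below, the grid is laid out as
  ``(num_heads, num_seqs, max_num_partitions)``:

.. code:: cpp

   // Sketch: derive this thread block's assignment from the grid layout.
   const int head_idx = blockIdx.x;       // which attention head
   const int seq_idx = blockIdx.y;        // which sequence in the batch
   const int partition_idx = blockIdx.z;  // which partition (single one here)
   // Threads are further grouped into warps and thread groups.
   const int thread_idx = threadIdx.x;
   const int warp_idx = thread_idx / WARP_SIZE;
   const int lane = thread_idx % WARP_SIZE;
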
Concepts
|
||||
--------
|
||||
|
||||
- Just before we dive into the calculation flow, I want to describe a
|
||||
few concepts that are needed for later sections. However, you may
|
||||
skip this section and return later if you encounter any confusing
|
||||
terminologies.
|
||||
- **Sequence**: A sequence represents a client request. For example,
  the data pointed to by ``q`` has a shape of
  ``[num_seqs, num_heads, head_size]``, meaning that ``q`` points to a
  total of ``num_seqs`` query sequences. Since this
  kernel is a single query attention kernel, each sequence only has one
  query token. Hence, ``num_seqs`` equals the total number of tokens
  that are processed in the batch.
|
||||
- **Context**: The context consists of the generated tokens from the
|
||||
sequence. For instance, ``["What", "is", "your"]`` are the context
|
||||
tokens, and the input query token is ``"name"``. The model might
|
||||
generate the token ``"?"``.
|
||||
- **Vec**: The vec is a list of elements that are fetched and
  calculated together. For query and key data, the vec size
  (``VEC_SIZE``) is determined so that each thread group can fetch and
  calculate 16 bytes of data at a time. For value data, the vec size
  (``V_VEC_SIZE``) is determined so that each thread can fetch and
  calculate 16 bytes of data at a time. For example, if
  ``scalar_t`` is FP16 (2 bytes) and ``THREAD_GROUP_SIZE`` is 2, the
  ``VEC_SIZE`` will be 4, while the ``V_VEC_SIZE`` will be 8 (a short
  sizing sketch follows this list).
- **Thread group**: The thread group is a small group of
  threads (``THREAD_GROUP_SIZE``) that fetches and calculates one
  query token and one key token at a time. Each thread handles only a
  portion of the token data. The total number of elements processed by
  one thread group is referred to as ``x``. For example, if the thread
  group contains 2 threads and the head size is 8, then thread 0
  handles the query and key elements at index 0, 2, 4, 6, while thread
  1 handles the elements at index 1, 3, 5, 7.
|
||||
- **Block**: The key and value cache data in vLLM are split into
|
||||
blocks. Each block stores data for a fixed number(\ ``BLOCK_SIZE``)
|
||||
of tokens at one head. Each block may contain only a portion of the
|
||||
whole context tokens. For example, if the block size is 16 and the
|
||||
head size is 128, then for one head, one block can store 16 \* 128 =
|
||||
2048 elements.
|
||||
- **Warp**: A warp is a group of 32 threads(\ ``WARP_SIZE``) that
|
||||
execute simultaneously on a stream multiprocessor (SM). In this
|
||||
kernel, each warp processes the calculation between one query token
|
||||
and key tokens of one entire block at a time (it may process multiple
|
||||
blocks in multiple iterations). For example, if there are 4 warps and
|
||||
6 blocks for one context, the assignment would be like warp 0 handles
|
||||
the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
|
||||
handles the 2nd block and warp 3 handles the 3rd block.
|
||||
- **Thread block**: A thread block is a group of
|
||||
threads(\ ``NUM_THREADS``) that can access the same shared memory.
|
||||
Each thread block contains multiple warps(\ ``NUM_WARPS``), and in
|
||||
this kernel, each thread block processes the calculation between one
|
||||
query token and key tokens of a whole context.
|
||||
- **Grid**: A grid is a collection of thread blocks and defines the
  shape of the collection. In this kernel, the shape is
  ``(num_heads, num_seqs, max_num_partitions)``. Therefore, each thread
  block only handles the calculation for one head, one sequence, and
  one partition.
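- The vec and thread-group sizing described in the "Vec" concept above can be
  written down as a small compile-time sketch. This is illustrative only (it
  assumes the "16 bytes per fetch" rule stated above and the repository's
  ``MAX`` macro); it is not a verbatim excerpt of the kernel:

.. code:: cpp

   // Sketch: derive the vec sizes from the "16 bytes per fetch" rule.
   constexpr int THREAD_GROUP_SIZE = 2;  // example value from the text above
   constexpr int VEC_SIZE =
       MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);   // 4 for FP16
   constexpr int NUM_VECS_PER_THREAD =
       HEAD_SIZE / (THREAD_GROUP_SIZE * VEC_SIZE);            // 16 for HEAD_SIZE = 128
   constexpr int V_VEC_SIZE = 16 / sizeof(scalar_t);          // 8 for FP16
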
Query
|
||||
-----
|
||||
|
||||
- This section will introduce how query data is stored in memory and
|
||||
fetched by each thread. As mentioned above, each thread group fetches
|
||||
one query token data, while each thread itself only handles a part of
|
||||
one query token data. Within each warp, every thread group will fetch
|
||||
the same query token data, but will multiply it with different key
|
||||
token data.
|
||||
|
||||
.. code:: cpp
|
||||
|
||||
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
||||
|
||||
.. figure:: ../../assets/kernel/query.png
|
||||
:alt: query
|
||||
:width: 70%
|
||||
:align: center
|
||||
|
||||
Query data of one token at one head
|
||||
|
||||
- Each thread defines its own ``q_ptr`` which points to the assigned
  query token data on global memory. For example, if ``VEC_SIZE`` is 4
  and ``HEAD_SIZE`` is 128, ``q_ptr`` points to data that contains a
  total of 128 elements divided into 128 / 4 = 32 vecs.
|
||||
|
||||
.. figure:: ../../assets/kernel/q_vecs.png
|
||||
:alt: q_vecs
|
||||
:width: 70%
|
||||
:align: center
|
||||
|
||||
``q_vecs`` for one thread group
|
||||
|
||||
.. code:: cpp
|
||||
|
||||
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
|
||||
|
||||
- Next, we need to read the global memory data pointed to by ``q_ptr``
  into shared memory as ``q_vecs``. It is important to note that each
  vec is assigned to a different row. For example, if
  ``THREAD_GROUP_SIZE`` is 2, thread 0 will handle the 0th row vecs,
  while thread 1 handles the 1st row vecs. By reading the query data in
  this way, neighboring threads like thread 0 and thread 1 can read
  neighboring memory, achieving memory coalescing to improve
  performance.
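- The following is a minimal sketch of that load, reusing the sizing
  constants from the "Concepts" section (simplified; the real kernel also
  strides over thread groups, which is omitted here):

.. code:: cpp

   // Sketch: thread_group_offset (0 or 1 when THREAD_GROUP_SIZE == 2) selects
   // this thread's row of q_vecs; each thread then copies every
   // THREAD_GROUP_SIZE-th vec of its query token from global memory.
   for (int i = 0; i < NUM_VECS_PER_THREAD; i++) {
     const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
     q_vecs[thread_group_offset][i] =
         *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
   }
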
Key
|
||||
---
|
||||
|
||||
- Similar to the "Query" section, this section introduces memory layout
|
||||
and assignment for keys. While each thread group only handle one
|
||||
query token one kernel run, it may handle multiple key tokens across
|
||||
multiple iterations. Meanwhile, each warp will process multiple blocks
|
||||
of key tokens in multiple iterations, ensuring that all context
|
||||
tokens are processed by the entire thread group after the kernel run.
|
||||
In this context, "handle" refers to performing the dot multiplication
|
||||
between query data and key data.
|
||||
|
||||
.. code:: cpp
|
||||
|
||||
const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
|
||||
+ kv_head_idx * kv_head_stride
|
||||
+ physical_block_offset * x;
|
||||
|
||||
- Unlike ``q_ptr``, ``k_ptr`` in each thread will point to a different
  key token at different iterations. As shown above, ``k_ptr``
  points to key token data based on ``k_cache`` at the assigned block,
  assigned head and assigned token.
|
||||
|
||||
.. figure:: ../../assets/kernel/key.png
|
||||
:alt: key
|
||||
:width: 70%
|
||||
:align: center
|
||||
|
||||
Key data of all context tokens at one head
|
||||
|
||||
- The diagram above illustrates the memory layout for key data. It
  assumes that ``BLOCK_SIZE`` is 16, ``HEAD_SIZE`` is 128, ``x`` is
  8, ``THREAD_GROUP_SIZE`` is 2, and there are a total of 4 warps. Each
  rectangle represents all the elements for one key token at one head,
  which will be processed by one thread group. The left half shows the
  total 16 blocks of key token data for warp 0, while the right half
  represents the remaining key token data for other warps or
  iterations. Inside each rectangle, there are a total of 32 vecs (128
  elements for one token) that will be processed by 2 threads (one
  thread group) separately.
|
||||
|
||||
.. figure:: ../../assets/kernel/k_vecs.png
|
||||
:alt: k_vecs
|
||||
:width: 70%
|
||||
:align: center
|
||||
|
||||
``k_vecs`` for one thread
|
||||
|
||||
.. code:: cpp
|
||||
|
||||
K_vec k_vecs[NUM_VECS_PER_THREAD]
|
||||
|
||||
- Next, we need to read the key token data from ``k_ptr`` and store
|
||||
them on register memory as ``k_vecs``. We use register memory for
|
||||
``k_vecs`` because it will only be accessed by one thread once,
|
||||
whereas ``q_vecs`` will be accessed by multiple threads multiple
|
||||
times. Each ``k_vecs`` will contain multiple vectors for later
|
||||
calculation. Each vec will be set at each inner iteration. The
|
||||
assignment of vecs allows neighboring threads in a warp to read
|
||||
neighboring memory together, which again promotes the memory
|
||||
coalescing. For instance, thread 0 will read vec 0, while thread 1
|
||||
will read vec 1. In the next inner loop, thread 0 will read vec 2,
|
||||
while thread 1 will read vec 3, and so on.
|
||||
- You may still be a little confused about the overall flow. Don't
|
||||
worry, please keep reading the next "QK" section. It will illustrate
|
||||
the query and key calculation flow in a clearer and higher-level
|
||||
manner.
|
||||
|
||||
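
As an illustration of the indexing described above (a sketch under the same naming assumptions as before, not the exact kernel code), the inner loop that fills ``k_vecs`` could look roughly like this:

.. code:: cpp

   // Sketch: each thread fills its private k_vecs for the current key token.
   // Threads of one thread group (thread_group_offset = 0, 1, ...) read
   // interleaved vecs, so their loads coalesce within the warp.
   #pragma unroll
   for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
     const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
     const int offset1 = (vec_idx * VEC_SIZE) / x;  // which group of x elements
     const int offset2 = (vec_idx * VEC_SIZE) % x;  // offset inside that group
     k_vecs[j] = *reinterpret_cast<const K_vec*>(
         k_ptr + offset1 * BLOCK_SIZE * x + offset2);
   }
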
QK
---

- As shown in the pseudo code below, before the entire for loop block,
  we fetch the query data for one token and store it in ``q_vecs``.
  Then, in the outer for loop, we iterate through different ``k_ptr``\ s
  that point to different tokens and prepare the ``k_vecs`` in the inner
  for loop. Finally, we perform the dot multiplication between the
  ``q_vecs`` and each ``k_vecs``.

.. code:: cpp

   q_vecs = ...
   for ... {
     k_ptr = ...
     for ... {
       k_vecs[i] = ...
     }
     ...
     float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
   }

- As mentioned before, each thread only fetches part of the query and
  key token data at a time. However, a cross-thread-group reduction
  happens inside ``Qk_dot<>::dot``, so the ``qk`` returned here is not
  just the dot product of partial query and key data, but actually the
  full result over the entire query and key token data.
- For example, if the value of ``HEAD_SIZE`` is 128 and
  ``THREAD_GROUP_SIZE`` is 2, each thread's ``k_vecs`` will contain a
  total of 64 elements. However, the returned ``qk`` is actually the
  result of the dot multiplication between 128 query elements and 128
  key elements. If you want to learn more about the details of the dot
  multiplication and reduction, you may refer to the implementation of
  ``Qk_dot<>::dot``. For the sake of simplicity, I will not cover it in
  detail in this document; a rough sketch of the idea is shown below.

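
For intuition only, here is a hedged sketch of what such a cross-thread-group reduction can look like: each thread computes a partial dot product over its own vecs, and the partial sums are then combined across the thread group with shuffle instructions (``VLLM_SHFL_XOR_SYNC`` is the same helper used in the reduction snippets later in this document):

.. code:: cpp

   // Sketch: partial dot product per thread, then a reduction across the
   // THREAD_GROUP_SIZE threads that share one query/key token pair.
   float qk = 0.f;
   #pragma unroll
   for (int i = 0; i < NUM_VECS_PER_THREAD; i++) {
     qk += dot(q_vecs[thread_group_offset][i], k_vecs[i]);  // partial sum over this thread's vecs
   }
   #pragma unroll
   for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
     qk += VLLM_SHFL_XOR_SYNC(qk, mask);  // combine partial sums within the group
   }
   // After the loop, every thread in the group holds the full dot product.
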
Softmax
-------

- Next, we need to calculate the normalized softmax for all ``qk``\ s,
  as shown in the formula below, where each :math:`x` represents a
  ``qk``. To do this, we must obtain the reduced value of ``qk_max``
  (:math:`m(x)`) and the ``exp_sum`` (:math:`\ell(x)`) of all
  ``qk``\ s. The reduction should be performed across the entire thread
  block, encompassing results between the query token and all context
  key tokens.

.. math::
   :nowrap:

   \begin{gather*}
   m(x) := \max_i x_i \\
   f(x) := \left[\begin{array}{lll} e^{x_1 - m(x)} & \ldots & e^{x_B - m(x)} \end{array}\right] \\
   \ell(x) := \sum_i f(x)_i \\
   \operatorname{softmax}(x) := \frac{f(x)}{\ell(x)}
   \end{gather*}

``qk_max`` and ``logits``
~~~~~~~~~~~~~~~~~~~~~~~~~

- Right after we get the ``qk`` result, we can set the temporary
  ``logits`` result to ``qk`` (in the end, ``logits`` should store the
  normalized softmax result). We can also compare and collect the
  ``qk_max`` for all ``qk``\ s that are calculated by the current
  thread group.

.. code:: cpp

   if (thread_group_offset == 0) {
     const bool mask = token_idx >= context_len;
     logits[token_idx - start_token_idx] = mask ? 0.f : qk;
     qk_max = mask ? qk_max : fmaxf(qk_max, qk);
   }

- Please note that ``logits`` here is in shared memory, so each thread
  group will set the fields for its own assigned context tokens.
  Overall, the size of ``logits`` should be the number of context
  tokens.

.. code:: cpp

   for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
     qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
   }

   if (lane == 0) {
     red_smem[warp_idx] = qk_max;
   }

- Then we need to get the reduced ``qk_max`` within each warp. The main
  idea is to let the threads in a warp communicate with each other and
  obtain the final max ``qk``.

.. code:: cpp

   for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
     qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
   }
   qk_max = VLLM_SHFL_SYNC(qk_max, 0);

- Finally, we can get the reduced ``qk_max`` for the whole thread block
  by comparing the ``qk_max`` from all warps in this thread block. Then
  we need to broadcast the final result to each thread.

``exp_sum``
~~~~~~~~~~~

- Similar to ``qk_max``, we need to get the reduced sum value from the
  entire thread block too.

.. code:: cpp

   for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
     float val = __expf(logits[i] - qk_max);
     logits[i] = val;
     exp_sum += val;
   }
   ...
   exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);

- First, sum all the exp values from each thread group and, meanwhile,
  convert each entry of ``logits`` from ``qk`` to ``exp(qk - qk_max)``.
  Please note that the ``qk_max`` here is already the max ``qk`` across
  the whole thread block. Then we can do the reduction for ``exp_sum``
  across the whole thread block, just like for ``qk_max``.

.. code:: cpp

   const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
   for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
     logits[i] *= inv_sum;
   }

- Finally, with the reduced ``qk_max`` and ``exp_sum``, we can obtain
  the final normalized softmax result as ``logits``. This ``logits``
  variable will be used for dot multiplication with the value data in
  later steps. Now, it should store the normalized softmax result of
  ``qk`` for all assigned context tokens.

Value
-----

.. figure:: ../../assets/kernel/value.png
   :alt: value
   :width: 70%
   :align: center

   Value data of all context tokens at one head

.. figure:: ../../assets/kernel/logits_vec.png
   :alt: logits_vec
   :width: 50%
   :align: center

   ``logits_vec`` for one thread

.. figure:: ../../assets/kernel/v_vec.png
   :alt: v_vec
   :width: 70%
   :align: center

   List of ``v_vec`` for one thread

- Now we need to retrieve the value data and perform the dot
  multiplication with ``logits``. Unlike query and key, there is no
  thread group concept for value data. As shown in the diagram, in
  contrast to the key token memory layout, elements from the same column
  correspond to the same value token. For one block of value data, there
  are ``HEAD_SIZE`` rows and ``BLOCK_SIZE`` columns that are split into
  multiple ``v_vec``\ s.
- Each thread always fetches ``V_VEC_SIZE`` elements from the same
  ``V_VEC_SIZE`` tokens at a time. As a result, a single thread
  retrieves multiple ``v_vec``\ s from different rows and the same
  columns through multiple inner iterations. For each ``v_vec``, it
  needs to be dot multiplied with the corresponding ``logits_vec``,
  which is also ``V_VEC_SIZE`` elements from ``logits``. Overall, with
  multiple inner iterations, each warp will process one block of value
  tokens, and with multiple outer iterations, the whole context's value
  tokens are processed.

.. code:: cpp

   float accs[NUM_ROWS_PER_THREAD];
   for ... { // Iteration over different blocks.
     logits_vec = ...
     for ... { // Iteration over different rows.
       v_vec = ...
       ...
       accs[i] += dot(logits_vec, v_vec);
     }
   }

- As shown in the above pseudo code, in the outer loop, similar to
  ``k_ptr``, ``logits_vec`` iterates over different blocks and reads
  ``V_VEC_SIZE`` elements from ``logits``. In the inner loop, each
  thread reads ``V_VEC_SIZE`` elements from the same tokens as a
  ``v_vec`` and performs the dot multiplication. It is important to note
  that in each inner iteration, the thread fetches different head
  position elements for the same tokens. The dot result is then
  accumulated in ``accs``. Therefore, each entry of ``accs`` is mapped
  to a head position assigned to the current thread.
- For example, if ``BLOCK_SIZE`` is 16 and ``V_VEC_SIZE`` is 8, each
  thread fetches 8 value elements for 8 tokens at a time. Each element
  is from a different token at the same head position. If ``HEAD_SIZE``
  is 128 and ``WARP_SIZE`` is 32, for each inner loop, a warp needs to
  fetch ``WARP_SIZE * V_VEC_SIZE = 256`` elements. This means there are
  a total of 128 \* 16 / 256 = 8 inner iterations for a warp to handle a
  whole block of value tokens, and each ``accs`` in each thread contains
  8 elements accumulated at 8 different head positions. For thread 0,
  the ``accs`` variable will have 8 elements, which are the 0th,
  32nd, …, 224th elements of a value head, accumulated from all 8
  assigned tokens. A sketch of this addressing is shown below.

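
For illustration, here is a hedged sketch of how one ``v_vec`` and its accumulator row could be addressed. The variable names mirror the snippets in this walkthrough; ``v_ptr`` is assumed to point at the current value block for the assigned head, analogous to ``k_ptr``, and the exact bounds handling of the real kernel is omitted:

.. code:: cpp

   // Sketch: one inner iteration of the value stage for one thread.
   // Each thread owns NUM_ROWS_PER_THREAD head positions (rows); for row i,
   // it loads V_VEC_SIZE elements of that row for V_VEC_SIZE tokens at once.
   for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
     const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
     if (row_idx < HEAD_SIZE) {
       const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
       V_vec v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
       accs[i] += dot(logits_vec, v_vec);  // accumulate softmax-weighted values
     }
   }
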
LV
---

- Now, we need to perform a reduction on ``accs`` within each warp. This
  process allows each thread to accumulate the ``accs`` for the assigned
  head positions of all tokens in one block.

.. code:: cpp

   for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
     float acc = accs[i];
     for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
       acc += VLLM_SHFL_XOR_SYNC(acc, mask);
     }
     accs[i] = acc;
   }

- Next, we perform a reduction on ``accs`` across all warps, allowing
  each thread to have the accumulation of ``accs`` for the assigned head
  positions of all context tokens. Please note that each ``accs`` in
  every thread only stores the accumulation for a portion of the
  elements of the entire head for all context tokens. However, overall,
  all results for the output have been calculated; they are just stored
  in the register memory of different threads.

.. code:: cpp

   float* out_smem = reinterpret_cast<float*>(shared_mem);
   for (int i = NUM_WARPS; i > 1; i /= 2) {
     // Upper warps write to shared memory.
     ...
     float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
     for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
       ...
       dst[row_idx] = accs[i];
     }

     // Lower warps update the output.
     const float* src = &out_smem[warp_idx * HEAD_SIZE];
     for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
       ...
       accs[i] += src[row_idx];
     }

     // Write out the accs.
   }

Output
------

- Now we can write all of the calculated results from local register
  memory to the final output in global memory.

.. code:: cpp

   scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
                           + head_idx * max_num_partitions * HEAD_SIZE
                           + partition_idx * HEAD_SIZE;

- First, we need to define the ``out_ptr`` variable, which points to
  the start address of the assigned sequence and assigned head.

.. code:: cpp

   for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
     const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
     if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
       from_float(*(out_ptr + row_idx), accs[i]);
     }
   }

- Finally, we need to iterate over the different assigned head positions
  and write out the corresponding accumulated result based on
  ``out_ptr``.
4
docs/source/dev/sampling_params.rst
Normal file
@ -0,0 +1,4 @@
|
||||
Sampling Params
|
||||
===============
|
||||
|
||||
.. automodule:: vllm.sampling_params.SamplingParams
|
@ -100,7 +100,7 @@ You can build and install vLLM from source:
|
||||
|
||||
Build a docker image from `Dockerfile.rocm`, and launch a docker container.
|
||||
|
||||
The `Dokerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments:
|
||||
The `Dockerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments:
|
||||
|
||||
* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
|
||||
* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
|
||||
|
@ -60,6 +60,15 @@ You can also build and install vLLM from source:
|
||||
$ cd vllm
|
||||
$ pip install -e . # This may take 5-10 minutes.
|
||||
|
||||
.. tip::
|
||||
To avoid your system being overloaded, you can limit the number of compilation jobs
|
||||
to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ export MAX_JOBS=6
|
||||
$ pip install -e .
|
||||
|
||||
.. tip::
|
||||
If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
|
||||
|
||||
|
136
docs/source/getting_started/neuron-installation.rst
Normal file
@ -0,0 +1,136 @@
|
||||
.. _installation_neuron:
|
||||
|
||||
Installation with Neuron
|
||||
========================
|
||||
|
||||
vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK.
|
||||
At the moment Paged Attention is not supported in Neuron SDK, but naive continuous batching is supported in transformers-neuronx.
|
||||
Data types currently supported in Neuron SDK are FP16 and BF16.
|
||||
|
||||
Requirements
|
||||
------------
|
||||
|
||||
* OS: Linux
|
||||
* Python: 3.8 -- 3.11
|
||||
* Accelerator: NeuronCore_v2 (in trn1/inf2 instances)
|
||||
* Pytorch 2.0.1/2.1.1
|
||||
* AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
|
||||
|
||||
Installation steps:
|
||||
|
||||
- :ref:`Build from source <build_from_source_neuron>`
|
||||
|
||||
- :ref:`Step 0. Launch Trn1/Inf2 instances <launch_instances>`
|
||||
- :ref:`Step 1. Install drivers and tools <install_drivers>`
|
||||
- :ref:`Step 2. Install transformers-neuronx and its dependencies <install_tnx>`
|
||||
- :ref:`Step 3. Install vLLM from source <install_vllm>`
|
||||
|
||||
.. _build_from_source_neuron:
|
||||
|
||||
Build from source
|
||||
-----------------
|
||||
|
||||
Following instructions are applicable to Neuron SDK 2.16 and beyond.
|
||||
|
||||
.. _launch_instances:
|
||||
|
||||
Step 0. Launch Trn1/Inf2 instances
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Here are the steps to launch trn1/inf2 instances, in order to install `PyTorch Neuron ("torch-neuronx") Setup on Ubuntu 22.04 LTS <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/setup/neuron-setup/pytorch/neuronx/ubuntu/torch-neuronx-ubuntu22.html>`_.
|
||||
|
||||
- Please follow the instructions at `launch an Amazon EC2 Instance <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html#ec2-launch-instance>`_ to launch an instance. When choosing the instance type at the EC2 console, please make sure to select the correct instance type.
|
||||
- To get more information about instances sizes and pricing see: `Trn1 web page <https://aws.amazon.com/ec2/instance-types/trn1/>`_, `Inf2 web page <https://aws.amazon.com/ec2/instance-types/inf2/>`_
|
||||
- Select the Ubuntu Server 22.04 LTS AMI
|
||||
- When launching a Trn1/Inf2, please adjust your primary EBS volume size to a minimum of 512GB.
|
||||
- After launching the instance, follow the instructions in `Connect to your instance <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html>`_ to connect to the instance
|
||||
|
||||
.. _install_drivers:
|
||||
|
||||
Step 1. Install drivers and tools
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Installing the drivers and tools is not necessary if the `Deep Learning AMI Neuron <https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html>`_ is used. If the drivers and tools are not already installed on the operating system, follow the steps below:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# Configure Linux for Neuron repository updates
|
||||
. /etc/os-release
|
||||
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
|
||||
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
|
||||
EOF
|
||||
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
|
||||
|
||||
# Update OS packages
|
||||
sudo apt-get update -y
|
||||
|
||||
# Install OS headers
|
||||
sudo apt-get install linux-headers-$(uname -r) -y
|
||||
|
||||
# Install git
|
||||
sudo apt-get install git -y
|
||||
|
||||
# install Neuron Driver
|
||||
sudo apt-get install aws-neuronx-dkms=2.* -y
|
||||
|
||||
# Install Neuron Runtime
|
||||
sudo apt-get install aws-neuronx-collectives=2.* -y
|
||||
sudo apt-get install aws-neuronx-runtime-lib=2.* -y
|
||||
|
||||
# Install Neuron Tools
|
||||
sudo apt-get install aws-neuronx-tools=2.* -y
|
||||
|
||||
# Add PATH
|
||||
export PATH=/opt/aws/neuron/bin:$PATH
|
||||
|
||||
|
||||
.. _install_tnx:
|
||||
|
||||
Step 2. Install transformers-neuronx and its dependencies
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
`transformers-neuronx <https://github.com/aws-neuron/transformers-neuronx>`_ will be the backend to support inference on trn1/inf2 instances.
|
||||
Follow the steps below to install the transformers-neuronx package and its dependencies.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# Install Python venv
|
||||
sudo apt-get install -y python3.10-venv g++
|
||||
|
||||
# Create Python venv
|
||||
python3.10 -m venv aws_neuron_venv_pytorch
|
||||
|
||||
# Activate Python venv
|
||||
source aws_neuron_venv_pytorch/bin/activate
|
||||
|
||||
# Install Jupyter notebook kernel
|
||||
pip install ipykernel
|
||||
python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
|
||||
pip install jupyter notebook
|
||||
pip install environment_kernels
|
||||
|
||||
# Set pip repository pointing to the Neuron repository
|
||||
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
|
||||
|
||||
# Install wget, awscli
|
||||
python -m pip install wget
|
||||
python -m pip install awscli
|
||||
|
||||
# Update Neuron Compiler and Framework
|
||||
python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
|
||||
|
||||
.. _install_vllm:
|
||||
|
||||
Step 3. Install vLLM from source
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Once the neuronx-cc and transformers-neuronx packages are installed, we will be able to install vLLM as follows:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ git clone https://github.com/vllm-project/vllm.git
|
||||
$ cd vllm
|
||||
$ pip install -U -r requirements-neuron.txt
|
||||
$ pip install .
|
||||
|
||||
If neuron packages are detected correctly in the installation process, ``vllm-0.3.0+neuron212`` will be installed.
|
@ -62,19 +62,19 @@ Documentation
|
||||
|
||||
getting_started/installation
|
||||
getting_started/amd-installation
|
||||
getting_started/neuron-installation
|
||||
getting_started/quickstart
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Serving
|
||||
|
||||
serving/distributed_serving
|
||||
serving/run_on_sky
|
||||
serving/deploying_with_kserve
|
||||
serving/deploying_with_triton
|
||||
serving/openai_compatible_server
|
||||
serving/deploying_with_docker
|
||||
serving/serving_with_langchain
|
||||
serving/distributed_serving
|
||||
serving/metrics
|
||||
serving/usage_stats
|
||||
serving/integrations
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
@ -96,7 +96,9 @@ Documentation
|
||||
:maxdepth: 2
|
||||
:caption: Developer Documentation
|
||||
|
||||
dev/sampling_params
|
||||
dev/engine/engine_index
|
||||
dev/kernel/paged_attention
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
@ -56,8 +56,8 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
|
||||
- return_dict: Optional[bool] = None,
|
||||
-) -> Union[Tuple, CausalLMOutputWithPast]:
|
||||
+ positions: torch.Tensor,
|
||||
+ kv_caches: List[KVCache],
|
||||
+ input_metadata: InputMetadata,
|
||||
+ kv_caches: List[torch.Tensor],
|
||||
+ attn_metadata: AttentionMetadata,
|
||||
+) -> Optional[SamplerOutput]:
|
||||
|
||||
1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
|
||||
|
@ -81,6 +81,10 @@ Below, you can find an explanation of every engine argument for vLLM:
|
||||
|
||||
Token block size for contiguous chunks of tokens.
|
||||
|
||||
.. option:: --enable-prefix-caching
|
||||
|
||||
Enables automatic prefix caching
|
||||
|
||||
.. option:: --seed <seed>
|
||||
|
||||
Random seed for operations.
|
||||
|
@ -90,9 +90,10 @@ Requests can specify the LoRA adapter as if it were any other model via the ``mo
|
||||
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
|
||||
LoRA adapter requests if they were provided and ``max_loras`` is set high enough).
|
||||
|
||||
The following is an example request
|
||||
The following is an example request
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
.. code-block::bash
|
||||
curl http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
|
@ -8,84 +8,125 @@ The following is the list of model architectures that are currently supported by
|
||||
Alongside each architecture, we include some popular models that use it.
|
||||
|
||||
.. list-table::
|
||||
:widths: 25 25 50
|
||||
:widths: 25 25 50 5
|
||||
:header-rows: 1
|
||||
|
||||
* - Architecture
|
||||
- Models
|
||||
- Example HuggingFace Models
|
||||
- :ref:`LoRA <lora>`
|
||||
* - :code:`AquilaForCausalLM`
|
||||
- Aquila
|
||||
- :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc.
|
||||
- ✅︎
|
||||
* - :code:`BaiChuanForCausalLM`
|
||||
- Baichuan
|
||||
- :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc.
|
||||
- ✅︎
|
||||
* - :code:`ChatGLMModel`
|
||||
- ChatGLM
|
||||
- :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
|
||||
- ✅︎
|
||||
* - :code:`CohereForCausalLM`
|
||||
- Command-R
|
||||
- :code:`CohereForAI/c4ai-command-r-v01`, etc.
|
||||
-
|
||||
* - :code:`DbrxForCausalLM`
|
||||
- DBRX
|
||||
- :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc.
|
||||
-
|
||||
* - :code:`DeciLMForCausalLM`
|
||||
- DeciLM
|
||||
- :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc.
|
||||
-
|
||||
* - :code:`BloomForCausalLM`
|
||||
- BLOOM, BLOOMZ, BLOOMChat
|
||||
- :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
|
||||
-
|
||||
* - :code:`FalconForCausalLM`
|
||||
- Falcon
|
||||
- :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
|
||||
-
|
||||
* - :code:`GemmaForCausalLM`
|
||||
- Gemma
|
||||
- :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
|
||||
- ✅︎
|
||||
* - :code:`GPT2LMHeadModel`
|
||||
- GPT-2
|
||||
- :code:`gpt2`, :code:`gpt2-xl`, etc.
|
||||
-
|
||||
* - :code:`GPTBigCodeForCausalLM`
|
||||
- StarCoder, SantaCoder, WizardCoder
|
||||
- :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc.
|
||||
-
|
||||
* - :code:`GPTJForCausalLM`
|
||||
- GPT-J
|
||||
- :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc.
|
||||
-
|
||||
* - :code:`GPTNeoXForCausalLM`
|
||||
- GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
|
||||
- :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
|
||||
-
|
||||
* - :code:`InternLMForCausalLM`
|
||||
- InternLM
|
||||
- :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc.
|
||||
- ✅︎
|
||||
* - :code:`InternLM2ForCausalLM`
|
||||
- InternLM2
|
||||
- :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc.
|
||||
-
|
||||
* - :code:`JAISLMHeadModel`
|
||||
- Jais
|
||||
- :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc.
|
||||
-
|
||||
* - :code:`LlamaForCausalLM`
|
||||
- LLaMA, LLaMA-2, Vicuna, Alpaca, Yi
|
||||
- :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
|
||||
- ✅︎
|
||||
* - :code:`MistralForCausalLM`
|
||||
- Mistral, Mistral-Instruct
|
||||
- :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc.
|
||||
- ✅︎
|
||||
* - :code:`MixtralForCausalLM`
|
||||
- Mixtral-8x7B, Mixtral-8x7B-Instruct
|
||||
- :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.
|
||||
- ✅︎
|
||||
* - :code:`MPTForCausalLM`
|
||||
- MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
|
||||
- :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
|
||||
-
|
||||
* - :code:`OLMoForCausalLM`
|
||||
- OLMo
|
||||
- :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc.
|
||||
-
|
||||
* - :code:`OPTForCausalLM`
|
||||
- OPT, OPT-IML
|
||||
- :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
|
||||
-
|
||||
* - :code:`OrionForCausalLM`
|
||||
- Orion
|
||||
- :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc.
|
||||
-
|
||||
* - :code:`PhiForCausalLM`
|
||||
- Phi
|
||||
- :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
|
||||
-
|
||||
* - :code:`QWenLMHeadModel`
|
||||
- Qwen
|
||||
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
|
||||
-
|
||||
* - :code:`Qwen2ForCausalLM`
|
||||
- Qwen2
|
||||
- :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc.
|
||||
- ✅︎
|
||||
* - :code:`Qwen2MoeForCausalLM`
|
||||
- Qwen2MoE
|
||||
- :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
|
||||
-
|
||||
* - :code:`StableLmForCausalLM`
|
||||
- StableLM
|
||||
- :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc.
|
||||
-
|
||||
|
||||
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
|
||||
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
|
||||
|
8
docs/source/serving/deploying_with_bentoml.rst
Normal file
@ -0,0 +1,8 @@
|
||||
.. _deploying_with_bentoml:
|
||||
|
||||
Deploying with BentoML
|
||||
======================
|
||||
|
||||
`BentoML <https://github.com/bentoml/BentoML>`_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
|
||||
|
||||
For details, see the tutorial `vLLM inference in the BentoML documentation <https://docs.bentoml.com/en/latest/use-cases/large-language-models/vllm.html>`_.
|
11
docs/source/serving/integrations.rst
Normal file
@ -0,0 +1,11 @@
|
||||
Integrations
|
||||
------------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
||||
run_on_sky
|
||||
deploying_with_kserve
|
||||
deploying_with_triton
|
||||
deploying_with_bentoml
|
||||
serving_with_langchain
|
114
docs/source/serving/openai_compatible_server.md
Normal file
@ -0,0 +1,114 @@
|
||||
# OpenAI Compatible Server
|
||||
|
||||
vLLM provides an HTTP server that implements OpenAI's [Completions](https://platform.openai.com/docs/api-reference/completions) and [Chat](https://platform.openai.com/docs/api-reference/chat) API.
|
||||
|
||||
You can start the server using Python, or using [Docker](deploying_with_docker.rst):
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-hf --dtype float32 --api-key token-abc123
|
||||
```
|
||||
|
||||
To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
|
||||
```python
|
||||
from openai import OpenAI
|
||||
client = OpenAI(
|
||||
base_url="http://localhost:8000/v1",
|
||||
api_key="token-abc123",
|
||||
)
|
||||
|
||||
completion = client.chat.completions.create(
|
||||
model="meta-llama/Llama-2-7b-hf",
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Hello!"}
|
||||
]
|
||||
)
|
||||
|
||||
print(completion.choices[0].message)
|
||||
```
|
||||
|
||||
## API Reference
|
||||
Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except:
|
||||
- Chat: `tools`, and `tool_choice`.
|
||||
- Completions: `suffix`.
|
||||
|
||||
## Extra Parameters
|
||||
vLLM supports a set of parameters that are not part of the OpenAI API.
|
||||
In order to use them, you can pass them as extra parameters in the OpenAI client.
|
||||
Or merge them directly into the JSON payload if you are calling the HTTP API directly.
|
||||
|
||||
```python
|
||||
completion = client.chat.completions.create(
|
||||
model="meta-llama/Llama-2-7b-hf",
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
|
||||
],
|
||||
extra_body={
|
||||
"guided_choice": ["positive", "negative"]
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
### Extra Parameters for Chat API
|
||||
The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
|
||||
|
||||
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
|
||||
:language: python
|
||||
:start-after: begin-chat-completion-sampling-params
|
||||
:end-before: end-chat-completion-sampling-params
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
|
||||
:language: python
|
||||
:start-after: begin-chat-completion-extra-params
|
||||
:end-before: end-chat-completion-extra-params
|
||||
```
|
||||
|
||||
### Extra Parameters for Completions API
|
||||
The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
|
||||
|
||||
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
|
||||
:language: python
|
||||
:start-after: begin-completion-sampling-params
|
||||
:end-before: end-completion-sampling-params
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
|
||||
:language: python
|
||||
:start-after: begin-completion-extra-params
|
||||
:end-before: end-completion-extra-params
|
||||
```
|
||||
|
||||
## Chat Template
|
||||
|
||||
In order for the language model to support chat protocol, vLLM requires the model to include
|
||||
a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
|
||||
specifies how roles, messages, and other chat-specific tokens are encoded in the input.
|
||||
|
||||
An example chat template for `meta-llama/Llama-2-7b-chat-hf` can be found [here](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/09bd0f49e16738cdfaa6e615203e126038736eb0/tokenizer_config.json#L12)
|
||||
|
||||
Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those models,
|
||||
you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat
|
||||
template, or the template in string form. Without a chat template, the server will not be able to process chat
|
||||
and all chat requests will error.
|
||||
|
||||
```bash
|
||||
python -m vllm.entrypoints.openai.api_server \
|
||||
--model ... \
|
||||
--chat-template ./path-to-chat-template.jinja
|
||||
```
|
||||
|
||||
The vLLM community provides a set of chat templates for popular models. You can find them in the examples
|
||||
directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
|
||||
|
||||
## Command line arguments for the server
|
||||
|
||||
```{argparse}
|
||||
:module: vllm.entrypoints.openai.cli_args
|
||||
:func: make_arg_parser
|
||||
:prog: vllm-openai-server
|
||||
```
|
57
docs/source/serving/usage_stats.md
Normal file
@ -0,0 +1,57 @@
|
||||
# Usage Stats Collection
|
||||
|
||||
vLLM collects anonymous usage data by default to help the engineering team better understand which hardware and model configurations are widely used. This data allows them to prioritize their efforts on the most common workloads. The collected data is transparent, does not contain any sensitive information, and will be publicly released for the community's benefit.
|
||||
|
||||
## What data is collected?
|
||||
|
||||
You can see the up-to-date list of data collected by vLLM in [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py).
|
||||
|
||||
Here is an example as of v0.4.0:
|
||||
|
||||
```json
|
||||
{
|
||||
"uuid": "fbe880e9-084d-4cab-a395-8984c50f1109",
|
||||
"provider": "GCP",
|
||||
"num_cpu": 24,
|
||||
"cpu_type": "Intel(R) Xeon(R) CPU @ 2.20GHz",
|
||||
"cpu_family_model_stepping": "6,85,7",
|
||||
"total_memory": 101261135872,
|
||||
"architecture": "x86_64",
|
||||
"platform": "Linux-5.10.0-28-cloud-amd64-x86_64-with-glibc2.31",
|
||||
"gpu_count": 2,
|
||||
"gpu_type": "NVIDIA L4",
|
||||
"gpu_memory_per_device": 23580639232,
|
||||
"model_architecture": "OPTForCausalLM",
|
||||
"vllm_version": "0.3.2+cu123",
|
||||
"context": "LLM_CLASS",
|
||||
"log_time": 1711663373492490000,
|
||||
"source": "production",
|
||||
"dtype": "torch.float16",
|
||||
"tensor_parallel_size": 1,
|
||||
"block_size": 16,
|
||||
"gpu_memory_utilization": 0.9,
|
||||
"quantization": null,
|
||||
"kv_cache_dtype": "auto",
|
||||
"enable_lora": false,
|
||||
"enable_prefix_caching": false,
|
||||
"enforce_eager": false,
|
||||
"disable_custom_all_reduce": true
|
||||
}
|
||||
```
|
||||
|
||||
You can preview the collected data by running the following command:
|
||||
|
||||
```bash
|
||||
tail ~/.config/vllm/usage_stats.json
|
||||
```
|
||||
|
||||
## Opt-out of Usage Stats Collection
|
||||
|
||||
You can opt out of usage stats collection by setting the `VLLM_NO_USAGE_STATS` or `DO_NOT_TRACK` environment variable, or by creating a `~/.config/vllm/do_not_track` file:
|
||||
|
||||
```bash
|
||||
# Any of the following methods can disable usage stats collection
|
||||
export VLLM_NO_USAGE_STATS=1
|
||||
export DO_NOT_TRACK=1
|
||||
mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track
|
||||
```
|
@ -1,6 +1,7 @@
|
||||
import argparse
|
||||
from openai import OpenAI
|
||||
|
||||
import gradio as gr
|
||||
from openai import OpenAI
|
||||
|
||||
# Argument parser setup
|
||||
parser = argparse.ArgumentParser(
|
||||
|
90
examples/llava_example.py
Normal file
@ -0,0 +1,90 @@
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.sequence import MultiModalData
|
||||
|
||||
# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
|
||||
|
||||
|
||||
def run_llava_pixel_values():
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
image_input_type="pixel_values",
|
||||
image_token_id=32000,
|
||||
image_input_shape="1,3,336,336",
|
||||
image_feature_size=576,
|
||||
)
|
||||
|
||||
prompt = "<image>" * 576 + (
|
||||
"\nUSER: What is the content of this image?\nASSISTANT:")
|
||||
|
||||
# This should be provided by another online or offline component.
|
||||
images = torch.load("images/stop_sign_pixel_values.pt")
|
||||
|
||||
outputs = llm.generate(prompt,
|
||||
multi_modal_data=MultiModalData(
|
||||
type=MultiModalData.Type.IMAGE, data=images))
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
|
||||
|
||||
def run_llava_image_features():
|
||||
llm = LLM(
|
||||
model="llava-hf/llava-1.5-7b-hf",
|
||||
image_input_type="image_features",
|
||||
image_token_id=32000,
|
||||
image_input_shape="1,576,1024",
|
||||
image_feature_size=576,
|
||||
)
|
||||
|
||||
prompt = "<image>" * 576 + (
|
||||
"\nUSER: What is the content of this image?\nASSISTANT:")
|
||||
|
||||
# This should be provided by another online or offline component.
|
||||
images = torch.load("images/stop_sign_image_features.pt")
|
||||
|
||||
outputs = llm.generate(prompt,
|
||||
multi_modal_data=MultiModalData(
|
||||
type=MultiModalData.Type.IMAGE, data=images))
|
||||
for o in outputs:
|
||||
generated_text = o.outputs[0].text
|
||||
print(generated_text)
|
||||
|
||||
|
||||
def main(args):
|
||||
if args.type == "pixel_values":
|
||||
run_llava_pixel_values()
|
||||
else:
|
||||
run_llava_image_features()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Demo on Llava")
|
||||
parser.add_argument("--type",
|
||||
type=str,
|
||||
choices=["pixel_values", "image_features"],
|
||||
default="pixel_values",
|
||||
help="image input type")
|
||||
args = parser.parse_args()
|
||||
# Download from s3
|
||||
s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
|
||||
local_directory = "images"
|
||||
|
||||
# Make sure the local directory exists or create it
|
||||
os.makedirs(local_directory, exist_ok=True)
|
||||
|
||||
# Use AWS CLI to sync the directory, assume anonymous access
|
||||
subprocess.check_call([
|
||||
"aws",
|
||||
"s3",
|
||||
"sync",
|
||||
s3_bucket_path,
|
||||
local_directory,
|
||||
"--no-sign-request",
|
||||
])
|
||||
main(args)
|
@ -1,7 +1,7 @@
|
||||
import argparse
|
||||
from typing import List, Tuple
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
|
||||
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
|
||||
|
||||
|
||||
def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
|
||||
|
@ -1,14 +1,15 @@
|
||||
"""
|
||||
This example shows how to use the multi-LoRA functionality for offline inference.
|
||||
This example shows how to use the multi-LoRA functionality
|
||||
for offline inference.
|
||||
|
||||
Requires HuggingFace credentials for access to Llama2.
|
||||
"""
|
||||
|
||||
from typing import Optional, List, Tuple
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
|
||||
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
|
||||
@ -16,7 +17,7 @@ def create_test_prompts(
|
||||
lora_path: str
|
||||
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
|
||||
"""Create a list of test prompts with their sampling parameters.
|
||||
|
||||
|
||||
2 requests for base model, 4 requests for the LoRA. We define 2
|
||||
different LoRA adapters (using the same model for demo purposes).
|
||||
Since we also set `max_loras=1`, the expectation is that the requests
|
||||
@ -34,36 +35,40 @@ def create_test_prompts(
|
||||
top_k=5,
|
||||
presence_penalty=0.2,
|
||||
max_tokens=128), None),
|
||||
("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
|
||||
SamplingParams(temperature=0.0,
|
||||
logprobs=1,
|
||||
prompt_logprobs=1,
|
||||
max_tokens=128,
|
||||
stop_token_ids=[32003]),
|
||||
LoRARequest("sql-lora", 1, lora_path)),
|
||||
("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
|
||||
SamplingParams(n=3,
|
||||
best_of=3,
|
||||
use_beam_search=True,
|
||||
temperature=0,
|
||||
max_tokens=128,
|
||||
stop_token_ids=[32003]),
|
||||
LoRARequest("sql-lora", 1, lora_path)),
|
||||
("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
|
||||
SamplingParams(temperature=0.0,
|
||||
logprobs=1,
|
||||
prompt_logprobs=1,
|
||||
max_tokens=128,
|
||||
stop_token_ids=[32003]),
|
||||
LoRARequest("sql-lora2", 2, lora_path)),
|
||||
("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
|
||||
SamplingParams(n=3,
|
||||
best_of=3,
|
||||
use_beam_search=True,
|
||||
temperature=0,
|
||||
max_tokens=128,
|
||||
stop_token_ids=[32003]),
|
||||
LoRARequest("sql-lora", 1, lora_path)),
|
||||
(
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
|
||||
SamplingParams(temperature=0.0,
|
||||
logprobs=1,
|
||||
prompt_logprobs=1,
|
||||
max_tokens=128,
|
||||
stop_token_ids=[32003]),
|
||||
LoRARequest("sql-lora", 1, lora_path)),
|
||||
(
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
|
||||
SamplingParams(n=3,
|
||||
best_of=3,
|
||||
use_beam_search=True,
|
||||
temperature=0,
|
||||
max_tokens=128,
|
||||
stop_token_ids=[32003]),
|
||||
LoRARequest("sql-lora", 1, lora_path)),
|
||||
(
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
|
||||
SamplingParams(temperature=0.0,
|
||||
logprobs=1,
|
||||
prompt_logprobs=1,
|
||||
max_tokens=128,
|
||||
stop_token_ids=[32003]),
|
||||
LoRARequest("sql-lora2", 2, lora_path)),
|
||||
(
|
||||
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
|
||||
SamplingParams(n=3,
|
||||
best_of=3,
|
||||
use_beam_search=True,
|
||||
temperature=0,
|
||||
max_tokens=128,
|
||||
stop_token_ids=[32003]),
|
||||
LoRARequest("sql-lora", 1, lora_path)),
|
||||
]
|
||||
|
||||
|
||||
|
@ -5,11 +5,13 @@ distributively on a multi-nodes cluster.
|
||||
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
|
||||
"""
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from typing import Dict
|
||||
|
||||
import numpy as np
|
||||
import ray
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
# Create a sampling params object.
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
|
15
examples/offline_inference_neuron.py
Normal file → Executable file
@ -12,17 +12,20 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(
|
||||
model="openlm-research/open_llama_3b",
|
||||
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
|
||||
max_num_seqs=8,
|
||||
# The max_model_len and block_size arguments are required to be same as max sequence length,
|
||||
# when targeting neuron device. Currently, this is a known limitation in continuous batching
|
||||
# support in transformers-neuronx.
|
||||
# The max_model_len and block_size arguments are required to be same as
|
||||
# max sequence length when targeting neuron device.
|
||||
# Currently, this is a known limitation in continuous batching support
|
||||
# in transformers-neuronx.
|
||||
# TODO(liangfu): Support paged-attention in transformers-neuronx.
|
||||
max_model_len=128,
|
||||
block_size=128,
|
||||
# The device can be automatically detected when AWS Neuron SDK is installed.
|
||||
# The device argument can be either unspecified for automated detection, or explicitly assigned.
|
||||
device="neuron")
|
||||
# The device argument can be either unspecified for automated detection,
|
||||
# or explicitly assigned.
|
||||
device="neuron",
|
||||
tensor_parallel_size=2)
|
||||
# Generate texts from the prompts. The output is a list of RequestOutput objects
|
||||
# that contain the prompt, generated text, and other information.
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
@ -22,7 +22,7 @@ prompts = [
|
||||
sampling_params = SamplingParams(temperature=0.0)
|
||||
|
||||
# Create an LLM.
|
||||
llm = LLM(model="facebook/opt-125m")
|
||||
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
|
||||
|
||||
generating_prompts = [prefix + prompt for prompt in prompts]
|
||||
|
||||
@ -37,20 +37,14 @@ for output in outputs:
|
||||
|
||||
print("-" * 80)
|
||||
|
||||
# -1 since the last token can change when concatenating prompts.
|
||||
prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1
|
||||
|
||||
# The llm.generate call will batch all prompts and send the batch at once if resources allow.
|
||||
# The prefix will only be cached after the first batch is processed, so we need to call generate once
|
||||
# to calculate the prefix and cache it.
|
||||
outputs = llm.generate(generating_prompts[0],
|
||||
sampling_params,
|
||||
prefix_pos=[prefix_pos])
|
||||
# The llm.generate call will batch all prompts and send the batch at once
|
||||
# if resources allow. The prefix will only be cached after the first batch
|
||||
# is processed, so we need to call generate once to calculate the prefix
|
||||
# and cache it.
|
||||
outputs = llm.generate(generating_prompts[0], sampling_params)
|
||||
|
||||
# Subsequent batches can leverage the cached prefix
|
||||
outputs = llm.generate(generating_prompts,
|
||||
sampling_params,
|
||||
prefix_pos=[prefix_pos] * len(generating_prompts))
|
||||
outputs = llm.generate(generating_prompts, sampling_params)
|
||||
|
||||
# Print the outputs. You should see the same outputs as before
|
||||
for output in outputs:
|
||||
|
@ -1,35 +1,4 @@
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "prometheus",
|
||||
"description": "",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__elements": {},
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "10.2.3"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "timeseries",
|
||||
"name": "Time series",
|
||||
"version": ""
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
@ -42,6 +11,12 @@
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"target": {
|
||||
"limit": 100,
|
||||
"matchAny": false,
|
||||
"tags": [],
|
||||
"type": "dashboard"
|
||||
},
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
@ -50,14 +25,14 @@
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"id": 29,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"description": "End to end request latency measured in seconds.",
|
||||
"fieldConfig": {
|
||||
@ -66,7 +41,6 @@
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
@ -80,7 +54,6 @@
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
@ -138,11 +111,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"instant": false,
|
||||
@ -154,11 +127,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -171,11 +144,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -188,11 +161,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -205,10 +178,10 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(vllm:e2e_request_latency_seconds_sum[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count[$__rate_interval])",
|
||||
"expr": "rate(vllm:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "Average",
|
||||
@ -222,7 +195,7 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"description": "Number of tokens processed per second",
|
||||
"fieldConfig": {
|
||||
@ -231,7 +204,6 @@
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
@ -245,7 +217,6 @@
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
@ -302,11 +273,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "rate(vllm:prompt_tokens_total[$__rate_interval])",
|
||||
"expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"instant": false,
|
||||
@ -318,11 +289,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "rate(vllm:generation_tokens_total[$__rate_interval])",
|
||||
"expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -339,7 +310,7 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"description": "Inter token latency in seconds.",
|
||||
"fieldConfig": {
|
||||
@ -348,7 +319,6 @@
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
@ -362,7 +332,6 @@
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
@ -420,11 +389,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"instant": false,
|
||||
@ -436,11 +405,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -453,11 +422,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -470,11 +439,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -487,10 +456,10 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(vllm:time_per_output_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count[$__rate_interval])",
|
||||
"expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "Mean",
|
||||
@ -504,7 +473,7 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"description": "Number of requests in RUNNING, WAITING, and SWAPPED state",
|
||||
"fieldConfig": {
|
||||
@ -513,7 +482,6 @@
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
@ -527,7 +495,6 @@
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
@ -585,11 +552,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "vllm:num_requests_running",
|
||||
"expr": "vllm:num_requests_running{model_name=\"$model_name\"}",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": false,
|
||||
@ -601,11 +568,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "vllm:num_requests_swapped",
|
||||
"expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
@ -618,11 +585,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "vllm:num_requests_waiting",
|
||||
"expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": true,
|
||||
@ -639,7 +606,7 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"description": "P50, P90, P95, and P99 TTFT latency in seconds.",
|
||||
"fieldConfig": {
|
||||
@ -648,7 +615,6 @@
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
@ -662,7 +628,6 @@
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
@ -720,11 +685,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -737,11 +702,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": false,
|
||||
"instant": false,
|
||||
@ -753,11 +718,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -770,11 +735,11 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))",
|
||||
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
|
||||
"fullMetaSearch": false,
|
||||
"hide": false,
|
||||
"includeNullMetadata": false,
|
||||
@ -787,10 +752,10 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "rate(vllm:time_to_first_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count[$__rate_interval])",
|
||||
"expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "Average",
|
||||
@ -804,7 +769,7 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"description": "Percentage of used cache blocks by vLLM.",
|
||||
"fieldConfig": {
|
||||
@ -813,7 +778,6 @@
|
||||
"mode": "palette-classic"
|
||||
},
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
@ -827,7 +791,6 @@
|
||||
"tooltip": false,
|
||||
"viz": false
|
||||
},
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
@ -885,10 +848,10 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "vllm:gpu_cache_usage_perc",
|
||||
"expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}",
|
||||
"instant": false,
|
||||
"legendFormat": "GPU Cache Usage",
|
||||
"range": true,
|
||||
@ -897,10 +860,10 @@
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "${DS_PROMETHEUS}"
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "vllm:cpu_cache_usage_perc",
|
||||
"expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
|
||||
"hide": false,
|
||||
"instant": false,
|
||||
"legendFormat": "CPU Cache Usage",
|
||||
@ -913,10 +876,39 @@
|
||||
}
|
||||
  ],
  "refresh": "",
  "schemaVersion": 39,
  "schemaVersion": 37,
  "style": "dark",
  "tags": [],
  "templating": {
    "list": []
    "list": [
      {
        "current": {
          "selected": false,
          "text": "vllm",
          "value": "vllm"
        },
        "datasource": {
          "type": "prometheus",
          "uid": "prometheus"
        },
        "definition": "label_values(model_name)",
        "hide": 0,
        "includeAll": false,
        "label": "model_name",
        "multi": false,
        "name": "model_name",
        "options": [],
        "query": {
          "query": "label_values(model_name)",
          "refId": "StandardVariableQuery"
        },
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 0,
        "type": "query"
      }
    ]
  },
  "time": {
    "from": "now-5m",
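Taken together, the dashboard changes above swap the ${DS_PROMETHEUS} placeholder for the plain "prometheus" datasource uid, scope every panel query with a {model_name="$model_name"} selector, and add the model_name template variable (backed by label_values(model_name)) shown in the templating block, so panels can be filtered per served model. The sketch below is a hypothetical, minimal Python exporter showing the kind of model_name-labelled metric those selectors assume; the metric and label names are copied from the dashboard, while the port, model id, and value are purely illustrative.

# Hypothetical exporter sketch (not vLLM's metrics code): a gauge labelled by
# model_name, which is what the dashboard's $model_name selectors filter on.
import time

from prometheus_client import Gauge, start_http_server

running = Gauge(
    "vllm:num_requests_running",            # metric name as used by the dashboard
    "Number of requests currently running",
    labelnames=["model_name"],              # label the template variable is built from
)

if __name__ == "__main__":
    start_http_server(8000)                 # serves /metrics for Prometheus to scrape
    running.labels(model_name="facebook/opt-125m").set(3)  # example model and value
    time.sleep(600)                         # keep the process alive while it is scraped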
@ -1,22 +1,13 @@
|
||||
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
|
||||
|
||||
{% for message in messages %}
|
||||
{% if message['role'] == 'user' %}
|
||||
<reserved_106>
|
||||
{{ message['content']|trim -}}
|
||||
{% if not loop.last %}
|
||||
{%- for message in messages -%}
|
||||
{%- if message['role'] == 'user' -%}
|
||||
{{- '<reserved_106>' + message['content'] -}}
|
||||
{%- elif message['role'] == 'assistant' -%}
|
||||
{{- '<reserved_107>' + message['content'] -}}
|
||||
{%- endif -%}
|
||||
{%- endfor -%}
|
||||
|
||||
|
||||
{% endif %}
|
||||
{% elif message['role'] == 'assistant' %}
|
||||
<reserved_107>
|
||||
{{ message['content']|trim -}}
|
||||
{% if not loop.last %}
|
||||
|
||||
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
|
||||
<reserved_107>
|
||||
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
|
||||
{{- '<reserved_107>' -}}
|
||||
{% endif %}
|
18
examples/template_chatglm.jinja
Normal file
@ -0,0 +1,18 @@
{%- set counter = namespace(index=0) -%}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}}
{%- set counter.index = counter.index + 1 -%}
{%- endif -%}
{%- if message['role'] == 'assistant' -%}
{{- '\n答:' + message['content'] -}}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '\n' -}}
{%- endif -%}
{%- endif -%}
{%- endfor -%}


{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- '\n答:' -}}
{%- endif -%}
18
examples/template_chatglm2.jinja
Normal file
@ -0,0 +1,18 @@
{%- set counter = namespace(index=1) -%}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}}
{%- set counter.index = counter.index + 1 -%}
{%- endif -%}
{%- if message['role'] == 'assistant' -%}
{{- '\n\n答:' + message['content'] -}}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '\n\n' -}}
{%- endif -%}
{%- endif -%}
{%- endfor -%}


{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- '\n\n答:' -}}
{%- endif -%}
15
examples/template_falcon.jinja
Normal file
@ -0,0 +1,15 @@
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- 'User: ' + message['content'] -}}
{%- elif message['role'] == 'assistant' -%}
{{- 'Assistant: ' + message['content'] -}}
{%- endif -%}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '\n' -}}
{%- endif -%}
{%- endfor -%}


{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- 'Assistant:' -}}
{% endif %}
17
examples/template_falcon_180b.jinja
Normal file
@ -0,0 +1,17 @@
{%- for message in messages -%}
{%- if message['role'] == 'system' -%}
{{- 'System: ' + message['content'] -}}
{%- elif message['role'] == 'user' -%}
{{- 'User: ' + message['content'] -}}
{%- elif message['role'] == 'assistant' -%}
{{- 'Falcon: ' + message['content'] -}}
{%- endif -%}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '\n' -}}
{%- endif -%}
{%- endfor -%}


{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- 'Falcon:' -}}
{% endif %}
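These templates are plain Jinja2, so they can be previewed outside the server before being handed to the OpenAI-compatible endpoint (e.g. via a --chat-template flag). A minimal sketch, assuming examples/template_falcon.jinja is on disk and using the stock jinja2 package rather than vLLM's own template loader:

# Minimal sketch: render one of the new chat templates directly with Jinja2.
from jinja2 import Template

with open("examples/template_falcon.jinja") as f:
    template = Template(f.read())

messages = [
    {"role": "user", "content": "Hello"},
    {"role": "assistant", "content": "Hi there"},
    {"role": "user", "content": "What is vLLM?"},
]

prompt = template.render(messages=messages, add_generation_prompt=True)
print(prompt)
# Expected output (roughly):
# User: Hello
# Assistant: Hi there
# User: What is vLLM?
# Assistant: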
50
format.sh
@ -25,6 +25,7 @@ YAPF_VERSION=$(yapf --version | awk '{print $2}')
|
||||
RUFF_VERSION=$(ruff --version | awk '{print $2}')
|
||||
MYPY_VERSION=$(mypy --version | awk '{print $2}')
|
||||
CODESPELL_VERSION=$(codespell --version)
|
||||
ISORT_VERSION=$(isort --vn)
|
||||
|
||||
# # params: tool name, tool version, required version
|
||||
tool_version_check() {
|
||||
@ -37,6 +38,7 @@ tool_version_check() {
|
||||
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
|
||||
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
|
||||
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
|
||||
tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)"
|
||||
tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)"
|
||||
|
||||
YAPF_FLAGS=(
|
||||
@ -95,13 +97,17 @@ echo 'vLLM yapf: Done'
|
||||
# echo 'vLLM mypy:'
|
||||
# mypy
|
||||
|
||||
CODESPELL_EXCLUDES=(
|
||||
'--skip' '*docs/source/_build/**'
|
||||
)
|
||||
|
||||
# check spelling of specified files
|
||||
spell_check() {
|
||||
codespell "$@"
|
||||
}
|
||||
|
||||
spell_check_all(){
|
||||
codespell --toml pyproject.toml
|
||||
codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}"
|
||||
}
|
||||
|
||||
# Spelling check of files that differ from main branch.
|
||||
@ -116,7 +122,7 @@ spell_check_changed() {
|
||||
|
||||
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
|
||||
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
|
||||
codespell
|
||||
codespell "${CODESPELL_EXCLUDES[@]}"
|
||||
fi
|
||||
}
|
||||
|
||||
@ -174,6 +180,46 @@ else
|
||||
lint_changed
|
||||
fi
|
||||
|
||||
# check import order of specified files
isort_check() {
    isort "$@"
}

isort_check_all(){
    isort .
}

# Import-order check of files that differ from main branch.
isort_check_changed() {
    # The `if` guard ensures that the list of filenames is not empty, which
    # could cause isort to receive 0 positional arguments, making it hang
    # waiting for STDIN.
    #
    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
    # exist on both branches.
    MERGEBASE="$(git merge-base origin/main HEAD)"

    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
            isort
    fi
}

# Run isort
# This flag runs an import-order check of individual files. --files *must* be the first command line
# arg to use this option.
if [[ "$1" == '--files' ]]; then
    isort_check "${@:2}"
# If `--all` is passed, then any further arguments are ignored and the
# entire python directory is linted.
elif [[ "$1" == '--all' ]]; then
    isort_check_all
else
    # Check import order only of the files that changed in last commit.
    isort_check_changed
fi
echo 'vLLM isort: Done'
||||
|
||||
if ! git diff --quiet &>/dev/null; then
|
||||
echo 'Reformatted files. Please review and stage the changes.'
|
||||
echo 'Changes not staged for commit:'
|
||||
|
@ -1,6 +1,7 @@
|
||||
[build-system]
|
||||
# Should be mirrored in requirements-build.txt
|
||||
requires = [
|
||||
"cmake>=3.21",
|
||||
"ninja",
|
||||
"packaging",
|
||||
"setuptools >= 49.4.0",
|
||||
@ -9,6 +10,10 @@ requires = [
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.ruff]
|
||||
# Allow lines to be as long as 80.
|
||||
line-length = 80
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
# pycodestyle
|
||||
@ -29,10 +34,6 @@ ignore = [
|
||||
"F405", "F403",
|
||||
# lambda expression assignment
|
||||
"E731",
|
||||
# line too long, handled by black formatting
|
||||
"E501",
|
||||
# .strip() with multi-character strings
|
||||
"B005",
|
||||
# Loop control variable not used within loop body
|
||||
"B007",
|
||||
]
|
||||
@ -49,4 +50,8 @@ exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
|
||||
|
||||
[tool.codespell]
|
||||
ignore-words-list = "dout, te, indicies"
|
||||
skip = "./tests/prompts"
|
||||
skip = "./tests/prompts,./benchmarks/sonnet.txt"
|
||||
|
||||
[tool.isort]
|
||||
use_parentheses = true
|
||||
skip_gitignore = true
|
||||
|
@ -1,6 +1,7 @@
|
||||
# Should be mirrored in pyproject.toml
|
||||
cmake>=3.21
|
||||
ninja
|
||||
packaging
|
||||
setuptools>=49.4.0
|
||||
torch==2.1.2
|
||||
wheel
|
||||
wheel
|
||||
|
@ -4,6 +4,7 @@ toml==0.10.2
|
||||
tomli==2.0.1
|
||||
ruff==0.1.5
|
||||
codespell==2.2.6
|
||||
isort==5.13.2
|
||||
|
||||
# type checking
|
||||
mypy==0.991
|
||||
@ -16,8 +17,18 @@ pytest
|
||||
pytest-forked
|
||||
pytest-asyncio
|
||||
pytest-rerunfailures
|
||||
pytest-shard
|
||||
httpx
|
||||
einops # required for MPT
|
||||
openai
|
||||
requests
|
||||
ray
|
||||
peft
|
||||
awscli
|
||||
ai2-olmo # required for OLMo
|
||||
|
||||
# Benchmarking
|
||||
aiohttp
|
||||
|
||||
# Multimodal
|
||||
pillow
|
||||
|
@ -7,3 +7,6 @@ fastapi
|
||||
uvicorn[standard]
|
||||
pydantic >= 2.0 # Required for OpenAI server.
|
||||
prometheus_client >= 0.18.0
|
||||
requests
|
||||
psutil
|
||||
py-cpuinfo
|
@ -1,13 +1,17 @@
|
||||
cmake>=3.21
|
||||
ninja # For faster builds.
|
||||
typing-extensions>=4.8.0
|
||||
starlette
|
||||
requests
|
||||
py-cpuinfo
|
||||
psutil
|
||||
ray >= 2.9
|
||||
ray == 2.9.3
|
||||
sentencepiece # Required for LLaMA tokenizer.
|
||||
numpy
|
||||
tokenizers>=0.15.0
|
||||
transformers >= 4.38.0 # Required for Gemma.
|
||||
transformers >= 4.39.1 # Required for StarCoder2 & Llava.
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
pydantic >= 2.0 # Required for OpenAI server.
|
||||
prometheus_client >= 0.18.0
|
||||
outlines == 0.0.34
|
||||
|
@ -1,10 +1,14 @@
|
||||
cmake>=3.21
|
||||
ninja # For faster builds.
|
||||
psutil
|
||||
ray >= 2.9
|
||||
sentencepiece # Required for LLaMA tokenizer.
|
||||
numpy
|
||||
torch == 2.1.2
|
||||
transformers >= 4.38.0 # Required for Gemma.
|
||||
requests
|
||||
psutil
|
||||
py-cpuinfo
|
||||
transformers >= 4.39.1 # Required for StarCoder2 & Llava.
|
||||
xformers == 0.0.23.post1 # Required for CUDA 12.1.
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
@ -12,5 +16,5 @@ pydantic >= 2.0 # Required for OpenAI server.
|
||||
prometheus_client >= 0.18.0
|
||||
pynvml == 11.5.0
|
||||
triton >= 2.1.0
|
||||
outlines >= 0.0.27
|
||||
cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead.
|
||||
outlines == 0.0.34
|
||||
tiktoken == 0.6.0 # Required for DBRX tokenizer
|
||||
|
517
setup.py
@ -1,31 +1,191 @@
|
||||
import contextlib
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import List, Set
|
||||
import sys
|
||||
from shutil import which
|
||||
from typing import List
|
||||
|
||||
from packaging.version import parse, Version
|
||||
import setuptools
|
||||
import torch
|
||||
import torch.utils.cpp_extension as torch_cpp_ext
|
||||
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
|
||||
from packaging.version import Version, parse
|
||||
from setuptools import Extension, find_packages, setup
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from torch.utils.cpp_extension import CUDA_HOME
|
||||
|
||||
ROOT_DIR = os.path.dirname(__file__)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# If you are developing the C++ backend of vLLM, consider building vLLM with
|
||||
# `python setup.py develop` since it will give you incremental builds.
|
||||
# The downside is that this method is deprecated, see
|
||||
# https://github.com/pypa/setuptools/issues/917
|
||||
# vLLM only supports Linux platform
|
||||
assert sys.platform.startswith(
|
||||
"linux"), "vLLM only supports Linux platform (including WSL)."
|
||||
|
||||
MAIN_CUDA_VERSION = "12.1"
|
||||
|
||||
# Supported NVIDIA GPU architectures.
|
||||
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
|
||||
ROCM_SUPPORTED_ARCHS = {"gfx908", "gfx90a", "gfx942", "gfx1100"}
|
||||
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
|
||||
|
||||
def is_sccache_available() -> bool:
|
||||
return which("sccache") is not None
|
||||
|
||||
|
||||
def is_ccache_available() -> bool:
|
||||
return which("ccache") is not None
|
||||
|
||||
|
||||
def is_ninja_available() -> bool:
|
||||
return which("ninja") is not None
|
||||
|
||||
|
||||
def remove_prefix(text, prefix):
|
||||
if text.startswith(prefix):
|
||||
return text[len(prefix):]
|
||||
return text
|
||||
|
||||
|
||||
class CMakeExtension(Extension):
|
||||
|
||||
def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
|
||||
super().__init__(name, sources=[], **kwa)
|
||||
self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)
|
||||
|
||||
|
||||
class cmake_build_ext(build_ext):
|
||||
# A dict of extension directories that have been configured.
|
||||
did_config = {}
|
||||
|
||||
#
|
||||
# Determine number of compilation jobs and optionally nvcc compile threads.
|
||||
#
|
||||
def compute_num_jobs(self):
|
||||
# `num_jobs` is either the value of the MAX_JOBS environment variable
|
||||
# (if defined) or the number of CPUs available.
|
||||
num_jobs = os.environ.get("MAX_JOBS", None)
|
||||
if num_jobs is not None:
|
||||
num_jobs = int(num_jobs)
|
||||
logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")
|
||||
else:
|
||||
try:
|
||||
# os.sched_getaffinity() isn't universally available, so fall
|
||||
# back to os.cpu_count() if we get an error here.
|
||||
num_jobs = len(os.sched_getaffinity(0))
|
||||
except AttributeError:
|
||||
num_jobs = os.cpu_count()
|
||||
|
||||
nvcc_threads = None
|
||||
if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
|
||||
# `nvcc_threads` is either the value of the NVCC_THREADS
|
||||
# environment variable (if defined) or 1.
|
||||
# when it is set, we reduce `num_jobs` to avoid
|
||||
# overloading the system.
|
||||
nvcc_threads = os.getenv("NVCC_THREADS", None)
|
||||
if nvcc_threads is not None:
|
||||
nvcc_threads = int(nvcc_threads)
|
||||
logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number"
|
||||
" of nvcc threads.")
|
||||
else:
|
||||
nvcc_threads = 1
|
||||
num_jobs = max(1, num_jobs // nvcc_threads)
|
||||
|
||||
return num_jobs, nvcc_threads
|
||||
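The comments in compute_num_jobs above describe the two parallelism knobs: MAX_JOBS caps the number of parallel compile jobs (defaulting to the CPUs visible to the process), and on CUDA >= 11.2 NVCC_THREADS splits each nvcc invocation into several threads, so the job count is divided down to avoid oversubscribing the machine. A standalone sketch of just that arithmetic, illustrative only and not a drop-in replacement for the method:

import os

def plan_build_parallelism(cuda_at_least_11_2: bool = True):
    # MAX_JOBS wins if set; otherwise use the CPUs this process may run on.
    num_jobs = int(os.environ.get("MAX_JOBS", 0)) or (
        len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity")
        else os.cpu_count())
    nvcc_threads = None
    if cuda_at_least_11_2:
        nvcc_threads = int(os.environ.get("NVCC_THREADS", 1))
        # Each job now spawns nvcc_threads compiler threads, so shrink the
        # number of concurrent jobs to keep the total thread count roughly constant.
        num_jobs = max(1, num_jobs // nvcc_threads)
    return num_jobs, nvcc_threads

# Example: MAX_JOBS=16, NVCC_THREADS=4  ->  (4, 4), i.e. 4 jobs x 4 nvcc threads each.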
|
||||
#
|
||||
# Perform cmake configuration for a single extension.
|
||||
#
|
||||
def configure(self, ext: CMakeExtension) -> None:
|
||||
# If we've already configured using the CMakeLists.txt for
|
||||
# this extension, exit early.
|
||||
if ext.cmake_lists_dir in cmake_build_ext.did_config:
|
||||
return
|
||||
|
||||
cmake_build_ext.did_config[ext.cmake_lists_dir] = True
|
||||
|
||||
# Select the build type.
|
||||
# Note: optimization level + debug info are set by the build type
|
||||
default_cfg = "Debug" if self.debug else "RelWithDebInfo"
|
||||
cfg = os.getenv("CMAKE_BUILD_TYPE", default_cfg)
|
||||
|
||||
# where .so files will be written, should be the same for all extensions
|
||||
# that use the same CMakeLists.txt.
|
||||
outdir = os.path.abspath(
|
||||
os.path.dirname(self.get_ext_fullpath(ext.name)))
|
||||
|
||||
cmake_args = [
|
||||
'-DCMAKE_BUILD_TYPE={}'.format(cfg),
|
||||
'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}'.format(outdir),
|
||||
'-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY={}'.format(self.build_temp),
|
||||
]
|
||||
|
||||
verbose = bool(int(os.getenv('VERBOSE', '0')))
|
||||
if verbose:
|
||||
cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']
|
||||
|
||||
if is_sccache_available():
|
||||
cmake_args += [
|
||||
'-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
|
||||
'-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
|
||||
]
|
||||
elif is_ccache_available():
|
||||
cmake_args += [
|
||||
'-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
|
||||
'-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
|
||||
]
|
||||
|
||||
# Pass the python executable to cmake so it can find an exact
|
||||
# match.
|
||||
cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]
|
||||
|
||||
if _install_punica():
|
||||
cmake_args += ['-DVLLM_INSTALL_PUNICA_KERNELS=ON']
|
||||
|
||||
#
|
||||
# Setup parallelism and build tool
|
||||
#
|
||||
num_jobs, nvcc_threads = self.compute_num_jobs()
|
||||
|
||||
if nvcc_threads:
|
||||
cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)]
|
||||
|
||||
if is_ninja_available():
|
||||
build_tool = ['-G', 'Ninja']
|
||||
cmake_args += [
|
||||
'-DCMAKE_JOB_POOL_COMPILE:STRING=compile',
|
||||
'-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs),
|
||||
]
|
||||
else:
|
||||
# Default build tool to whatever cmake picks.
|
||||
build_tool = []
|
||||
|
||||
subprocess.check_call(
|
||||
['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
|
||||
cwd=self.build_temp)
|
||||
|
||||
def build_extensions(self) -> None:
|
||||
# Ensure that CMake is present and working
|
||||
try:
|
||||
subprocess.check_output(['cmake', '--version'])
|
||||
except OSError as e:
|
||||
raise RuntimeError('Cannot find CMake executable') from e
|
||||
|
||||
# Create build directory if it does not exist.
|
||||
if not os.path.exists(self.build_temp):
|
||||
os.makedirs(self.build_temp)
|
||||
|
||||
# Build all the extensions
|
||||
for ext in self.extensions:
|
||||
self.configure(ext)
|
||||
|
||||
ext_target_name = remove_prefix(ext.name, "vllm.")
|
||||
num_jobs, _ = self.compute_num_jobs()
|
||||
|
||||
build_args = [
|
||||
'--build', '.', '--target', ext_target_name, '-j',
|
||||
str(num_jobs)
|
||||
]
|
||||
|
||||
subprocess.check_call(['cmake', *build_args], cwd=self.build_temp)
|
||||
|
||||
|
||||
def _is_cuda() -> bool:
|
||||
return torch.version.cuda is not None and not _is_neuron()
|
||||
|
||||
|
||||
def _is_hip() -> bool:
|
||||
@ -36,36 +196,13 @@ def _is_neuron() -> bool:
|
||||
torch_neuronx_installed = True
|
||||
try:
|
||||
subprocess.run(["neuron-ls"], capture_output=True, check=True)
|
||||
except (FileNotFoundError, PermissionError):
|
||||
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
|
||||
torch_neuronx_installed = False
|
||||
return torch_neuronx_installed
|
||||
|
||||
|
||||
def _is_cuda() -> bool:
|
||||
return (torch.version.cuda is not None) and not _is_neuron()
|
||||
|
||||
|
||||
# Compiler flags.
|
||||
CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
|
||||
# TODO(woosuk): Should we use -O3?
|
||||
NVCC_FLAGS = ["-O2", "-std=c++17"]
|
||||
|
||||
if _is_hip():
|
||||
if ROCM_HOME is None:
|
||||
raise RuntimeError(
|
||||
"Cannot find ROCM_HOME. ROCm must be available to build the package."
|
||||
)
|
||||
NVCC_FLAGS += ["-DUSE_ROCM"]
|
||||
NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"]
|
||||
NVCC_FLAGS += ["-U__HIP_NO_HALF_OPERATORS__"]
|
||||
|
||||
if _is_cuda() and CUDA_HOME is None:
|
||||
raise RuntimeError(
|
||||
"Cannot find CUDA_HOME. CUDA must be available to build the package.")
|
||||
|
||||
ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
|
||||
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
|
||||
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
|
||||
def _install_punica() -> bool:
|
||||
return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))
|
||||
|
||||
|
||||
def get_hipcc_rocm_version():
|
||||
@ -90,11 +227,6 @@ def get_hipcc_rocm_version():
|
||||
return None
|
||||
|
||||
|
||||
def glob(pattern: str):
|
||||
root = Path(__name__).parent
|
||||
return [str(p) for p in root.glob(pattern)]
|
||||
|
||||
|
||||
def get_neuronxcc_version():
|
||||
import sysconfig
|
||||
site_dir = sysconfig.get_paths()["purelib"]
|
||||
@ -114,12 +246,12 @@ def get_neuronxcc_version():
|
||||
raise RuntimeError("Could not find HIP version in the output")
|
||||
|
||||
|
||||
def get_nvcc_cuda_version(cuda_dir: str) -> Version:
|
||||
def get_nvcc_cuda_version() -> Version:
|
||||
"""Get the CUDA version from nvcc.
|
||||
|
||||
Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
|
||||
"""
|
||||
nvcc_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
|
||||
nvcc_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"],
|
||||
universal_newlines=True)
|
||||
output = nvcc_output.split()
|
||||
release_idx = output.index("release") + 1
|
||||
@ -127,249 +259,6 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version:
|
||||
return nvcc_cuda_version
|
||||
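The rewritten get_nvcc_cuda_version() reads nvcc from CUDA_HOME directly and takes the token after "release" in the `nvcc -V` banner; the line that turns that token into a Version object falls outside the hunk shown. Below is a standalone sketch of that parsing against a captured banner; stripping the trailing comma with split(",") is my assumption about the omitted line, not something shown in this diff.

# Sketch: parse the "release" token out of a captured `nvcc -V` banner.
from packaging.version import Version, parse

sample = (
    "nvcc: NVIDIA (R) Cuda compiler driver\n"
    "Cuda compilation tools, release 12.1, V12.1.105\n"
)
tokens = sample.split()
release_idx = tokens.index("release") + 1
version = parse(tokens[release_idx].split(",")[0])  # drop the trailing comma (assumed)
assert version == Version("12.1")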
|
||||
|
||||
def get_pytorch_rocm_arch() -> Set[str]:
|
||||
"""Get the cross section of Pytorch,and vllm supported gfx arches
|
||||
|
||||
ROCM can get the supported gfx architectures in one of two ways
|
||||
Either through the PYTORCH_ROCM_ARCH env var, or output from
|
||||
rocm_agent_enumerator.
|
||||
|
||||
In either case we can generate a list of supported arch's and
|
||||
cross reference with VLLM's own ROCM_SUPPORTED_ARCHs.
|
||||
"""
|
||||
env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None)
|
||||
|
||||
# If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator
|
||||
if env_arch_list is None:
|
||||
command = "rocm_agent_enumerator"
|
||||
env_arch_list = subprocess.check_output([command]).decode('utf-8')\
|
||||
.strip().replace("\n", ";")
|
||||
arch_source_str = "rocm_agent_enumerator"
|
||||
else:
|
||||
arch_source_str = "PYTORCH_ROCM_ARCH env variable"
|
||||
|
||||
# List are separated by ; or space.
|
||||
pytorch_rocm_arch = set(env_arch_list.replace(" ", ";").split(";"))
|
||||
|
||||
# Filter out the invalid architectures and print a warning.
|
||||
arch_list = pytorch_rocm_arch.intersection(ROCM_SUPPORTED_ARCHS)
|
||||
|
||||
# If none of the specified architectures are valid, raise an error.
|
||||
if not arch_list:
|
||||
raise RuntimeError(
|
||||
f"None of the ROCM architectures in {arch_source_str} "
|
||||
f"({env_arch_list}) is supported. "
|
||||
f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.")
|
||||
invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS
|
||||
if invalid_arch_list:
|
||||
warnings.warn(
|
||||
f"Unsupported ROCM architectures ({invalid_arch_list}) are "
|
||||
f"excluded from the {arch_source_str} output "
|
||||
f"({env_arch_list}). Supported ROCM architectures are: "
|
||||
f"{ROCM_SUPPORTED_ARCHS}.",
|
||||
stacklevel=2)
|
||||
return arch_list
|
||||
|
||||
|
||||
def get_torch_arch_list() -> Set[str]:
|
||||
# TORCH_CUDA_ARCH_LIST can have one or more architectures,
|
||||
# e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
|
||||
# compiler to additionally include PTX code that can be runtime-compiled
|
||||
# and executed on the 8.6 or newer architectures. While the PTX code will
|
||||
# not give the best performance on the newer architectures, it provides
|
||||
# forward compatibility.
|
||||
env_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
|
||||
if env_arch_list is None:
|
||||
return set()
|
||||
|
||||
# List are separated by ; or space.
|
||||
torch_arch_list = set(env_arch_list.replace(" ", ";").split(";"))
|
||||
if not torch_arch_list:
|
||||
return set()
|
||||
|
||||
# Filter out the invalid architectures and print a warning.
|
||||
valid_archs = NVIDIA_SUPPORTED_ARCHS.union(
|
||||
{s + "+PTX"
|
||||
for s in NVIDIA_SUPPORTED_ARCHS})
|
||||
arch_list = torch_arch_list.intersection(valid_archs)
|
||||
# If none of the specified architectures are valid, raise an error.
|
||||
if not arch_list:
|
||||
raise RuntimeError(
|
||||
"None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env "
|
||||
f"variable ({env_arch_list}) is supported. "
|
||||
f"Supported CUDA architectures are: {valid_archs}.")
|
||||
invalid_arch_list = torch_arch_list - valid_archs
|
||||
if invalid_arch_list:
|
||||
warnings.warn(
|
||||
f"Unsupported CUDA architectures ({invalid_arch_list}) are "
|
||||
"excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
|
||||
f"({env_arch_list}). Supported CUDA architectures are: "
|
||||
f"{valid_archs}.",
|
||||
stacklevel=2)
|
||||
return arch_list
|
||||
|
||||
|
||||
if _is_hip():
|
||||
rocm_arches = get_pytorch_rocm_arch()
|
||||
NVCC_FLAGS += ["--offload-arch=" + arch for arch in rocm_arches]
|
||||
else:
|
||||
# First, check the TORCH_CUDA_ARCH_LIST environment variable.
|
||||
compute_capabilities = get_torch_arch_list()
|
||||
|
||||
if _is_cuda() and not compute_capabilities:
|
||||
# If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
|
||||
# GPUs on the current machine.
|
||||
device_count = torch.cuda.device_count()
|
||||
for i in range(device_count):
|
||||
major, minor = torch.cuda.get_device_capability(i)
|
||||
if major < 7:
|
||||
raise RuntimeError(
|
||||
"GPUs with compute capability below 7.0 are not supported.")
|
||||
compute_capabilities.add(f"{major}.{minor}")
|
||||
|
||||
ext_modules = []
|
||||
|
||||
if _is_cuda():
|
||||
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
|
||||
if not compute_capabilities:
|
||||
# If no GPU is specified nor available, add all supported architectures
|
||||
# based on the NVCC CUDA version.
|
||||
compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy()
|
||||
if nvcc_cuda_version < Version("11.1"):
|
||||
compute_capabilities.remove("8.6")
|
||||
if nvcc_cuda_version < Version("11.8"):
|
||||
compute_capabilities.remove("8.9")
|
||||
compute_capabilities.remove("9.0")
|
||||
# Validate the NVCC CUDA version.
|
||||
if nvcc_cuda_version < Version("11.0"):
|
||||
raise RuntimeError(
|
||||
"CUDA 11.0 or higher is required to build the package.")
|
||||
if (nvcc_cuda_version < Version("11.1")
|
||||
and any(cc.startswith("8.6") for cc in compute_capabilities)):
|
||||
raise RuntimeError(
|
||||
"CUDA 11.1 or higher is required for compute capability 8.6.")
|
||||
if nvcc_cuda_version < Version("11.8"):
|
||||
if any(cc.startswith("8.9") for cc in compute_capabilities):
|
||||
# CUDA 11.8 is required to generate the code targeting compute capability 8.9.
|
||||
# However, GPUs with compute capability 8.9 can also run the code generated by
|
||||
# the previous versions of CUDA 11 and targeting compute capability 8.0.
|
||||
# Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
|
||||
# instead of 8.9.
|
||||
warnings.warn(
|
||||
"CUDA 11.8 or higher is required for compute capability 8.9. "
|
||||
"Targeting compute capability 8.0 instead.",
|
||||
stacklevel=2)
|
||||
compute_capabilities = set(cc for cc in compute_capabilities
|
||||
if not cc.startswith("8.9"))
|
||||
compute_capabilities.add("8.0+PTX")
|
||||
if any(cc.startswith("9.0") for cc in compute_capabilities):
|
||||
raise RuntimeError(
|
||||
"CUDA 11.8 or higher is required for compute capability 9.0.")
|
||||
|
||||
NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy()
|
||||
|
||||
# Add target compute capabilities to NVCC flags.
|
||||
for capability in compute_capabilities:
|
||||
num = capability[0] + capability[2]
|
||||
NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
|
||||
if capability.endswith("+PTX"):
|
||||
NVCC_FLAGS += [
|
||||
"-gencode", f"arch=compute_{num},code=compute_{num}"
|
||||
]
|
||||
if int(capability[0]) >= 8:
|
||||
NVCC_FLAGS_PUNICA += [
|
||||
"-gencode", f"arch=compute_{num},code=sm_{num}"
|
||||
]
|
||||
if capability.endswith("+PTX"):
|
||||
NVCC_FLAGS_PUNICA += [
|
||||
"-gencode", f"arch=compute_{num},code=compute_{num}"
|
||||
]
|
||||
|
||||
# Use NVCC threads to parallelize the build.
|
||||
if nvcc_cuda_version >= Version("11.2"):
|
||||
nvcc_threads = int(os.getenv("NVCC_THREADS", 8))
|
||||
num_threads = min(os.cpu_count(), nvcc_threads)
|
||||
NVCC_FLAGS += ["--threads", str(num_threads)]
|
||||
|
||||
if nvcc_cuda_version >= Version("11.8"):
|
||||
NVCC_FLAGS += ["-DENABLE_FP8_E5M2"]
|
||||
|
||||
# changes for punica kernels
|
||||
NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS
|
||||
REMOVE_NVCC_FLAGS = [
|
||||
'-D__CUDA_NO_HALF_OPERATORS__',
|
||||
'-D__CUDA_NO_HALF_CONVERSIONS__',
|
||||
'-D__CUDA_NO_BFLOAT16_CONVERSIONS__',
|
||||
'-D__CUDA_NO_HALF2_OPERATORS__',
|
||||
]
|
||||
for flag in REMOVE_NVCC_FLAGS:
|
||||
with contextlib.suppress(ValueError):
|
||||
torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag)
|
||||
|
||||
install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))
|
||||
device_count = torch.cuda.device_count()
|
||||
for i in range(device_count):
|
||||
major, minor = torch.cuda.get_device_capability(i)
|
||||
if major < 8:
|
||||
install_punica = False
|
||||
break
|
||||
if install_punica:
|
||||
ext_modules.append(
|
||||
CUDAExtension(
|
||||
name="vllm._punica_C",
|
||||
sources=["csrc/punica/punica_ops.cc"] +
|
||||
glob("csrc/punica/bgmv/*.cu"),
|
||||
extra_compile_args={
|
||||
"cxx": CXX_FLAGS,
|
||||
"nvcc": NVCC_FLAGS_PUNICA,
|
||||
},
|
||||
))
|
||||
elif _is_neuron():
|
||||
neuronxcc_version = get_neuronxcc_version()
|
||||
|
||||
vllm_extension_sources = [
|
||||
"csrc/cache_kernels.cu",
|
||||
"csrc/attention/attention_kernels.cu",
|
||||
"csrc/pos_encoding_kernels.cu",
|
||||
"csrc/activation_kernels.cu",
|
||||
"csrc/layernorm_kernels.cu",
|
||||
"csrc/quantization/squeezellm/quant_cuda_kernel.cu",
|
||||
"csrc/quantization/gptq/q_gemm.cu",
|
||||
"csrc/cuda_utils_kernels.cu",
|
||||
"csrc/moe_align_block_size_kernels.cu",
|
||||
"csrc/pybind.cpp",
|
||||
]
|
||||
|
||||
if _is_cuda():
|
||||
vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
|
||||
vllm_extension_sources.append(
|
||||
"csrc/quantization/marlin/marlin_cuda_kernel.cu")
|
||||
vllm_extension_sources.append("csrc/custom_all_reduce.cu")
|
||||
|
||||
# Add MoE kernels.
|
||||
ext_modules.append(
|
||||
CUDAExtension(
|
||||
name="vllm._moe_C",
|
||||
sources=glob("csrc/moe/*.cu") + glob("csrc/moe/*.cpp"),
|
||||
extra_compile_args={
|
||||
"cxx": CXX_FLAGS,
|
||||
"nvcc": NVCC_FLAGS,
|
||||
},
|
||||
))
|
||||
|
||||
if not _is_neuron():
|
||||
vllm_extension = CUDAExtension(
|
||||
name="vllm._C",
|
||||
sources=vllm_extension_sources,
|
||||
extra_compile_args={
|
||||
"cxx": CXX_FLAGS,
|
||||
"nvcc": NVCC_FLAGS,
|
||||
},
|
||||
libraries=["cuda"] if _is_cuda() else [],
|
||||
)
|
||||
ext_modules.append(vllm_extension)
|
||||
|
||||
|
||||
def get_path(*filepath) -> str:
|
||||
return os.path.join(ROOT_DIR, *filepath)
|
||||
|
||||
@ -390,7 +279,12 @@ def find_version(filepath: str) -> str:
|
||||
def get_vllm_version() -> str:
|
||||
version = find_version(get_path("vllm", "__init__.py"))
|
||||
|
||||
if _is_hip():
|
||||
if _is_cuda():
|
||||
cuda_version = str(get_nvcc_cuda_version())
|
||||
if cuda_version != MAIN_CUDA_VERSION:
|
||||
cuda_version_str = cuda_version.replace(".", "")[:3]
|
||||
version += f"+cu{cuda_version_str}"
|
||||
elif _is_hip():
|
||||
# Get the HIP version
|
||||
hipcc_version = get_hipcc_rocm_version()
|
||||
if hipcc_version != MAIN_CUDA_VERSION:
|
||||
@ -398,15 +292,12 @@ def get_vllm_version() -> str:
|
||||
version += f"+rocm{rocm_version_str}"
|
||||
elif _is_neuron():
|
||||
# Get the Neuron version
|
||||
neuron_version = str(neuronxcc_version)
|
||||
neuron_version = str(get_neuronxcc_version())
|
||||
if neuron_version != MAIN_CUDA_VERSION:
|
||||
neuron_version_str = neuron_version.replace(".", "")[:3]
|
||||
version += f"+neuron{neuron_version_str}"
|
||||
else:
|
||||
cuda_version = str(nvcc_cuda_version)
|
||||
if cuda_version != MAIN_CUDA_VERSION:
|
||||
cuda_version_str = cuda_version.replace(".", "")[:3]
|
||||
version += f"+cu{cuda_version_str}"
|
||||
raise RuntimeError("Unknown runtime environment")
|
||||
|
||||
return version
|
||||
|
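get_vllm_version() appends a platform suffix whenever the detected toolchain differs from MAIN_CUDA_VERSION ("12.1"): the dotted version string is stripped of dots and truncated to three characters before being attached as +cuXXX, +rocmXXX, or +neuronXXX. A small illustration of that string manipulation, using made-up version numbers:

# Illustration of the local-version suffix logic above (values are examples).
MAIN_CUDA_VERSION = "12.1"

def cuda_suffix(cuda_version: str) -> str:
    if cuda_version == MAIN_CUDA_VERSION:
        return ""                       # wheels built against the main CUDA get no suffix
    return "+cu" + cuda_version.replace(".", "")[:3]

assert cuda_suffix("12.1") == ""
assert cuda_suffix("11.8") == "+cu118"
assert cuda_suffix("12.2") == "+cu122"
# e.g. a "0.4.0" build compiled with CUDA 11.8 would be tagged "0.4.0+cu118"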
||||
@ -422,26 +313,40 @@ def read_readme() -> str:
|
||||
|
||||
def get_requirements() -> List[str]:
|
||||
"""Get Python package dependencies from requirements.txt."""
|
||||
if _is_hip():
|
||||
if _is_cuda():
|
||||
with open(get_path("requirements.txt")) as f:
|
||||
requirements = f.read().strip().split("\n")
|
||||
elif _is_hip():
|
||||
with open(get_path("requirements-rocm.txt")) as f:
|
||||
requirements = f.read().strip().split("\n")
|
||||
elif _is_neuron():
|
||||
with open(get_path("requirements-neuron.txt")) as f:
|
||||
requirements = f.read().strip().split("\n")
|
||||
else:
|
||||
with open(get_path("requirements.txt")) as f:
|
||||
requirements = f.read().strip().split("\n")
|
||||
raise ValueError(
|
||||
"Unsupported platform, please use CUDA, ROCM or Neuron.")
|
||||
|
||||
return requirements
|
||||
|
||||
|
||||
ext_modules = []
|
||||
|
||||
if _is_cuda():
|
||||
ext_modules.append(CMakeExtension(name="vllm._moe_C"))
|
||||
|
||||
if _install_punica():
|
||||
ext_modules.append(CMakeExtension(name="vllm._punica_C"))
|
||||
|
||||
if not _is_neuron():
|
||||
ext_modules.append(CMakeExtension(name="vllm._C"))
|
||||
|
||||
package_data = {
|
||||
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
|
||||
}
|
||||
if os.environ.get("VLLM_USE_PRECOMPILED"):
|
||||
ext_modules = []
|
||||
package_data["vllm"].append("*.so")
|
||||
|
||||
setuptools.setup(
|
||||
setup(
|
||||
name="vllm",
|
||||
version=get_vllm_version(),
|
||||
author="vLLM Team",
|
||||
@ -463,11 +368,11 @@ setuptools.setup(
|
||||
"License :: OSI Approved :: Apache Software License",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
],
|
||||
packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
|
||||
"examples", "tests")),
|
||||
packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples",
|
||||
"tests")),
|
||||
python_requires=">=3.8",
|
||||
install_requires=get_requirements(),
|
||||
ext_modules=ext_modules,
|
||||
cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {},
|
||||
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
|
||||
package_data=package_data,
|
||||
)
|
||||
|
@ -25,23 +25,21 @@ def _query_server_long(prompt: str) -> dict:
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def api_server():
|
||||
def api_server(tokenizer_pool_size: int):
|
||||
script_path = Path(__file__).parent.joinpath(
|
||||
"api_server_async_engine.py").absolute()
|
||||
uvicorn_process = subprocess.Popen([
|
||||
sys.executable,
|
||||
"-u",
|
||||
str(script_path),
|
||||
"--model",
|
||||
"facebook/opt-125m",
|
||||
"--host",
|
||||
"127.0.0.1",
|
||||
sys.executable, "-u",
|
||||
str(script_path), "--model", "facebook/opt-125m", "--host",
|
||||
"127.0.0.1", "--tokenizer-pool-size",
|
||||
str(tokenizer_pool_size)
|
||||
])
|
||||
yield
|
||||
uvicorn_process.terminate()
|
||||
|
||||
|
||||
def test_api_server(api_server):
|
||||
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
|
||||
def test_api_server(api_server, tokenizer_pool_size: int):
|
||||
"""
|
||||
Run the API server and test it.
|
||||
|
||||
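The api_server fixture now takes tokenizer_pool_size as an argument, and pytest resolves that name from the test's @pytest.mark.parametrize values, so the server subprocess is launched once per parameter. A minimal, self-contained sketch of the same pattern; the names here are hypothetical and a real fixture would start a subprocess and yield its address instead of returning a string:

import pytest

@pytest.fixture
def server_url(pool_size: int) -> str:
    # `pool_size` is supplied by the parametrize mark on the test below.
    return f"http://127.0.0.1:8000/?pool={pool_size}"

@pytest.mark.parametrize("pool_size", [0, 2])
def test_server(server_url: str, pool_size: int):
    assert str(pool_size) in server_url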
|
@ -25,12 +25,8 @@ class MockEngine:
|
||||
return [RequestOutput(
|
||||
request_id=self.request_id)] if self.request_id else []
|
||||
|
||||
async def encode_request_async(
|
||||
self,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
return [1]
|
||||
async def encode_request_async(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def generate(self, request_id):
|
||||
self.request_id = request_id
|
||||
@ -43,13 +39,16 @@ class MockEngine:
|
||||
self.add_request_calls += 1
|
||||
|
||||
async def add_request_async(self, **kwargs):
|
||||
del kwargs # Unused
|
||||
self.add_request_calls += 1
|
||||
return
|
||||
|
||||
def abort_request(self, request_id):
|
||||
del request_id # Unused
|
||||
self.abort_request_calls += 1
|
||||
|
||||
def has_unfinished_requests(self):
|
||||
return self.request_id is not None
|
||||
|
||||
|
||||
class MockAsyncLLMEngine(AsyncLLMEngine):
|
||||
|
||||
@ -72,20 +71,24 @@ async def test_new_requests_event():
|
||||
await engine.add_request("2", "", None)
|
||||
engine.engine.generate("2")
|
||||
await asyncio.sleep(0)
|
||||
await asyncio.sleep(0)
|
||||
assert engine.engine.add_request_calls == 2
|
||||
assert engine.engine.step_calls == 2
|
||||
await asyncio.sleep(0)
|
||||
assert engine.engine.step_calls == 3
|
||||
assert engine.engine.step_calls >= 2
|
||||
await asyncio.sleep(0.001)
|
||||
assert engine.engine.step_calls >= 3
|
||||
engine.engine.stop_generating()
|
||||
await asyncio.sleep(0)
|
||||
assert engine.engine.step_calls == 4
|
||||
await asyncio.sleep(0)
|
||||
assert engine.engine.step_calls == 4
|
||||
await asyncio.sleep(0.001)
|
||||
old_step_calls = engine.engine.step_calls
|
||||
await asyncio.sleep(0.001)
|
||||
assert engine.engine.step_calls == old_step_calls
|
||||
|
||||
await engine.add_request("3", "", None)
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.add_request_calls == 3
|
||||
assert engine.engine.step_calls == 5
|
||||
assert engine.engine.step_calls == old_step_calls + 1
|
||||
await asyncio.sleep(0.01)
|
||||
assert engine.engine.add_request_calls == 3
|
||||
assert engine.engine.step_calls == 5
|
||||
assert engine.engine.step_calls == old_step_calls + 1
|
||||
|
||||
engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True)
|
||||
assert engine.get_tokenizer() is not None
|
||||
|
@ -1,12 +1,12 @@
|
||||
from dataclasses import dataclass
|
||||
import os
|
||||
import pathlib
|
||||
from dataclasses import dataclass
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
|
||||
__file__))).parent.parent / "examples/template_chatml.jinja"
|
||||
@ -73,7 +73,7 @@ def test_load_chat_template():
|
||||
assert template_content is not None
|
||||
# Hard coded value for template_chatml.jinja
|
||||
assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
|
||||
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
|
||||
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501
|
||||
|
||||
|
||||
def test_no_load_chat_template():
|
||||
@ -117,4 +117,6 @@ async def test_get_gen_prompt(model, template, add_generation_prompt,
|
||||
add_generation_prompt=mock_request.add_generation_prompt)
|
||||
|
||||
# Test assertion
|
||||
assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}"
|
||||
assert result == expected_output, (
|
||||
f"The generated prompt does not match the expected output for "
|
||||
f"model {model} and template {template}")
|
||||
|
@ -4,25 +4,14 @@ from vllm.engine.async_llm_engine import RequestTracker
|
||||
from vllm.outputs import RequestOutput
|
||||
|
||||
|
||||
class DummyEvent:
|
||||
|
||||
def __init__(self):
|
||||
self.flag = False
|
||||
|
||||
def set(self):
|
||||
self.flag = True
|
||||
|
||||
def clear(self):
|
||||
self.flag = False
|
||||
|
||||
|
||||
def test_request_tracker():
|
||||
@pytest.mark.asyncio
|
||||
async def test_request_tracker():
|
||||
tracker = RequestTracker()
|
||||
tracker.new_requests_event = DummyEvent()
|
||||
stream_1 = tracker.add_request("1")
|
||||
assert tracker.new_requests_event.flag
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, finished = tracker.get_new_and_finished_requests()
|
||||
assert not tracker.new_requests_event.flag
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert len(new) == 1
|
||||
assert new[0]["request_id"] == "1"
|
||||
assert not finished
|
||||
@ -30,9 +19,10 @@ def test_request_tracker():
|
||||
|
||||
stream_2 = tracker.add_request("2")
|
||||
stream_3 = tracker.add_request("3")
|
||||
assert tracker.new_requests_event.flag
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, finished = tracker.get_new_and_finished_requests()
|
||||
assert not tracker.new_requests_event.flag
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert len(new) == 2
|
||||
assert new[0]["request_id"] == "2"
|
||||
assert new[1]["request_id"] == "3"
|
||||
@ -43,7 +33,7 @@ def test_request_tracker():
|
||||
# request_ids must be unique
|
||||
with pytest.raises(KeyError):
|
||||
tracker.add_request("1")
|
||||
assert not tracker.new_requests_event.flag
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
|
||||
tracker.abort_request("1")
|
||||
new, finished = tracker.get_new_and_finished_requests()
|
||||
@ -54,7 +44,8 @@ def test_request_tracker():
|
||||
|
||||
stream_4 = tracker.add_request("4")
|
||||
tracker.abort_request("4")
|
||||
assert tracker.new_requests_event.flag
|
||||
assert tracker.new_requests_event.is_set()
|
||||
await tracker.wait_for_new_requests()
|
||||
new, finished = tracker.get_new_and_finished_requests()
|
||||
assert len(finished) == 1
|
||||
assert "4" in finished
|
||||
@ -62,11 +53,12 @@ def test_request_tracker():
|
||||
assert stream_4.finished
|
||||
|
||||
stream_5 = tracker.add_request("5")
|
||||
assert tracker.new_requests_event.flag
|
||||
assert tracker.new_requests_event.is_set()
|
||||
tracker.process_request_output(
|
||||
RequestOutput("2", "output", [], [], [], bool(finished)))
|
||||
RequestOutput("2", "output", [], [], [], finished=True))
|
||||
await tracker.wait_for_new_requests()
|
||||
new, finished = tracker.get_new_and_finished_requests()
|
||||
assert not tracker.new_requests_event.flag
|
||||
assert not tracker.new_requests_event.is_set()
|
||||
assert len(finished) == 1
|
||||
assert "2" in finished
|
||||
assert len(new) == 1
|
||||
|
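The tracker test above was rewritten as an asyncio coroutine because new_requests_event is now a real asyncio.Event rather than the old DummyEvent stub, so the test checks is_set() and can genuinely await new requests. For reference, these are the Event semantics the assertions rely on (plain asyncio, not vLLM code):

import asyncio

async def main() -> None:
    event = asyncio.Event()
    assert not event.is_set()

    async def waiter() -> str:
        await event.wait()          # blocks until set() is called
        return "woke up"

    task = asyncio.create_task(waiter())
    event.set()                     # wake every waiter
    assert event.is_set()
    print(await task)               # -> "woke up"

    event.clear()                   # re-arm the event for the next round
    assert not event.is_set()

asyncio.run(main())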
@ -1,6 +1,6 @@
|
||||
"""Compare the short outputs of HF and vLLM when using greedy sampling.
|
||||
|
||||
Run `pytest tests/basic_correctness/test_basic_correctness.py --forked`.
|
||||
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
|
||||
"""
|
||||
import pytest
|
||||
|
||||
@ -13,6 +13,7 @@ MODELS = [
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [5])
|
||||
@pytest.mark.parametrize("enforce_eager", [False, True])
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
@ -20,12 +21,13 @@ def test_models(
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
enforce_eager: bool,
|
||||
) -> None:
|
||||
hf_model = hf_runner(model, dtype=dtype)
|
||||
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
|
||||
del hf_model
|
||||
|
||||
vllm_model = vllm_runner(model, dtype=dtype)
|
||||
vllm_model = vllm_runner(model, dtype=dtype, enforce_eager=enforce_eager)
|
||||
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
del vllm_model
|
||||
|
||||
|
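The new enforce_eager parametrization runs the same greedy-equivalence check both with and without CUDA-graph capture. Outside the test harness the flag is passed straight through to the LLM constructor, as the vllm_runner call above does; a minimal sketch with an example model:

# Minimal sketch of the flag exercised by the new parametrization.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enforce_eager=True)   # skip CUDA graph capture
params = SamplingParams(temperature=0.0, max_tokens=5)     # greedy, 5 new tokens
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)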
@ -1,17 +1,45 @@
|
||||
import contextlib
|
||||
import gc
|
||||
import os
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM
|
||||
from PIL import Image
|
||||
from transformers import (AutoModelForCausalLM, AutoProcessor,
|
||||
LlavaForConditionalGeneration)
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
|
||||
from vllm.model_executor.parallel_utils.parallel_state import (
|
||||
destroy_model_parallel)
|
||||
from vllm.sequence import MultiModalData
|
||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||
|
||||
_TEST_DIR = os.path.dirname(__file__)
|
||||
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
|
||||
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
|
||||
|
||||
# Multi modal related
|
||||
_PIXEL_VALUES_FILES = [
|
||||
os.path.join(_TEST_DIR, "images", filename) for filename in
|
||||
["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
|
||||
]
|
||||
_IMAGE_FEATURES_FILES = [
|
||||
os.path.join(_TEST_DIR, "images", filename) for filename in
|
||||
["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
|
||||
]
|
||||
_IMAGE_FILES = [
|
||||
os.path.join(_TEST_DIR, "images", filename)
|
||||
for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
|
||||
]
|
||||
_IMAGE_PROMPTS = [
|
||||
"<image>\nUSER: What's the content of the image?\nASSISTANT:",
|
||||
"<image>\nUSER: What is the season?\nASSISTANT:"
|
||||
]
|
||||
assert len(_PIXEL_VALUES_FILES) == len(_IMAGE_FEATURES_FILES) == len(
|
||||
_IMAGE_FILES) == len(_IMAGE_PROMPTS)
|
||||
|
||||
|
||||
def _read_prompts(filename: str) -> List[str]:
|
||||
with open(filename, "r") as f:
|
||||
@ -19,6 +47,53 @@ def _read_prompts(filename: str) -> List[str]:
|
||||
return prompts
|
||||
|
||||
|
||||
def cleanup():
|
||||
destroy_model_parallel()
|
||||
with contextlib.suppress(AssertionError):
|
||||
torch.distributed.destroy_process_group()
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def cleanup_fixture():
|
||||
yield
|
||||
cleanup()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def hf_image_prompts() -> List[str]:
|
||||
return _IMAGE_PROMPTS
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def hf_images() -> List[Image.Image]:
|
||||
return [Image.open(filename) for filename in _IMAGE_FILES]
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def vllm_images(request) -> "torch.Tensor":
|
||||
vision_language_config = request.getfixturevalue("model_and_config")[1]
|
||||
all_images = []
|
||||
if vision_language_config.image_input_type == (
|
||||
VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
|
||||
filenames = _IMAGE_FEATURES_FILES
|
||||
else:
|
||||
filenames = _PIXEL_VALUES_FILES
|
||||
for filename in filenames:
|
||||
all_images.append(torch.load(filename))
|
||||
return torch.concat(all_images, dim=0)
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def vllm_image_prompts(request) -> List[str]:
|
||||
vision_language_config = request.getfixturevalue("model_and_config")[1]
|
||||
return [
|
||||
"<image>" * (vision_language_config.image_feature_size - 1) + p
|
||||
for p in _IMAGE_PROMPTS
|
||||
]
|
||||
|
||||
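vllm_image_prompts pads each prompt so the number of <image> placeholder tokens matches the model's image_feature_size: the prompt text already contains one <image>, and image_feature_size - 1 more are prepended. A tiny illustration with a made-up feature size:

# Illustration only: image_feature_size=4 is a made-up value.
IMAGE_FEATURE_SIZE = 4
prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"

padded = "<image>" * (IMAGE_FEATURE_SIZE - 1) + prompt
assert padded.count("<image>") == IMAGE_FEATURE_SIZE
# -> "<image><image><image><image>\nUSER: What's the content of the image?\nASSISTANT:"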
|
||||
@pytest.fixture
|
||||
def example_prompts() -> List[str]:
|
||||
prompts = []
|
||||
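Note how vllm_image_prompts pads the prompt: each test prompt already contains one "<image>" marker, so prepending image_feature_size - 1 more yields exactly image_feature_size placeholder tokens per image, matching the number of vision embeddings the model injects. A tiny illustration with an assumed feature size (the real value comes from the VisionLanguageConfig under test):

image_feature_size = 576  # assumed: 24x24 vision tokens for LLaVA-1.5 at 336px
prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
expanded = "<image>" * (image_feature_size - 1) + prompt
assert expanded.count("<image>") == image_feature_size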
@@ -41,6 +116,10 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
    "float": torch.float,
}

_VISION_LANGUAGE_MODELS = {
    "llava-hf/llava-1.5-7b-hf": LlavaForConditionalGeneration,
}


class HfRunner:
@@ -52,11 +131,24 @@ class HfRunner:
    ) -> None:
        assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        ).cuda()
        self.model_name = model_name
        if model_name not in _VISION_LANGUAGE_MODELS:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            ).cuda()
            self.processor = None
        else:
            self.model = _VISION_LANGUAGE_MODELS[model_name].from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                trust_remote_code=True,
            ).cuda()
            self.processor = AutoProcessor.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
            )
        if tokenizer_name is None:
            tokenizer_name = model_name
        self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True)
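A quick usage sketch of the branching above; the text-only model name is an arbitrary example, not taken from this diff:

# Text-only checkpoint: plain AutoModelForCausalLM, no processor.
hf_text = HfRunner("facebook/opt-125m", dtype="half")
assert hf_text.processor is None

# Vision-language checkpoint: LlavaForConditionalGeneration plus AutoProcessor,
# which bundles the tokenizer with the image processor.
hf_vlm = HfRunner("llava-hf/llava-1.5-7b-hf", dtype="half")
assert hf_vlm.processor is not None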
@@ -64,13 +156,28 @@ class HfRunner:
    def generate(
        self,
        prompts: List[str],
        images: Optional[List[Image.Image]] = None,
        **kwargs,
    ) -> List[Tuple[List[int], str]]:
        outputs: List[Tuple[List[int], str]] = []
        for prompt in prompts:
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
        if images:
            assert len(prompts) == len(images)
        for i, prompt in enumerate(prompts):
            if self.model_name not in _VISION_LANGUAGE_MODELS:
                input_ids = self.tokenizer(prompt,
                                           return_tensors="pt").input_ids
                inputs = {"input_ids": input_ids.cuda()}
            else:
                image = images[i] if images else None
                inputs = self.processor(text=prompt,
                                        images=image,
                                        return_tensors="pt")
                inputs = {
                    key: value.cuda() if value is not None else None
                    for key, value in inputs.items()
                }
            output_ids = self.model.generate(
                input_ids.cuda(),
                **inputs,
                use_cache=True,
                **kwargs,
            )
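For orientation, the processor branch relies on the HF processor returning a dict-like BatchFeature whose tensors are then moved to the GPU wholesale. A small sketch, assuming the llava-hf checkpoint listed above and an image path relative to the test directory:

from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
inputs = processor(text="<image>\nUSER: What is the season?\nASSISTANT:",
                   images=Image.open("images/cherry_blossom.jpg"),
                   return_tensors="pt")
print(sorted(inputs.keys()))  # typically input_ids, attention_mask, pixel_values
# The runner then moves every tensor onto the GPU before calling generate():
inputs = {k: v.cuda() if v is not None else None for k, v in inputs.items()}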
@@ -87,10 +194,12 @@ class HfRunner:
        self,
        prompts: List[str],
        max_tokens: int,
        images: Optional["torch.Tensor"] = None,
    ) -> List[Tuple[List[int], str]]:
        outputs = self.generate(prompts,
                                do_sample=False,
                                max_new_tokens=max_tokens)
                                max_new_tokens=max_tokens,
                                images=images)
        for i in range(len(outputs)):
            output_ids, output_str = outputs[i]
            outputs[i] = (output_ids[0], output_str[0])
@@ -150,6 +259,10 @@ class HfRunner:
            all_logprobs.append(seq_logprobs)
        return all_logprobs

    def __del__(self):
        del self.model
        cleanup()


@pytest.fixture
def hf_runner():
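The __del__ + cleanup() pairing is what lets parametrized tests release GPU memory between cases; a hypothetical consumer looks like this (model name chosen only for illustration):

def test_greedy_matches(hf_runner, example_prompts):
    # Instantiate inside the test so __del__ -> cleanup() runs between params.
    hf_model = hf_runner("facebook/opt-125m", dtype="half")
    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens=32)
    del hf_model  # destroys the process group, runs gc, empties the CUDA cache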
@@ -162,9 +275,14 @@ class VllmRunner:
        self,
        model_name: str,
        tokenizer_name: Optional[str] = None,
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
        max_model_len=1024,
        dtype: str = "half",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        block_size: int = 16,
        enable_chunked_prefill: bool = False,
        **kwargs,
    ) -> None:
        self.model = LLM(
@@ -175,6 +293,9 @@ class VllmRunner:
            swap_space=0,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
            **kwargs,
        )
@@ -182,9 +303,16 @@ class VllmRunner:
        self,
        prompts: List[str],
        sampling_params: SamplingParams,
        images: Optional["torch.Tensor"] = None,
    ) -> List[Tuple[List[int], str]]:
        req_outputs = self.model.generate(prompts,
                                          sampling_params=sampling_params)
        if images is not None:
            assert len(prompts) == images.shape[0]
        req_outputs = self.model.generate(
            prompts,
            sampling_params=sampling_params,
            multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE,
                                            data=images)
            if images is not None else None)
        outputs = []
        for req_output in req_outputs:
            prompt_str = req_output.prompt
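The multi-modal branch above is what threads the stacked image tensor into LLM.generate via MultiModalData. A standalone sketch of the same call path follows; the engine kwargs mirror how the vision-language configuration is surfaced around this commit and should be read as assumptions, not as part of the diff:

import torch
from vllm import LLM, SamplingParams
from vllm.sequence import MultiModalData

llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    image_input_type="pixel_values",   # assumed kwarg names for VisionLanguageConfig
    image_token_id=32000,
    image_input_shape="1,3,336,336",
    image_feature_size=576,
)
pixel_values = torch.load("images/stop_sign_pixel_values.pt")
# The total number of "<image>" placeholders must equal image_feature_size.
prompt = "<image>" * 576 + "\nUSER: What's the content of the image?\nASSISTANT:"
outputs = llm.generate(
    prompt,
    sampling_params=SamplingParams(temperature=0.0, max_tokens=64),
    multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE,
                                    data=pixel_values),
)
print(outputs[0].outputs[0].text)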
@@ -221,9 +349,10 @@ class VllmRunner:
        self,
        prompts: List[str],
        max_tokens: int,
        images: Optional[torch.Tensor] = None,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params)
        outputs = self.generate(prompts, greedy_params, images=images)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]
@@ -254,7 +383,21 @@ class VllmRunner:
        outputs = self.generate(prompts, beam_search_params)
        return outputs

    def __del__(self):
        del self.model
        cleanup()


@pytest.fixture
def vllm_runner():
    return VllmRunner


def get_tokenizer_pool_config(tokenizer_group_type):
    if tokenizer_group_type is None:
        return None
    if tokenizer_group_type == "ray":
        return TokenizerPoolConfig(pool_size=1,
                                   pool_type="ray",
                                   extra_config={})
    raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}")
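get_tokenizer_pool_config is meant to let tokenizer-group tests be parametrized over pool types; a hypothetical consumer (test body elided) might look like:

import pytest

@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"])
def test_tokenizer_group(tokenizer_group_type):
    # None -> no pool; "ray" -> a single-worker Ray tokenizer pool.
    pool_config = get_tokenizer_pool_config(tokenizer_group_type)
    ...  # build the tokenizer group from pool_config and exercise it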