Compare commits

...

362 Commits

Author SHA1 Message Date
a7b809e0f0 Merge remote-tracking branch 'upstream/main' into benchmark-output 2025-04-23 14:55:50 +00:00
7efc568418 Update convert_to_csv.py 2025-04-23 10:51:38 -04:00
8e630d680e Improve Transformers backend model loading QoL (#17039)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-23 07:33:51 -07:00
af869f6dff [CI] Update structured-output label automation (#17055)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-23 07:33:14 -07:00
53c0fa1e25 Ensure that pid passed to kill_process_tree is int for mypy (#17051)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-23 07:32:26 -07:00
f7912cba3d [Doc] Add top anchor and a note to quantization/bitblas.md (#17042)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-23 07:32:16 -07:00
6317a5174a Categorize tests/kernels/ based on kernel type (#16799)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-23 09:21:07 -04:00
aa72d9a4ea Mistral-format support for compressed-tensors (#16803)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-23 08:46:23 -04:00
ce17db8085 [CI] Run v1/test_serial_utils.py in CI (#16996)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-23 01:13:34 -07:00
8c87a9ad46 [Bugfix] Fix AssertionError: skip_special_tokens=False is not supported for Mistral tokenizers (#16964)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-23 07:24:09 +00:00
ec69124eb4 [Misc] Improve readability of get_open_port function. (#17024)
Signed-off-by: gitover22 <qidizou88@gmail.com>
2025-04-23 06:16:53 +00:00
d0da99fb70 [BugFix] llama4 fa3 fix - RuntimeError: scheduler_metadata must have shape (metadata_size) (#16998)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-22 21:49:24 -07:00
b2f195c429 [V1] Avoid socket errors during shutdown when requests are in-flight (#16807)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-23 12:36:29 +08:00
047797ef90 [Bugfix] Triton FA function takes no keyword arguments (#16902)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-04-22 21:35:24 -07:00
eb8ef4224d [doc] add download path tips (#17013)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-23 04:06:30 +00:00
56a735261c [INTEL-HPU][v0] Port delayed sampling to upstream (#16949)
Signed-off-by: Michal Adamczyk <michal.adamczyk@intel.com>
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
Co-authored-by: Michal Adamczyk <madamczyk@habana.ai>
2025-04-22 20:14:11 -07:00
e1cf90e099 [misc] tune some env vars for GB200 (#16992)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-04-23 10:59:48 +08:00
6bc1e30ef9 Revert "[Misc] Add S3 environment variables for better support of MinIO." (#17021) 2025-04-22 19:22:29 -07:00
7e081ba7ca [BugFix] Revert ROCm Custom Paged Attention Env Flag Check (#17022)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-04-22 19:17:48 -07:00
1e013fa388 [V1][DP] More robust DP/EP dummy request coordination (#16277)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-22 19:12:15 -07:00
bc7c4d206b [Kernel][ROCM] Upstream prefix prefill speed up for vLLM V1 (#13305)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: root <root@banff-cyxtera-s73-5.ctr.dcgpu>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Signed-off-by: root <root@banff-cyxtera-s65-4.amd.com>
Signed-off-by: maleksan85 <maleksan@amd.com>
Signed-off-by: <>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: root <root@banff-cyxtera-s73-5.ctr.dcgpu>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: qli88 <qiang.li2@amd.com>
Co-authored-by: root <root@banff-cyxtera-s65-4.amd.com>
2025-04-22 19:11:56 -07:00
f67e9e9f22 add Dockerfile build vllm against torch nightly (#16936)
Signed-off-by: Yang Wang <elainewy@meta.com>
2025-04-22 19:08:27 -07:00
36fe78769f [Bugfix] validate urls object for multimodal content parts (#16990)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-23 09:43:06 +08:00
83d933718c [Core][V1][TPU] Enable structured decoding on TPU V1 (#16499)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-04-22 18:05:23 -06:00
5175b884f7 [BugFix] Remove default multiproc executor collective_rpc timeout (#17000)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-22 23:27:14 +00:00
5536b30a4c Fencing Kernels Tests for enabling on AMD (#16929)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-04-22 09:32:40 -07:00
7f58fb9718 Add assertion for no objects while hashing hf_config (#16930)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-22 09:32:22 -07:00
30bc3e0f66 [FEAT][ROCm]: Support AITER MLA (#15893)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: qli88 <qiang.li2@amd.com>
2025-04-22 09:31:13 -07:00
f34410715f [frontend] enhance tool_calls type check (#16882)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-22 15:40:24 +00:00
68d4c33202 [Misc] Add S3 environment variables for better support of MinIO. (#16977)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-22 14:27:36 +00:00
f961d7f6ef [BugFix] Pass in correct VLLM config in FlashInfer backend (#13207) (#16973)
Signed-off-by: 苏政渊 <suzhengyuan@moonshot.cn>
Co-authored-by: 苏政渊 <suzhengyuan@moonshot.cn>
2025-04-22 06:44:10 -07:00
d059110498 Improve configs - SpeculativeConfig (#16971)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-22 12:55:36 +00:00
571e8dd65e [Bugfix] Fix distributed bug again in Qwen2.5-VL & Qwen2.5-Omni (#16974)
Signed-off-by: fyabc <suyang.fy@alibaba-inc.com>
2025-04-22 12:23:17 +00:00
4b91c927f6 [Misc] refactor example series (#16972)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-22 11:44:21 +00:00
0e237f0035 [FEAT][ROCm] Integrate Paged Attention Kernel from AITER (#15001)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-04-22 02:46:28 -07:00
8f7bace7c3 [Doc] Improve documentation for multimodal CLI args (#16960)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-22 08:35:35 +00:00
e4d6144232 [BugFix] Fix incremental detokenization perf issue (#16963)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-22 08:16:19 +00:00
8d32dc603d [Kernel] Support Microsoft Runtime Kernel Lib for our Low Precision Computation - BitBLAS (#6036)
Signed-off-by: xinyuxiao <xinyuxiao2024@gmail.com>
Co-authored-by: xinyuxiao <xinyuxiao2024@gmail.com>
2025-04-22 09:01:36 +01:00
c4ab9f3e71 [V1] Remove pre-allocation for KV cache (#16941)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-22 00:52:18 -07:00
2689d5c027 [Model] Use autoweightloader for mamba (#16950)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
2025-04-22 07:48:15 +00:00
acba33a0f1 [Bugfix] Fix the issue where llm.generate cannot be called repeatedly after setting GuidedDecodingParams (#16767)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-04-22 06:02:20 +00:00
a114bf20a3 [Perf] Optimize _update_states for GPU model runner (#16910)
Signed-off-by: snowcharm <snowcharmqq@gmail.com>
2025-04-22 14:01:54 +08:00
3097ce3a32 [Doc] Update ai_accelerator/hpu-gaudi.inc.md (#16956)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-22 05:33:27 +00:00
d6da9322c8 [Bugfix] Fix f-string for Python 3.9-3.11 (#16962)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-21 21:45:55 -07:00
71ce44047f Support S3 Sharded loading with RunAI Model Streamer (#16317)
Signed-off-by: Omer Dayan (SW-GPU) <omer@run.ai>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-21 21:21:49 -07:00
188b7f9b8c [Performance][ROCm] Add skinny gemms for unquantized linear on ROCm (#15830)
Signed-off-by: charlifu <charlifu@amd.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
2025-04-21 20:46:22 -07:00
b9b4746950 [V1] Remove additional_config check (#16710)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-21 20:45:27 -07:00
7b8a2ab76f [Kernel] Add expert_map support to Cutlass FP8 MOE (#16861)
Signed-off-by: varun sundar rabindranath <vsundarr@redhat.com>
Co-authored-by: varun sundar rabindranath <vsundarr@redhat.com>
2025-04-21 20:44:32 -07:00
c9acbf1141 [Misc] Remove the chunked prefill warning for LoRA (#16925)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-21 20:44:24 -07:00
5b794cae8d [ROCm] Add aiter tkw1 kernel for Llama4 fp8 (#16727)
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-04-21 20:42:34 -07:00
0e4254492f [Bugfix]: fix issue with n>1 sampling on v1 requests overriding each other (#16863)
Signed-off-by: Jeffrey Li <jeffrey.dot.li@gmail.com>
2025-04-22 11:40:19 +08:00
1311913f55 [BugFix][Spec Decode] No in-place update to draft probs (#16952)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-21 19:54:19 -07:00
29f395c97c [Doc] Remove unnecessary V1 flag (#16924)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-21 21:04:38 -04:00
fa3bba2a53 [TPU][V1] Enable Top-P (#16843)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-22 00:46:07 +00:00
986537f1c3 [V1] V1 FlashInfer Attention (#16684)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Aurick Qiao <qiao@aurick.net>
2025-04-22 00:38:41 +00:00
210207525e [TPU][V1] Capture multimodal encoder during model compilation (#15051)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Siyuan Liu <lsiyuan@google.com>
2025-04-21 18:36:59 -06:00
71eda0bb76 Update Qwen1.5-MoE-W4A16-compressed-tensors.yaml (#16946) 2025-04-21 18:35:32 -06:00
471fe65630 [TPU][V1] Implicitly adjust page size when there's SMEM OOM (#16871)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-21 15:43:13 -06:00
3a0fba5cf4 [V1][Spec Decode] Handle draft tokens beyond max_model_len (#16087)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-21 12:38:50 -07:00
299ebb62b2 [Core] Speed up decode by remove synchronizing operation in sampler (#16436)
Signed-off-by: Chanh Nguyen <cnguyen@linkedin.com>
Co-authored-by: Chanh Nguyen <cnguyen@linkedin.com>
2025-04-21 18:18:22 +00:00
f728ab8e35 [Doc] mention how to install in CPU editable mode (#16923)
Signed-off-by: David Xia <david@davidxia.com>
2025-04-21 17:45:51 +00:00
63e26fff78 [doc] install required python3-dev apt package (#16888)
Signed-off-by: David Xia <david@davidxia.com>
2025-04-21 16:15:18 +00:00
fe3462c774 [XPU][Bugfix] minor fix for XPU (#15591)
Signed-off-by: yan ma <yan.ma@intel.com>
2025-04-22 00:02:57 +08:00
3b34fd5273 Raise error for data-parallel with benchmark_throughput (#16737)
Signed-off-by: Kartik Ramesh <kartikx2000@gmail.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-04-21 23:51:43 +08:00
55d6d3fdb8 [Bugfix] Fix GLM rotary_dim issue and support v1 (#16912)
Signed-off-by: isotr0py <2037008807@qq.com>
2025-04-21 14:26:34 +00:00
7272bfae77 [Misc] Refactor platform to get device specific stream and event (#14411)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-21 21:25:49 +08:00
d9ac9e3dc5 [Misc] fix collect_env version parse (#15267)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-21 20:29:40 +08:00
d41faaf9df Restore buffers when wake up from level 2 sleep (#16564) (#16889)
Signed-off-by: Han <zh950713@gmail.com>
2025-04-21 20:18:28 +08:00
b34f33438a [Doc] Split dummy_processor_inputs() in Multimodal Docs (#16915)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-04-21 11:10:01 +00:00
26c0406555 [Bugfix] Fix distributed bug in Qwen2.5-VL & Qwen2.5-Omni (#16907) 2025-04-21 10:25:21 +00:00
4c41278b77 [CI/CD][V1] Add spec decode tests to CI (#16900)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-20 22:37:16 -07:00
bb3605db85 [Bugfix] Fix v1/spec_decode/test_ngram.py (#16895)
Signed-off-by: qizixi <qizixi@meta.com>
2025-04-20 20:54:29 -07:00
fe742aef5a [easy] Pass compile_fx only the config patches (#16845)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-20 12:25:19 +08:00
4b07d36891 Improve configs - CacheConfig (#16835)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-20 12:25:04 +08:00
87aaadef73 Serialize tensors using int8 views (#16866)
Signed-off-by: Staszek Pasko <staszek@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-19 10:28:34 -07:00
682e0b6d2f Log how much time loading a compiled artifact takes (#16848)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-19 16:50:46 +00:00
d6195a748b [doc] update hyperlink (#16877)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-19 16:40:38 +00:00
205d84aaa9 [VLM] Clean up models (#16873)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-19 12:13:06 +00:00
5124f5bf51 [Model] Qwen2.5-Omni Cleanup (#16872) 2025-04-19 09:37:02 +00:00
83f3c3bd91 [Model] Refactor Phi-4-multimodal to use merged processor and support V1 (#15477)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-19 02:26:11 -07:00
d9737ca1c6 [V1][Misc] stop update prefix cache stats when logs_stats is disabled (#16460)
Signed-off-by: vie-serendipity <2733147505@qq.com>
2025-04-19 02:25:19 -07:00
9d4ca19d50 [Misc] Benchmarks for audio models (#16505)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-19 02:24:14 -07:00
2ef0dc53b8 [Frontend] Add sampling params to v1/audio/transcriptions endpoint (#16591)
Signed-off-by: Jannis Schönleber <joennlae@gmail.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Jannis Schönleber <joennlae@gmail.com>
2025-04-19 07:03:54 +00:00
1d4680fad2 [rocm][MI300] llama4 maverick fp8 moe config tp8 (#16847)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-04-19 06:21:43 +00:00
2c1bd848a6 [Model][VLM] Add Qwen2.5-Omni model support (thinker only) (#15130)
Signed-off-by: fyabc <suyang.fy@alibaba-inc.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Xiong Wang <wangxiongts@163.com>
2025-04-18 23:14:36 -07:00
5c9121203c [release] Publish neuron docker image (#16733)
Signed-off-by: omrishiv <327609+omrishiv@users.noreply.github.com>
2025-04-18 17:11:25 -07:00
490b1698a5 [Doc] Updated Llama section in tool calling docs to have llama 3.2 config info (#16857)
Signed-off-by: jmho <jaylenho734@gmail.com>
2025-04-18 23:28:53 +00:00
5a5e29de88 [Misc] refactor examples series - Chat Completion Client With Tools (#16829)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-18 23:24:42 +00:00
9ec11b459c Update convert_to_csv.py 2025-04-18 09:54:27 -07:00
3d3ab3689f [New Model]: Snowflake Arctic Embed (Family) (#16649) 2025-04-18 08:11:57 -07:00
686623c5e7 Fix nullable_kvs fallback (#16837)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-18 05:58:39 -07:00
aadb656562 [Misc] Clean up Kimi-VL (#16833)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-18 05:15:09 -07:00
87e067de41 [Model] use AutoWeightsLoader for BigCode, GPT-J (#16823)
Signed-off-by: Jonghyun Choe <andy.choe729@gmail.com>
2025-04-18 10:42:41 +00:00
26507f8973 [Docs] Fix a link and grammar issue in production-stack.md (#16809)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-18 06:42:58 +00:00
9c1d5b456d [Doc] add podman setup instructions for official image (#16796)
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
2025-04-18 06:10:49 +00:00
e31045f95c [Bugfix] fix pp for llama4 (#16746)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-04-18 13:51:30 +08:00
aaec845f8e [ROCm] [Attention] Cleanup ROCm output passing (#16431)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-04-18 05:46:45 +00:00
7bdfd29a35 [Misc] add collect_env to cli and docker image (#16759)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-17 22:13:35 -07:00
e78587a64c Improve-mm-and-pooler-and-decoding-configs (#16789)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-17 22:13:32 -07:00
7eb4255628 [BugFix] Accuracy fix for llama4 int4 - improperly casted scales (#16801)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-17 22:13:29 -07:00
6a0f547561 Add hardware print to TPU V1 test (#16792) 2025-04-17 22:13:26 -07:00
30ed81b7ca [V1][Structured Output] Minor modification to _validate_structured_output() (#16748)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-18 13:12:54 +08:00
7a4a5de729 [Misc] Update outdated note: LMCache now supports chunked prefill (#16697)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-18 05:12:42 +00:00
c16fb5dae8 [Doc] Improve help examples for --compilation-config (#16729)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-17 21:22:34 -07:00
e37073efd7 Add property-based testing for vLLM endpoints using an API defined by an OpenAPI 3.1 schema (#16721)
Signed-off-by: Tarun Kumar <takumar@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-17 21:08:27 -07:00
183dad7a85 [Attention] Update to latest FA3 code (#13111)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-17 15:14:07 -07:00
3408e47159 [P/D][V1] KV Connector API V1 (#15960)
Signed-off-by: ApostaC <yihua98@uchicago.edu>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Signed-off-by: remi <remi@mistral.ai>
Co-authored-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Co-authored-by: Rémi Delacourt <54138269+Flechman@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tysmith@redhat.com>
2025-04-17 13:22:40 -07:00
0377b8310b [MLA] Simplification to batch P/D reordering (#16673)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-17 16:12:09 -04:00
e4755f7fac [V1][Metrics] Fix http metrics middleware (#15894) 2025-04-17 19:52:18 +00:00
92edf35826 [ROCM] enable aiter fused moe kernel for llama4 bf16 checkpoints (#16674) 2025-04-17 11:44:34 -07:00
eb5819b2d9 [V1][TPU] Enable Top K (#15489)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Hyesoo Yang <hyeygit@gmail.com>
Co-authored-by: Hyesoo Yang <hyeygit@gmail.com>
2025-04-17 18:18:11 +00:00
5989f4684d [TPU][V1] Fix padding recompilation when max-num-batched-tokens is not even (#16726)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-17 18:09:57 +00:00
5125d72f02 [Model] use AutoWeightsLoader for olmoe,opt,orion,persimmon,phi3_small (#16548)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-17 17:48:31 +00:00
a018e555fd [Kernel] Add fp8_w8a8 fused MoE kernel tuning configs for DeepSeek V3/R1 on NVIDIA H20 (#16753)
Signed-off-by: ximing.wxm <ximing.wxm@antgroup.com>
Co-authored-by: ximing.wxm <ximing.wxm@antgroup.com>
2025-04-18 00:01:30 +08:00
6211b92273 [Bugfix]Fix index out of range error in api server log (#16787)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-04-17 09:01:07 -07:00
05fcd1b430 [V1][Perf] Faster incremental detokenization (#15137)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-17 07:45:24 -07:00
7c02d6a137 [Doc] Changed explanation of generation_tokens_total and prompt_tokens_total counter type metrics to avoid confusion (#16784)
Signed-off-by: insukim1994 <insu.kim@moreh.io>
2025-04-17 14:10:08 +00:00
11c3b98491 [Doc] Document Matryoshka Representation Learning support (#16770) 2025-04-17 13:37:37 +00:00
dbe7f07001 [Doc] Make sure to update vLLM when installing latest code (#16781)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-17 06:53:31 -06:00
c69bf4ee06 fix: hyperlink (#16778)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-17 11:34:20 +00:00
d27ea94034 Improve configs - TokenizerPoolConfig + DeviceConfig (#16603)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-17 11:19:42 +00:00
99ed526101 [Misc] refactor examples series - lmcache (#16758)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-17 11:02:35 +00:00
207da28186 [Doc] Fix a 404 link in installation/cpu.md (#16773)
Signed-off-by: windsonsea <haifeng.yao@daocloud.io>
2025-04-17 10:46:21 +00:00
5b1aca2ae3 [Bugfix] Fix GLM4 model (#16618)
Signed-off-by: intervitens <intervitens@tutanota.com>
2025-04-17 03:35:07 -07:00
d8e557b5e5 [doc] add open-webui example (#16747)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-17 18:27:32 +08:00
61a44a0b22 [Doc] Add more tips to avoid OOM (#16765)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-17 09:54:34 +00:00
a6481525b8 [misc] ignore marlin_moe_wna16 local gen codes (#16760)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-17 17:15:14 +08:00
8cac35ba43 [Ray] Improve documentation on batch inference (#16609)
Signed-off-by: Richard Liaw <rliaw@berkeley.edu>
2025-04-16 22:19:26 -07:00
9dbf7a2dc1 [V1] Remove log noise when idle (#16735)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-16 21:34:08 -07:00
607029e515 [Bugfix] Revert max_prompt_len validation for decoder-only models. (#16741)
Signed-off-by: David Heineman <david@davidheineman.com>
2025-04-16 21:33:15 -07:00
cb072ce93b [Bugfix] Update Florence-2 tokenizer to make grounding tasks work (#16734)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-17 04:17:39 +00:00
95aca283b4 [rocm][V0] fix selection logic for custom PA in V0 (#16426)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-04-16 19:52:11 -07:00
2b05b8ce69 [V1][Frontend] Improve Shutdown And Logs (#11737)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Andrew Feldman <afeldman@neuralmagic.com>
Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-16 19:48:34 -07:00
3c776dcefb Adding vllm buildkite job for IBM Power (#16679)
Signed-off-by: Aaruni Aggarwal <aaruniagg@gmail.com>
2025-04-17 10:47:47 +08:00
2cbd4d2999 [V1][Spec Dec Bug Fix] Respect Spec Dec Method Specification (#16636)
Signed-off-by: Bryan Lu <yuzhelu@amazon.com>
2025-04-16 19:47:26 -07:00
3092375e27 [V1][Performance] Implement custom serialization for MultiModalKwargs [Rebased] (#16432)
Signed-off-by: Staszek Pasko <staszek@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-16 19:28:32 -07:00
3cd91dc955 Help user create custom model for Transformers backend remote code models (#16719)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-17 01:05:59 +00:00
8a7368e069 [Misc] Remove redundant comment (#16703)
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-04-17 00:44:52 +00:00
93e561ec4d Improve error for structured output backend selection (#16717)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-17 00:35:35 +00:00
e1b004839a [Hardware] Add processor inputs to platform validation (#16680)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-04-16 09:28:42 -07:00
ee378f3d49 [Model] support modernbert (#16648)
Signed-off-by: 唯勤 <xsank.mz@alibaba-inc.com>
Co-authored-by: 唯勤 <xsank.mz@alibaba-inc.com>
2025-04-16 05:30:15 -07:00
e82ee40de3 [Bugfix][Kernel] fix potential cuda graph broken for merge_attn_states kernel (#16693)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-16 03:31:39 -07:00
facbe2a114 [Doc] Improve OOM troubleshooting (#16704)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-16 18:29:48 +08:00
7168920491 [Misc] refactor examples series (#16708)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-16 10:16:36 +00:00
21378a2323 [CI] Cleanup additional_dependencies: [toml] for pre-commit yapf hook (#16405)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-04-16 10:05:31 +00:00
976711d9db [V1][Structured Output] Move xgrammar related utils to backend_xgrammar.py (#16578)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-16 17:01:36 +08:00
44fa4d556c [ROCM] Bind triton version to 3.2 in requirements-built.txt (#16664)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-04-16 14:05:28 +08:00
3ac98edcb1 [Feature] add model aware kv ops helper (#16020)
Signed-off-by: billishyahao <bill.he@amd.com>
2025-04-15 23:00:43 -07:00
966c742ed2 Disable remote caching when calling compile_fx (#16611)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-15 22:18:28 -07:00
0d7d05f4b6 [Misc] Modify LRUCache touch (#16689)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-16 04:51:38 +00:00
96bb8aa68b [Bugfix] fix gpu docker image mis benchmarks dir (#16628)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-15 21:21:14 -07:00
3badb0213b [Model] Add PLaMo2 (#14323)
Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
Signed-off-by: shemmi <shemmi@preferred.jp>
Co-authored-by: Kento Nozawa <nzw0301@preferred.jp>
Co-authored-by: Hiroaki Mikami <mhiroaki@preferred.jp>
Co-authored-by: Calvin Metzger <metzger@preferred.jp>
2025-04-15 19:31:30 -07:00
fdcb850f14 [Misc] Enable vLLM to Dynamically Load LoRA from a Remote Server (#10546)
Signed-off-by: Angky William <angkywilliam@Angkys-MacBook-Pro.local>
Co-authored-by: Angky William <angkywilliam@Angkys-MacBook-Pro.local>
2025-04-15 22:31:38 +00:00
54a66e5fee [Misc] Update compressed-tensors WNA16 to support zero-points (#14211) 2025-04-15 07:33:51 -06:00
280d62b8a2 [Kernel] Remove redundant Exp calculations (#16123)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-15 12:58:37 +00:00
1666e66443 Add "/server_info" endpoint in api_server to retrieve the vllm_config.  (#16572)
Signed-off-by: Xihui Cang <xihuicang@gmail.com>
2025-04-15 11:50:38 +00:00
1575c1701a [CI/Build] Fix LoRA OOM (#16624)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-15 16:38:19 +08:00
6ae996a873 [Misc] refactor argument parsing in examples (#16635)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-15 08:05:30 +00:00
b590adfdc1 Fix vLLM x torch.compile config caching (#16491)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-14 23:11:11 -07:00
b4fe16c75b Add vllm bench [latency, throughput] CLI commands (#16508)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-14 23:10:35 -07:00
bc5dd4f669 [Bugfix] Fix broken GritLM model and tests (missing pooling_metadata) (#16631)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2025-04-14 23:09:58 -07:00
dbb036cf61 [Bugfix] Fix tests/kernels/test_mamba_ssm_ssd.py (#16623)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-04-15 05:35:38 +00:00
70e7ed841d [BugFix]: Update minimum pyzmq version (#16549)
Signed-off-by: Taneem Ibrahim <taneem.ibrahim@gmail.com>
Co-authored-by: mgoin <michael@neuralmagic.com>
2025-04-14 20:06:03 -07:00
d06ba4ed3f [Kernel] moe wna16 marlin kernel (#14447)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-14 20:05:22 -07:00
6b40996ae8 [Core][Bugfix] Fix Offline MM Beam Search (#16390)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-15 10:33:02 +08:00
d2020acac7 config check sleep mode support oot platforms (#16562) 2025-04-14 16:31:50 -07:00
1eb3c2ed48 [DOC][TPU] Add core idea about avoiding recompilation after warmup (#16614)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-14 21:56:06 +00:00
c64ee87267 [Hardware][TPU] Add torchvision to tpu dependency file (#16616)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-04-14 17:50:46 -04:00
b1308b84a3 [Model][VLM] Add Kimi-VL model support (#16387)
Signed-off-by: courage17340 <courage17340@163.com>
2025-04-14 21:41:48 +00:00
7b5ecf79bd s390x: Fix PyArrow build and add CPU test script for Buildkite CI (#16036)
Signed-off-by: Nishan Acharya <Nishan.Acharya@ibm.com>
2025-04-14 10:55:32 -07:00
9883a18859 Fix triton install condition on CPU (#16600)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-14 17:06:01 +00:00
b3f2fddd17 [TPU][V1] Fix exponential padding when max-num-batched-tokens is not a power of 2 (#16596)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-14 17:01:05 +00:00
aa29841ede [Bugfix] Multi-modal caches not acting like LRU caches (#16593)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-14 09:24:16 -07:00
6bf27affb6 [fix]: Dockerfile.ppc64le fixes for opencv-python and hf-xet (#16048)
Signed-off-by: Md. Shafi Hussain <Md.Shafi.Hussain@ibm.com>
2025-04-14 17:08:39 +01:00
1dd23386ec [Misc] Update usage with mooncake lib for kv transfer (#16523)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-04-14 11:31:37 +00:00
7cbfc10943 [Misc] refactor examples (#16563)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-14 09:59:15 +00:00
ce4ddd2d1a [Misc] remove warning if triton>=3.2.0 (#16553)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-14 02:39:47 -07:00
e51929ebca Improve configs - SchedulerConfig (#16533)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-14 17:24:16 +08:00
dc1b4a6f13 [Core][V0] Enable regex support with xgrammar (#13228)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-14 10:13:38 +08:00
63d2705edb [Benchmark][Bugfix] Fix SonnetDataset default values in benchmark_throughput.py (#16556) 2025-04-13 17:20:26 -07:00
d085a44082 Enable PTPC FP8 for CompressedTensorsW8A8Fp8MoEMethod (triton fused_moe) (#16537)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-13 14:55:18 +00:00
f49e5aff11 [V1][Spec Decode] KV cache slots for eagle heads (#16370)
Signed-off-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
2025-04-12 19:42:51 -07:00
6c11ecf8d3 [Bugfix] Validate logit biases to prevent out of vocab ids crashing engine (#16529)
Signed-off-by: Ryan McConville <ryan@ryanmcconville.com>
2025-04-12 20:19:19 +00:00
93e5f3c5fb [Perf] Optimize Preparing Inputs for GPU Model Runner (#16484)
Signed-off-by: snowcharm <snowcharmqq@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-12 22:54:37 +08:00
70363bccfa Fix syntaxWarning: invalid escape sequence '\s' (#16532)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-04-12 14:39:42 +00:00
3cdc57669f [Misc] Delete redundant code (#16530)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-04-12 11:21:37 +00:00
68bb122eb4 [MISC] Make GroupCoordinator compatible with out-of-tree devices (#16464)
Signed-off-by: hzji210@gmail.com <hzji210@gmail.com>
2025-04-12 09:20:25 +00:00
d9fc8cd9da [V1] Enable multi-input by default (#15799)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-12 08:52:39 +00:00
f069f3ea74 [Misc] Openai transcription client example use same Whisper model (#16487)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-12 07:27:03 +00:00
c5bc0e7fcc [Misc] Update chat utils tests (#16520)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-12 06:48:43 +00:00
4a3a518722 fix: spelling (#16466)
Signed-off-by: Tianer Zhou <ezhoureal@gmail.com>
2025-04-11 23:24:22 -07:00
fbf722c6e6 [Frontend] support matryoshka representation / support embedding API dimensions (#16331) 2025-04-11 23:23:10 -07:00
e92d7085bf [Feature][V1] Add xgrammar to support minLength, maxLength with test (#16516)
Signed-off-by: Leon Seidel <leon.seidel@fau.de>
2025-04-11 23:22:07 -07:00
bd6028d6b0 Optimized topk for topk=1 (Llama-4) (#16512)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-12 14:21:08 +08:00
802329dee9 [Doc] Update Llama4 Model Names in Supported Models (#16509)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-04-12 02:53:10 +00:00
41cc883c29 [BugFix] Handle non-contiguous tensors properly when serializing (#16492)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-11 17:54:06 -07:00
57504a4bcf [CI][Bugfix] Add mistral_tool_use to Ci (#16517)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-11 17:52:38 -07:00
ed4792c990 [Doc] Fix link to vLLM blog (#16519)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-04-11 17:39:23 -07:00
87b836ba77 Bugfix for PixtralHF models without spatial_merge_size (#16513)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-11 23:32:22 +00:00
56c76c2e0e [Bugfix] clean up duplicated code (#16485)
Signed-off-by: Gogs <gogs@fake.local>
Co-authored-by: Gogs <gogs@fake.local>
2025-04-11 23:19:40 +00:00
c09632a66c Update openai_compatible_server.md (#16507)
Signed-off-by: Christian Sears <csears@redhat.com>
2025-04-11 22:54:58 +00:00
a3bf8d4a2b [Kernel] Add tuned FusedMoE kernel config for Llama4 Scout, TP=8 on H100 (#16488) 2025-04-12 06:26:55 +08:00
16eda8c43a [Frontend] Added chat templates for LLaMa4 pythonic tool calling (#16463)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Kai Wu <kaiwu@meta.com>
2025-04-12 06:26:17 +08:00
cd77382ac1 Improve configs - LoadConfig (#16422)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-11 20:27:27 +00:00
71b9cde010 [Bugfix] handle alignment of encoder_seq_lens in mllama.py (#14784)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2025-04-11 19:59:50 +00:00
5285589f37 [Doc] Document InternVL3 support (#16495)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-11 19:41:09 +00:00
f41647ee6b [Kernel] Support W8A8 channel-wise weights and per-token activations in triton fused_moe_kernel (#16366)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-11 17:54:08 +00:00
4d022cbc75 [TPU][V1] Make --disable_chunked_mm_input mandatory for serving MM models (#16483)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-11 17:06:14 +00:00
70de35a881 Fix erroneous "model doesn't support compile" warning (#16486)
Signed-off-by: rzou <zou3519@gmail.com>
2025-04-11 16:24:36 +00:00
34b2cf3b33 [Hardware][Intel-Gaudi] Multi-step scheduling implementation for HPU (#12779)
Signed-off-by: Tomasz Zielinski <tomasz.zielinski@intel.com>
2025-04-11 07:38:36 -07:00
9e90c9f73f [Bugfix] Fix bugs of running Quark quantized models (#16236)
Signed-off-by: chaow <chaow@amd.com>
2025-04-11 10:18:32 -04:00
e9528f6dc6 [Kernel] support merge_attn_states CUDA kernel, 3x speedup (#16173)
Signed-off-by: DefTruth <qiustudent_r@163.com>
2025-04-11 06:50:50 -06:00
51baa9c333 Don't install triton on ppc64le platform (#16470)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-11 10:11:00 +00:00
35e076b3a8 [Misc] update api_client example (#16459)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-11 10:05:40 +00:00
a26f59ccbc [Misc] Raise error for V1 not supporting Long LoRA. (#16415)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-11 01:51:20 -07:00
aa3b3d76e0 Enforce valid max_num_batched_tokens when disable_chunked_mm_input=True (#16447)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-11 08:09:52 +00:00
f7030df3be [Core][LoRA][1/N] Add LoRA for EncoderDecoderModelRunner (#15990)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-11 15:32:37 +08:00
905e91e9ac Revert "[Model] use AutoWeightsLoader for deepseek_v2, internlm2" (#16453) 2025-04-11 06:44:22 +00:00
f8f9c0ba62 [Bugfix] Don't set an upper bound on repetition penalty (#16403)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-04-11 14:19:40 +08:00
dda811021a [CPU][Bugfix] Fix CPU docker issues (#16454)
Signed-off-by: jiang.li <jiang1.li@intel.com>
2025-04-11 14:19:07 +08:00
93195146ea [Bugfix][VLM] Fix failing Phi-4-MM multi-images tests and add vision-speech test (#16424)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-11 04:57:16 +00:00
ed37599544 Update supported_hardware.md for TPU INT8 (#16437) 2025-04-11 12:28:07 +08:00
99ef59cf7f [Llama4] Enable attention temperature tuning by default for long context (>32k) (#16439)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-04-10 21:26:07 -07:00
d544d141ec update benchmark_serving_structured_output to include auto backend (#16438)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-04-11 12:25:52 +08:00
3e397a9484 check input length of sonnet samples (#16423)
Signed-off-by: alexey-belyakov <alexey.belyakov@intel.com>
2025-04-11 10:15:06 +08:00
268c325078 Fix range_ratio Bug in RandomDataset (#16126)
Signed-off-by: jadewang21 <jadewangcn@outlook.com>
2025-04-10 15:31:17 -07:00
3cc9af88ff [TPU][V1] Disable per-request seed/Generator (#16172)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-10 17:05:44 -04:00
7cd0bd7212 [Bugfix] Fix output token length check logic (#16419)
Signed-off-by: look <eeslook@163.com>
2025-04-10 20:16:48 +00:00
56d4aefa33 [VLM] Avoid unnecessary dummy multimodal data during processing (#16416)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-10 19:32:14 +00:00
dd143ef541 [V1] Zero-copy tensor/ndarray serialization/transmission (#13790)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-10 19:23:14 +00:00
daefed052c [Model] Reduce redundant computations in mamba2 blocks for Bamba-9B (#15423)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
2025-04-10 19:07:07 +00:00
5fbab20e02 [Bugfix] Fix bug when dataset is json (#15899)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-04-10 18:35:41 +00:00
e8224f3dca [V1][Spec Decode] Eagle Model loading (#16035)
Signed-off-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
2025-04-10 11:21:48 -07:00
9665313c39 [V1] Set structured output backend to auto by default (#15724)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-10 17:53:26 +00:00
0c54fc7273 Improve configs - ParallelConfig (#16332)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-10 17:34:37 +00:00
c1b57855ec [TPU][V1] Use language_model interface for getting text backbone in MM (#16410)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-10 17:32:04 +00:00
83b824c8b4 [VLM] Remove BaseProcessingInfo.get_mm_max_tokens_per_item (#16408)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-10 09:06:58 -07:00
7678fcd5b6 Fix the torch version parsing logic (#15857) 2025-04-10 07:37:47 -07:00
8661c0241d [CI] Add auto update workflow for Dockerfile graph (#11879)
Signed-off-by: wineandchord <guoqizhou19@gmail.com>
2025-04-10 13:43:05 +00:00
ce8d6b75fc [doc] update the wrong link (#16401)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-10 21:02:37 +08:00
61de3ef74b [Model] Remove image mm limit for LLaMa4 (#16365)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
2025-04-10 09:36:27 +00:00
ec1f9c8c91 Update Numba to 0.61.2 (#16376)
Signed-off-by: cyy <cyyever@outlook.com>
2025-04-10 07:59:37 +00:00
65e09094c4 [doc] add download model tips (#16389)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-10 07:45:26 +00:00
c70cf0fe06 [Kernel] Use moe_wna16 kernel for compressed tensors wna16 moe models (#16038)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-10 15:08:47 +08:00
a5d11a54dc [Bugfix] Fix validation error for text-only Mllama 3.2 (#16377)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-10 14:19:42 +08:00
3d4c87758e [Misc] Update transformers version limits of multi-modal tests (#16381)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-09 23:03:33 -07:00
a9bd832fc5 [Model] use AutoWeightsLoader for deepseek_v2, internlm2 (#16383)
Signed-off-by: Aaron Ang <aaron.angyd@gmail.com>
2025-04-09 23:01:00 -07:00
417bcefbae fix sonnet dataset sample when prefix len is very small (#16379)
Signed-off-by: Chenyaaang <chenyangli@google.com>
2025-04-10 05:35:07 +00:00
baada0e737 [Bugfix][TPU] Fix TPU validate_request (#16369)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-04-10 12:55:12 +08:00
82eb61dd4c [misc] use tqdm.auto where appropriate (#16290)
Signed-off-by: Benjamin Kitor <bkitor@gigaio.com>
2025-04-09 21:54:54 -07:00
0d4d06fe2f [CI][Bugfix] Pin triton version for CPU (#16384)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-04-10 04:35:00 +00:00
4aed0ca6a2 [bugfix] Avoid the time consumption caused by creating dummy videos. (#16371) 2025-04-10 04:30:05 +00:00
1621b25288 [TPU] Fix dummy loading OOM (#16372)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-10 04:06:16 +00:00
a564797151 [Model] use AutoWeightsLoader for granite, granitemoe, granitemoeshared, grok1, mixtral (#16325)
Signed-off-by: Aaron Ang <aaron.angyd@gmail.com>
2025-04-09 20:07:40 -07:00
1da6a09274 [Bugfix]: do not shutdown server if skip_special_use=False for MistralTokenizer (#14094)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-09 19:43:09 -07:00
1e44ffc3ff Add GLM-4-0414 support (#16338)
Signed-off-by: lvfei.lv <lvfei.lv@alibaba-inc.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Ajay Vohra <ajayvohr@amazon.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
Co-authored-by: Accelerator1996 <lvfei.lv@alibaba-inc.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: yihong <zouzou0208@gmail.com>
Co-authored-by: Lucia Fang <116399278+luccafong@users.noreply.github.com>
Co-authored-by: ajayvohra2005 <ajayvohr@amazon.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-10 09:19:42 +08:00
a454748544 [TPU][V1] Refine tpu_model_runner to mitigate future recompilation issues (#16275)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-09 18:51:51 -06:00
1bff42c4b7 [Misc] refactor Structured Outputs example (#16322)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-09 23:32:42 +00:00
cb391d85dc [Hardware] add platform-specific request validation api (#16291)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-04-09 12:50:01 -07:00
fee5b8d37f [Build/CI] Add tracing deps to vllm container image (#15224)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-09 19:14:06 +00:00
b2ce859bd2 Fix benchmark_throughput.py --backend=hf (#16352)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-09 19:09:28 +00:00
566f10a929 [CI]Fix hpu docker and numpy version for CI (#16355)
Signed-off-by: Chendi Xue <chendi.xue@intel.com>
2025-04-09 17:52:26 +00:00
c3b5189137 [Bugfix] catch AssertionError in MistralTokenizer as ValueError (#16344)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-09 17:33:24 +00:00
a25866ac8d [Bugfix] Fix profiling.py (#16202)
Signed-off-by: zh Wang <rekind133@outlook.com>
2025-04-09 17:03:34 +00:00
098900d7c2 Revert "Update label-tpu mergify and remove removal bot" (#16350) 2025-04-09 07:59:36 -07:00
98d01d3ce2 [Bugfix][Frontend] respect provided default guided decoding backend (#15476)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-04-09 05:11:10 -07:00
d55244df31 [Model] Add SupportsMultiModal.get_language_model interface (#16007)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-04-09 04:12:54 -07:00
04149cce27 [BugFix] fix some typos found by typos. (#16314)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-09 03:43:59 -07:00
24834f4894 update neuron config (#16289)
Signed-off-by: Ajay Vohra <ajayvohr@amazon.com>
2025-04-09 03:43:22 -07:00
ec7da6fcf3 [BugFix] llama4 qknorm should be not shared across head (#16311)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-04-09 00:59:14 -07:00
819d548e8a [BugFix] logger is not callable (#16312)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-09 00:59:02 -07:00
477d2a8aa2 Update label-tpu mergify and remove removal bot (#16298) 2025-04-09 07:56:25 +00:00
e484e02857 [Bugfix] Avoid transferring cached multi-modal items from P0 to P1 (#16273)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-09 00:51:27 -07:00
24f6b9a713 [Misc] Fix test_sharded_state_loader.py(#16004) (#16005)
Signed-off-by: lvfei.lv <lvfei.lv@alibaba-inc.com>
2025-04-09 14:47:30 +08:00
9cdde47289 [BugFix] Fix fusion test and add them to CI (#16287)
Signed-off-by: luka <luka@neuralmagic.com>
2025-04-08 23:46:45 -07:00
b1eb4ca152 [TPU] Update PyTorch/XLA (#16288)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-09 14:46:32 +08:00
87b4ac56c2 [CI][Bugfix] Fix bad tolerance for test_batch_base64_embedding (#16221)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-09 04:14:46 +00:00
cb84e45ac7 [Core] Upgrade to xgrammar 0.1.18, add cache size limit (#16283)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-08 19:13:22 -07:00
4716377fbc [Feature] Estimate max-model-len use available KV cache memory (#16168)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-08 19:12:51 -07:00
4e9cf8c1dd [Bugfix] fix gettid method is not define (#16084)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-08 19:12:44 -07:00
2976dc27e9 [Bug] [ROCm] Fix Llama 4 Enablement Bug on ROCm: V0 ROCmFlashAttentionImpl and Triton Fused MoE bugs (#16198)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
Co-authored-by: Hongxia Yang <hongxia.yang@amd.com>
Co-authored-by: kliuae <kuanfu.liu@embeddedllm.com>
2025-04-08 19:12:34 -07:00
102bf967f0 [Model] Add smolvlm support (#16017)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-08 19:12:17 -07:00
1f4b09b525 Add support to modelopt quantization of Mixtral model (#15961)
Signed-off-by: Yue <yueshen@nvidia.com>
2025-04-09 01:53:31 +00:00
86c3369eb8 [CI/Build] Fix CI LoRA failure (#16270)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-04-09 09:13:56 +08:00
2755c34a8f [V1] Update structured output offline inference example (#15721)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-04-08 22:34:09 +00:00
db10422184 [Bugfix] fix deepseek fp16 scale bug (#14809)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-08 16:56:09 -04:00
e1a2c699dd [BugFix] Fix Llama4 - Index Error When Single Request Near Max Context (#16209)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-04-08 18:56:51 +00:00
0115ccd5c0 Add warning that content below line in template will be removed (#16276)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-08 18:18:40 +00:00
40b4284fe3 [Bugfix] Handle process_weights_after_loading for QKVCrossParallelLinear (#15328)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-08 10:02:23 -07:00
4ebc0b9640 [Bugfix] Proper input validation for multi-modal encoder-decoder models (#16156)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-08 09:45:21 -07:00
dc96fd54c6 [Misc] Avoid stripping meaningful whitespace from nvidia-smi topo -m output in collect_env.py (#16272)
Signed-off-by: imkero <kerorek@outlook.com>
2025-04-08 16:08:09 +00:00
1f5d13ab9f [New Model]: jinaai/jina-embeddings-v3 (#16120) 2025-04-08 08:39:12 -07:00
90cb44eb02 Update to transformers==4.51.1 (#16257)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-08 06:53:39 -07:00
e11880deea [Bugfix] Remove triton do_bench fast_flush arg (#16256)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-04-08 13:51:06 +00:00
9351f91be9 [BugFix][ROCm] Fix GGUF MoE Dispatch Block_Dim for ROCm (#16247)
Signed-off-by: Tianyuan Wu <Tianyuan.Wu@amd.com>
2025-04-08 05:10:26 -07:00
5a1e1c8353 [Model] use AutoWeightsLoader for phimoe,qwen2_moe,qwen3_moe (#16203)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-08 04:05:47 -07:00
69ecaa7c79 [Misc] Add warning for multimodal data in LLM.beam_search (#16241)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
2025-04-08 04:05:27 -07:00
7f00899ff7 [Misc] format and refactor some examples (#16252)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-08 10:42:32 +00:00
995e3d1f41 [Docs] Add Slides from Singapore Meetup (#16213)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-04-08 07:20:22 +00:00
b4ac449a83 [Misc] Merge the logs of pp layers partitions (#16225)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-04-08 00:18:15 -07:00
8e5314a468 [V1] Add disable_chunked_mm_input arg to disable partial mm input prefill (#15837)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-07 23:24:07 -07:00
87918e40c4 [torch.compile][TPU] Make @support_torch_compile work for XLA backend (#15782)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-04-08 14:23:53 +08:00
f6b32efb7f [Bugfix] Fix and reorganize broken GGUF tests and bump gguf version (#16194)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-08 13:38:13 +08:00
b99733d092 [Bugfix] Do not skip "empty" parts of chats that are parsable (#16219)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-04-08 05:14:15 +00:00
05a015d6a5 Add warning for Attention backends that do not support irope yet (#16212) 2025-04-08 03:59:26 +00:00
ad971af8c7 [Bugfix] fix use-ep bug to enable ep by dp/tp size > 1 (#16161) 2025-04-07 20:48:47 -07:00
f2ebb6f541 [V1] Scatter and gather placeholders in the model runner (#16076)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
2025-04-08 10:43:41 +08:00
1d01211264 Update BASE_IMAGE to 2.22 release of Neuron (#16218) 2025-04-07 19:11:18 -07:00
f94ab12f79 [Misc] Update compressed-tensors to version 0.9.3 (#16196)
Signed-off-by: Miles Williams <42222518+mlsw@users.noreply.github.com>
2025-04-07 19:09:06 -07:00
a865bc1ca6 [core] do not send error across process (#16174)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-04-07 19:09:03 -07:00
21802c4b6d [ROCm][Bugfix][FP8] Make fp8 quant respect fused modules mapping (#16031)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-04-07 21:28:14 -04:00
652907b354 Torchao (#14231)
Signed-off-by: drisspg <drisspguessous@gmail.com>
2025-04-07 19:39:28 -04:00
24f1c01e0f [Bugfix][V0] XGrammar structured output supports Enum (#15878)
Signed-off-by: Leon Seidel <leon.seidel@fau.de>
2025-04-07 22:38:25 +00:00
fad6e2538e [Misc] add description attribute in CLI (#15921)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-07 22:30:35 +00:00
7f6d47c1a2 [V1][BugFix] Exit properly if engine core fails during startup (#16137)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-04-07 15:30:15 -07:00
3147586ebd [Bugfix] Fix guidance backend for Qwen models (#16210)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-04-07 22:15:43 +00:00
ed636d99ca [Misc] Move Llama 4 projector call into encoder execution (#16201) 2025-04-07 14:02:05 -07:00
090c856d76 [Misc] Human-readable max-model-len cli arg (#16181)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-04-07 14:40:58 -04:00
ad434d4cfe Print the warning only once (#16193)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-04-07 18:30:06 +00:00
66d433b94f [V1] Revert the default max_num_seqs to V0 values for most hardware (#16158)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-07 13:54:36 -04:00
027b204ff1 [Bugfix] Re-enable support for ChatGLMForConditionalGeneration (#16187)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-07 23:15:58 +08:00
55dcce91df Upstream Llama4 Support to Main (#16113)
Signed-off-by: Aston Zhang <22279212+astonzhang@users.noreply.github.com>
Signed-off-by: Chris Thi <chris.c.thi@gmail.com>
Signed-off-by: drisspg <drisspguessous@gmail.com>
Signed-off-by: Jon Swenson <jmswen@gmail.com>
Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
Signed-off-by: Lu Fang <fanglu@meta.com>
Signed-off-by: Xiaodong Wang <xdwang@meta.com>
Signed-off-by: Yang Chen <yangche@fb.com>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Signed-off-by: Zijing Liu <liuzijing2014@gmail.com>
Signed-off-by: Lu Fang <lufang@fb.com>
Signed-off-by: Lu Fang <fanglu@fb.com>
Signed-off-by: Lucia Fang <fanglu@fb.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Lu Fang <fanglu@fb.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-07 08:06:27 -07:00
8017c8db7f [Doc]Update image to latest version (#16186)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-04-07 14:17:39 +00:00
dc3529dbf6 [Misc] improve example mlpspeculator and llm_engine_example (#16175)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-07 11:53:52 +00:00
7699258ef0 [Model] Add Qwen3 and Qwen3MoE (#15289)
Signed-off-by: YamPengLi <yampayne.lyp@alibaba-inc.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-04-07 04:06:41 -07:00
e9ba99f296 [V1][Structured Output] Add supports_structured_output() method to Platform (#16148)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-07 11:06:24 +00:00
7c80368710 [VLM] Florence-2 supports online serving (#16164)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-07 04:04:02 -07:00
95d63f38c0 doc: fix some typos in doc (#16154)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-07 05:32:06 +00:00
bb8dab821e [CI] Set max transformers version for Ultravox model test (#16149)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-04-07 04:37:58 +00:00
fc0f87768a [Bugfix] Make dummy encoder prompt padding alternative and add missing warnings (#16129)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-07 04:07:15 +00:00
0a57386721 [Misc] Update Mistral-3.1 example (#16147)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-04-07 03:57:37 +00:00
3749e28774 [V1][Minor] Minor simplification for get_computed_blocks (#16139)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-04-06 20:38:12 -07:00
86fc2321ff [Metrics] Add bucket for request_latency, time_to_first_token and time_per_output_token (#15202)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
2025-04-06 20:34:51 -07:00
2549c0dfef Fix requires-python (#16132) 2025-04-06 19:22:25 -07:00
b10e519895 [V1][Minor] Optimize get_cached_block (#16135) 2025-04-06 20:48:14 +00:00
9bde5ba127 [TPU] Update PyTorch/XLA (#16130)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-04-06 18:25:55 +00:00
72c8f1ad04 [Misc] update requires-python in pyproject.toml (#16116)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-06 14:56:34 +00:00
da224daaa9 [Bugfix] add hf_token to EngineArgs (#16093)
Signed-off-by: paolovic <paul-philipp.luley@uzh.ch>
Co-authored-by: paolovic <paul-philipp.luley@uzh.ch>
2025-04-06 14:47:33 +00:00
3a100b9278 [Bugfix] LoRA : Fix the order in which the kernels process LoRAs (#16040)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-04-06 14:04:50 +00:00
242a637aea [Model] use AutoWeightsLoader for stablelm,starcoder2,zamba2 (#16103)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-04-06 05:52:01 -07:00
c2a9671510 [Misc] Improve model redirect to accept json dictionary (#16119)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-04-06 05:51:45 -07:00
d5ae4f7f42 [Doc][Bugfix] Add missing EOF in k8s deploy doc (#16025) 2025-04-06 12:10:57 +00:00
b6c502a150 [Misc] refactor example eagle (#16100)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-06 09:42:48 +00:00
9ca710e525 [CI][V1] Fix passing tokenizer as kwarg to validate_guidance_grammar (#16117)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-04-06 16:18:00 +08:00
eb07c8cb5b [Frontend] Fix typo in tool chat templates for llama3.2 and toolace (#14501)
Signed-off-by: Ben Jackson <ben@ben.com>
2025-04-06 07:44:36 +00:00
ba10801961 [Benchmark] Add sampling parameters to benchmark_serving. (#16022)
Signed-off-by: Hyesoo Yang <hyeygit@gmail.com>
2025-04-06 12:30:35 +08:00
620fc2d09e [Model] fix model testing for TeleChat2ForCausalLM and V0 llama4 (#16112)
Signed-off-by: Lu Fang <fanglu@fb.com>
2025-04-05 21:23:40 -07:00
29283eaa7e [Model] use AutoWeightsLoader for phi, gemma, deepseek (#16088)
Signed-off-by: Jonghyun Choe <andy.choe729@gmail.com>
2025-04-05 20:34:38 -07:00
2fa66ef713 [Bugfix] fix use_atomic_add support of marlin kernel when using v1 engine (#15946)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-04-05 20:04:22 -07:00
13affc432d [Misc] Remove redundant code (#16098)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-04-05 20:03:50 -07:00
d8f094a92a [Misc] format output for encoder_decoder.py (#16095)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-05 19:57:18 -07:00
97ae6d777f Fix some capitalisations in generated examples doc titles (#16094)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-04-05 13:44:03 +00:00
6baeee70d1 Revert "doc: add info for macos clang errors (#16049)" (#16091)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-05 11:51:51 +00:00
d2517a4939 [doc] fix 404 (#16082)
Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
2025-04-05 11:39:18 +00:00
6342adc438 fix: support clang17 for macos and fix the real libomp (#16086)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-04-05 11:00:12 +00:00
0adba91547 [CI] Fix benchmark script level (#16089) 2025-04-05 03:36:01 -07:00
4285e423a6 [Misc] Auto detect bitsandbytes pre-quantized models (#16027)
Signed-off-by: Tristan Leclercq <tristanleclercq@gmail.com>
2025-04-04 23:30:45 -07:00
244d5cc749 update
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-26 01:46:13 +00:00
816693fd00 update
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-26 01:45:55 +00:00
7c16128106 updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-26 01:45:24 +00:00
7bb88b2edc updated
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-26 01:44:11 +00:00
ae4f3e2aeb update
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
2025-03-26 01:41:36 +00:00
638 changed files with 41406 additions and 11977 deletions

View File

@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
+model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.30
+  - name: "exact_match,flexible-extract"
+    value: 0.465
+limit: 1319
+num_fewshot: 5

View File

@@ -4,7 +4,7 @@ Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
-Minitron-4B-Base-FP8.yaml
+Qwen1.5-MoE-W4A16-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
 Qwen2-1.5B-Instruct-FP8W8.yaml
 Meta-Llama-3-8B-QQQ.yaml

View File

@@ -16,7 +16,7 @@ import numpy
 import pytest
 import yaml
 
-RTOL = 0.05
+RTOL = 0.08
 TEST_DATA_FILE = os.environ.get(
     "LM_EVAL_TEST_DATA_FILE",
     ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

View File

@@ -86,3 +86,18 @@ steps:
       - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
     env:
       DOCKER_BUILDKIT: "1"
+
+  - block: "Build Neuron release image"
+    key: block-neuron-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish Neuron release image"
+    depends_on: block-neuron-release-image-build
+    agents:
+      queue: neuron-postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"

View File

@@ -98,6 +98,13 @@ if [[ $commands == *" kernels "* ]]; then
     --ignore=kernels/test_machete_mm.py \
     --ignore=kernels/test_mha_attn.py \
     --ignore=kernels/test_block_fp8.py \
+    --ignore=kernels/test_cutlass_moe.py \
+    --ignore=kernels/test_mamba_ssm_ssd.py \
+    --ignore=kernels/test_attention.py \
+    --ignore=kernels/test_block_int8.py \
+    --ignore=kernels/test_fused_quant_layernorm.py \
+    --ignore=kernels/test_int8_kernel.py \
+    --ignore=kernels/test_triton_moe_ptpc_fp8.py \
     --ignore=kernels/test_permute_cols.py"
 fi

View File

@@ -5,10 +5,34 @@
 set -ex
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
+remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-docker build -t cpu-test -f docker/Dockerfile.ppc64le .
+podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
+
+# Run the image
+podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
+
+function cpu_tests() {
+
+  # offline inference
+  podman exec cpu-test-ubi9-ppc bash -c "
+    set -e
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+  # Run basic model test
+  podman exec cpu-test-ubi9-ppc bash -c "
+    set -e
+    pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
+    pip install sentence-transformers datamodel_code_generator
+    pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
+    pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
+    pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
+}
+
+# All of CPU tests are expected to be finished less than 40 mins.
+export -f cpu_tests
+timeout 40m bash -c cpu_tests

View File

@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+docker build -t cpu-test -f docker/Dockerfile.s390x .

View File

@@ -17,10 +17,12 @@ source /etc/environment
 docker run --privileged --net host --shm-size=16G -it \
     -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
     vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
-    && python3 -m pip install pytest \
+    && python3 -m pip install pytest pytest-asyncio tpu-info \
     && python3 -m pip install lm_eval[api]==0.4.4 \
     && export VLLM_USE_V1=1 \
     && export VLLM_XLA_CHECK_RECOMPILATION=1 \
+    && echo HARDWARE \
+    && tpu-info \
     && echo TEST_0 \
     && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
     && echo TEST_1 \
@@ -40,7 +42,11 @@ docker run --privileged --net host --shm-size=16G -it \
     && echo TEST_8 \
     && pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
     && echo TEST_9 \
-    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
+    && echo TEST_10 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
+    && echo TEST_11 \
+    && pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
 
 # TODO: This test fails because it uses RANDOM_SEED sampling

View File

@@ -5,8 +5,8 @@
 set -ex
 set -o pipefail
 
-# cd into parent directory of this file
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
+# cd 2 levels into the working directory
+cd "$(dirname "${BASH_SOURCE[0]}")/../.."
 
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)

View File

@@ -8,6 +8,7 @@
 # Documentation
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
 # optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
 # command(str): the single command to run for tests. incompatible with commands.
@@ -70,6 +71,7 @@ steps:
 - label: Basic Correctness Test # 30min
   #mirror_hardwares: [amd]
   fast_check: true
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_basic_correctness
@@ -104,6 +106,7 @@ steps:
 - label: Entrypoints Test # 40min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
+  torch_nightly: true
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -118,7 +121,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
   - pytest -v -s entrypoints/test_chat_utils.py
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
@@ -163,11 +166,6 @@ steps:
   - tests/tracing
   commands:
   - pytest -v -s metrics
-  - "pip install \
-    'opentelemetry-sdk>=1.26.0,<1.27.0' \
-    'opentelemetry-api>=1.26.0,<1.27.0' \
-    'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
-    'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
   - pytest -v -s tracing
 
 ##### fast check tests #####
@@ -210,6 +208,8 @@ steps:
   - pytest -v -s v1/sample
   - pytest -v -s v1/worker
   - pytest -v -s v1/structured_output
+  - pytest -v -s v1/spec_decode
+  - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s v1/test_stats.py
   - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py
@@ -292,6 +292,14 @@ steps:
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
   parallelism: 4
 
+- label: PyTorch Compilation Unit Tests
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_pass_manager.py
+  - pytest -v -s compile/test_fusion.py
+
 - label: PyTorch Fullgraph Smoke Test # 9min
   source_file_dependencies:
   - vllm/
@@ -301,7 +309,6 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
-  - pytest -v -s compile/test_pass_manager.py
 
 - label: PyTorch Fullgraph Test # 18min
   source_file_dependencies:
@@ -310,15 +317,46 @@ steps:
   commands:
   - pytest -v -s compile/test_full_graph.py
 
-- label: Kernels Test %N # 1h each
-  # mirror_hardwares: [amd]
+- label: Kernels Core Operation Test
   source_file_dependencies:
   - csrc/
-  - vllm/attention
-  - tests/kernels
+  - tests/kernels/core
   commands:
-  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+  - pytest -v -s kernels/core
+
+- label: Kernels Attention Test %N
+  source_file_dependencies:
+  - csrc/attention/
+  - vllm/attention
+  - vllm/v1/attention
+  - tests/kernels/attention
+  commands:
+  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels Quantization Test %N
+  source_file_dependencies:
+  - csrc/quantization/
+  - vllm/model_executor/layers/quantization
+  - tests/kernels/quantization
+  commands:
+  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 2
+
+- label: Kernels MoE Test
+  source_file_dependencies:
+  - csrc/moe/
+  - tests/kernels/moe
+  - vllm/model_executor/layers/fused_moe/
+  commands:
+  - pytest -v -s kernels/moe
+
+- label: Kernels Mamba Test
+  source_file_dependencies:
+  - csrc/mamba/
+  - tests/kernels/mamba
+  commands:
+  - pytest -v -s kernels/mamba
 
 - label: Tensorizer Test # 11min
   # mirror_hardwares: [amd]
@@ -339,6 +377,13 @@ steps:
   commands:
   - bash scripts/run-benchmarks.sh
 
+- label: Benchmarks CLI Test # 10min
+  source_file_dependencies:
+  - vllm/
+  - tests/benchmarks/
+  commands:
+  - pytest -v -s benchmarks/
+
 - label: Quantization Test # 33min
   source_file_dependencies:
   - csrc/
@@ -376,8 +421,10 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/tool_use
+  - tests/mistral_tool_use
   commands:
   - pytest -v -s tool_use
+  - pytest -v -s mistral_tool_use
 
 ##### models test #####
@@ -389,7 +436,9 @@ steps:
   - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
   # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
 
 - label: Language Models Test (Standard) # 32min
   #mirror_hardwares: [amd]
@@ -399,6 +448,8 @@ steps:
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
   commands:
+  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+  - pip install causal-conv1d
   - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/language -m core_model
@@ -410,6 +461,8 @@ steps:
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
   commands:
+  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+  - pip install causal-conv1d
   - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/language -m 'not core_model'
@@ -426,7 +479,7 @@ steps:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/multimodal
   - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+  - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
   - pytest -v -s models/embedding/vision_language -m core_model
   - pytest -v -s models/encoder_decoder/audio_language -m core_model
   - pytest -v -s models/encoder_decoder/language -m core_model
@@ -445,10 +498,7 @@ steps:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
   - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
   - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-  # HACK - run phi3v tests separately to sidestep this transformers bug
-  # https://github.com/huggingface/transformers/issues/34307
-  - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
   - pytest -v -s models/embedding/vision_language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/language -m 'not core_model'
   - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
@@ -538,6 +588,7 @@ steps:
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 
 - label: Plugin Tests (2 GPUs) # 40min
   working_dir: "/vllm-workspace/tests"

View File

@@ -14,7 +14,7 @@ body:
       description: |
         Please run the following and paste the output below.
         ```sh
-        wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+        wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
         # For security purposes, please feel free to check the contents of collect_env.py before running it.
         python collect_env.py
         ```

View File

@@ -14,7 +14,7 @@ body:
      description: |
        Please run the following and paste the output below.
        ```sh
-        wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+        wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
        # For security purposes, please feel free to check the contents of collect_env.py before running it.
        python collect_env.py
        ```

View File

@@ -14,7 +14,7 @@ body:
      description: |
        Please run the following and paste the output below.
        ```sh
-        wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+        wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
        # For security purposes, please feel free to check the contents of collect_env.py before running it.
        python collect_env.py
        ```

View File

@@ -9,7 +9,7 @@ body:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
-      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
+      #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/index.html first to understand how to add a new model.
 
 - type: textarea
   attributes:
     label: The model to consider.

View File

@@ -35,7 +35,7 @@ body:
      description: |
        Please run the following and paste the output below.
        ```sh
-        wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
+        wget https://raw.githubusercontent.com/vllm-project/vllm/main/vllm/collect_env.py
        # For security purposes, please feel free to check the contents of collect_env.py before running it.
        python collect_env.py
        ```

View File

@@ -3,4 +3,4 @@ FILL IN THE PR DESCRIPTION HERE
 FIX #xxxx (*link existing issues this PR will resolve*)
 
 <!--- pyml disable-next-line no-emphasis-as-heading -->
-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)

12
.github/mergify.yml vendored
View File

@@ -55,11 +55,19 @@ pull_request_rules:
   description: Automatically apply structured-output label
   conditions:
     - or:
+      - files~=^benchmarks/structured_schemas/
+      - files=benchmarks/benchmark_serving_structured_output.py
+      - files=benchmarks/run_structured_output_benchmark.sh
+      - files=docs/source/features/structured_outputs.md
+      - files=examples/offline_inference/structured_outputs.py
+      - files=examples/online_serving/openai_chat_completion_structured_outputs.py
+      - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
       - files~=^vllm/model_executor/guided_decoding/
       - files=tests/model_executor/test_guided_processors.py
       - files=tests/entrypoints/llm/test_guided_generate.py
-      - files=benchmarks/benchmark_serving_guided.py
-      - files=benchmarks/benchmark_guided.py
+      - files~=^tests/v1/structured_output/
+      - files=tests/v1/entrypoints/llm/test_guided_generate.py
+      - files~=^vllm/v1/structured_output/
   actions:
     label:
       add:
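
In the rules above, `files=` conditions match a changed path exactly, while `files~=` conditions are regular-expression matches, so a single rule can cover a whole directory. A small illustration, using Python's `re` as a stand-in for Mergify's matcher:

```python
import re

changed_file = "tests/v1/structured_output/test_backend.py"

# files=PATH compares the whole path literally.
exact_hit = changed_file == "tests/v1/entrypoints/llm/test_guided_generate.py"  # False

# files~=PATTERN applies a regex, matching every file under the directory.
regex_hit = re.search(r"^tests/v1/structured_output/", changed_file) is not None  # True
```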

3
.gitignore vendored
View File

@@ -203,3 +203,6 @@ benchmarks/**/*.json
 # Linting
 actionlint
 shellcheck*/
+
+# Ingore moe/marlin_moe gen code
+csrc/moe/marlin_moe_wna16/kernel_*

View File

@@ -11,7 +11,6 @@ repos:
     hooks:
     - id: yapf
       args: [--in-place, --verbose]
-      additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
 - repo: https://github.com/astral-sh/ruff-pre-commit
   rev: v0.9.3
   hooks:
@@ -122,6 +121,12 @@ repos:
       language: system
       always_run: true
      pass_filenames: false
+    - id: update-dockerfile-graph
+      name: Update Dockerfile dependency graph
+      entry: tools/update-dockerfile-graph.sh
+      language: script
+      files: ^docker/Dockerfile$
+      pass_filenames: false
     # Keep `suggestion` last
     - id: suggestion
       name: Suggestion

View File

@@ -230,6 +230,7 @@ set(VLLM_EXT_SRC
   "csrc/cache_kernels.cu"
   "csrc/attention/paged_attention_v1.cu"
   "csrc/attention/paged_attention_v2.cu"
+  "csrc/attention/merge_attn_states.cu"
   "csrc/pos_encoding_kernels.cu"
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
@@ -608,21 +609,51 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
 
   cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
-    set(MARLIN_MOE_SRC
-        "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h"
-        "csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu"
-        "csrc/moe/marlin_moe_ops.cu")
+
+    #
+    # For the Marlin MOE kernels we automatically generate sources for various
+    # preselected input type pairs and schedules.
+    # Generate sources:
+    set(MOE_MARLIN_GEN_SCRIPT
+      ${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
+    file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+
+    message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
+    message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+
+    if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
+        OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+      execute_process(
+        COMMAND ${CMAKE_COMMAND} -E env
+        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+        ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+        RESULT_VARIABLE moe_marlin_generation_result
+        OUTPUT_VARIABLE moe_marlin_generation_output
+        OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+        ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
+      )
+
+      if (NOT moe_marlin_generation_result EQUAL 0)
+        message(FATAL_ERROR "Marlin MOE generation failed."
+                            " Result: \"${moe_marlin_generation_result}\""
+                            "\nCheck the log for details: "
+                            "${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
+      else()
+        set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+            CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
+        message(STATUS "Marlin MOE generation completed successfully.")
+      endif()
+    else()
+      message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
+    endif()
+
+    file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
 
     set_gencode_flags_for_srcs(
-      SRCS "${MARLIN_MOE_SRC}"
+      SRCS "${MOE_WNAA16_MARLIN_SRC}"
       CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
-    list(APPEND VLLM_MOE_EXT_SRC "${MARLIN_MOE_SRC}")
+    list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
 
     message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
   else()
     message(STATUS "Not building Marlin MOE kernels as no compatible archs found"
@@ -647,6 +678,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   #
   set(VLLM_ROCM_EXT_SRC
     "csrc/rocm/torch_bindings.cpp"
+    "csrc/rocm/skinny_gemms.cu"
     "csrc/rocm/attention.cu")
 
   define_gpu_extension_target(
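
The CMake hunk above replaces a hand-maintained source list with generated kernels, re-running the generator only when the MD5 of `generate_kernels.py` differs from the hash cached by the previous configure. A rough Python sketch of the same skip-if-unchanged pattern (the stamp-file path is hypothetical):

```python
import hashlib
import json
import pathlib
import subprocess

GEN_SCRIPT = pathlib.Path("csrc/moe/marlin_moe_wna16/generate_kernels.py")
STAMP = pathlib.Path("build/.marlin_moe_gen_stamp.json")  # hypothetical cache

def regenerate_if_changed() -> None:
    # Hash the generator script, mirroring file(MD5 ...) in the hunk above.
    current = hashlib.md5(GEN_SCRIPT.read_bytes()).hexdigest()
    if STAMP.exists() and json.loads(STAMP.read_text()).get("hash") == current:
        print("Generation script has not changed, skipping generation.")
        return
    subprocess.run(["python", str(GEN_SCRIPT)], check=True)
    STAMP.parent.mkdir(parents=True, exist_ok=True)
    STAMP.write_text(json.dumps({"hash": current}))
```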

View File

@@ -10,16 +10,13 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 
 <p align="center">
-| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 
 ---
 
-[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
-
----
-
 *Latest News* 🔥
+- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).

View File

@@ -204,6 +204,24 @@ python3 vllm/benchmarks/benchmark_serving.py \
     --seed 42
 ```
 
+### Running With Sampling Parameters
+When using OpenAI-compatible backends such as `vllm`, optional sampling
+parameters can be specified. Example client command:
+
+```bash
+python3 vllm/benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --endpoint /v1/completions \
+    --dataset-name sharegpt \
+    --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --top-k 10 \
+    --top-p 0.9 \
+    --temperature 0.5 \
+    --num-prompts 10
+```
+
 ---
 
 ## Example - Offline Throughput Benchmark

View File

@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import io
 import json
 import os
 import sys
@@ -32,6 +33,7 @@ class RequestFuncInput:
     extra_body: Optional[dict] = None
     multi_modal_content: Optional[dict] = None
     ignore_eos: bool = False
+    language: Optional[str] = None
 
 
 @dataclass
@@ -436,6 +438,110 @@ async def async_request_openai_chat_completions(
     return output
 
 
+async def async_request_openai_audio(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    # Lazy import without PlaceholderModule to avoid vllm dep.
+    import soundfile
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        ("transcriptions", "translations"
+         )), "OpenAI Chat Completions API URL must end with 'transcriptions' "
+    "or `translations`."
+
+    async with aiohttp.ClientSession(trust_env=True,
+                                     timeout=AIOHTTP_TIMEOUT) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        payload = {
+            "model": request_func_input.model_name \
+                if request_func_input.model_name else request_func_input.model,
+            "temperature": 0.0,
+            "max_completion_tokens": request_func_input.output_len,
+            "stream": True,
+            "language": "en",
+            # Flattened due to multipart/form-data
+            "stream_include_usage": True,
+            "stream_continuous_usage_stats": True
+        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        # Send audio file
+        def to_bytes(y, sr):
+            buffer = io.BytesIO()
+            soundfile.write(buffer, y, sr, format="WAV")
+            buffer.seek(0)
+            return buffer
+
+        with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
+            form = aiohttp.FormData()
+            form.add_field('file', f, content_type='audio/wav')
+            for key, value in payload.items():
+                form.add_field(key, str(value))
+
+            output = RequestFuncOutput()
+            output.prompt_len = request_func_input.prompt_len
+
+            generated_text = ""
+            ttft = 0.0
+            st = time.perf_counter()
+            most_recent_timestamp = st
+            try:
+                async with session.post(url=api_url,
+                                        data=form,
+                                        headers=headers) as response:
+                    if response.status == 200:
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            chunk = chunk_bytes.decode("utf-8").removeprefix(
+                                "data: ")
+                            if chunk != "[DONE]":
+                                timestamp = time.perf_counter()
+                                data = json.loads(chunk)
+
+                                if choices := data.get("choices"):
+                                    content = choices[0]["delta"].get(
+                                        "content")
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(
+                                            timestamp - most_recent_timestamp)
+
+                                    generated_text += content or ""
+                                elif usage := data.get("usage"):
+                                    output.output_tokens = usage.get(
+                                        "completion_tokens")
+
+                                most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = most_recent_timestamp - st
+                    else:
+                        output.error = response.reason or ""
+                        output.success = False
+            except Exception:
+                output.success = False
+                exc_info = sys.exc_info()
+                output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
         from modelscope import snapshot_download
@@ -493,7 +599,14 @@ ASYNC_REQUEST_FUNCS = {
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
     "openai-chat": async_request_openai_chat_completions,
+    "openai-audio": async_request_openai_audio,
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
 }
+
+OPENAI_COMPATIBLE_BACKENDS = [
+    k for k, v in ASYNC_REQUEST_FUNCS.items()
+    if v in (async_request_openai_completions,
+             async_request_openai_chat_completions)
+]
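
The new `OPENAI_COMPATIBLE_BACKENDS` list is derived from the request-function registry rather than hard-coded, so backends such as `scalellm` and `sglang` that reuse the OpenAI completions function are included automatically. A sketch of how a caller might use it, mirroring the gating added in `benchmark_serving.py` further below:

```python
from backend_request_func import ASYNC_REQUEST_FUNCS, OPENAI_COMPATIBLE_BACKENDS

backend = "sglang"
sampling_params = {"top_p": 0.9, "temperature": 0.5}

# Reject sampling parameters for backends whose request function is not
# OpenAI-compatible (mirrors the check in benchmark_serving.py).
if sampling_params and backend not in OPENAI_COMPATIBLE_BACKENDS:
    raise ValueError("Sampling parameters are only supported by "
                     "openai-compatible backends.")

request_func = ASYNC_REQUEST_FUNCS[backend]  # coroutine issuing one request
```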

View File

@@ -64,6 +64,7 @@ class SampleRequest:
 class BenchmarkDataset(ABC):
     DEFAULT_SEED = 0
+    IS_MULTIMODAL = False
 
     def __init__(
         self,
@@ -288,7 +289,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
 class RandomDataset(BenchmarkDataset):
     # Default values copied from benchmark_serving.py for the random dataset.
     DEFAULT_PREFIX_LEN = 0
-    DEFAULT_RANGE_RATIO = 1.0
+    DEFAULT_RANGE_RATIO = 0.0
     DEFAULT_INPUT_LEN = 1024
     DEFAULT_OUTPUT_LEN = 128
 
@@ -308,19 +309,32 @@ class RandomDataset(BenchmarkDataset):
         output_len: int = DEFAULT_OUTPUT_LEN,
         **kwargs,
     ) -> list[SampleRequest]:
+        # Enforce range_ratio < 1
+        assert range_ratio < 1.0, (
+            "random_range_ratio must be < 1.0 to ensure a valid sampling range"
+        )
+
         vocab_size = tokenizer.vocab_size
 
         prefix_token_ids = (np.random.randint(
             0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
 
-        input_low = int(input_len * range_ratio)
-        output_low = int(output_len * range_ratio)
+        # New sampling logic: [X * (1 - b), X * (1 + b)]
+        input_low = int(input_len * (1 - range_ratio))
+        input_high = int(input_len * (1 + range_ratio))
+        output_low = int(output_len * (1 - range_ratio))
+        output_high = int(output_len * (1 + range_ratio))
+
+        # Add logging for debugging
+        logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
+        logger.info("Sampling output_len from [%s, %s]", output_low,
+                    output_high)
 
         input_lens = np.random.randint(input_low,
-                                       input_len + 1,
+                                       input_high + 1,
                                        size=num_requests)
         output_lens = np.random.randint(output_low,
-                                        output_len + 1,
+                                        output_high + 1,
                                         size=num_requests)
         offsets = np.random.randint(0, vocab_size, size=num_requests)
@@ -472,11 +486,11 @@ class SonnetDataset(BenchmarkDataset):
         # Determine how many poem lines to use.
         num_input_lines = round((input_len - base_offset) / avg_len)
-        num_prefix_lines = round((prefix_len - base_offset) / avg_len)
+        num_prefix_lines = max(round((prefix_len - base_offset) / avg_len), 0)
         prefix_lines = self.data[:num_prefix_lines]
 
         samples = []
-        for _ in range(num_requests):
+        while len(samples) < num_requests:
             extra_lines = random.choices(self.data,
                                          k=num_input_lines - num_prefix_lines)
             prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
@@ -484,13 +498,14 @@ class SonnetDataset(BenchmarkDataset):
             prompt_formatted = tokenizer.apply_chat_template(
                 msg, add_generation_prompt=True, tokenize=False)
             prompt_len = len(tokenizer(prompt_formatted).input_ids)
-            samples.append(
-                SampleRequest(
-                    prompt=prompt_formatted
-                    if return_prompt_formatted else prompt,
-                    prompt_len=prompt_len,
-                    expected_output_len=output_len,
-                ))
+            if prompt_len <= input_len:
+                samples.append(
+                    SampleRequest(
+                        prompt=prompt_formatted
+                        if return_prompt_formatted else prompt,
+                        prompt_len=prompt_len,
+                        expected_output_len=output_len,
+                    ))
         return samples
@@ -607,6 +622,7 @@ class ConversationDataset(HuggingFaceDataset):
     SUPPORTED_DATASET_PATHS = {
         'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
     }
+    IS_MULTIMODAL = True
 
     def sample(self,
                tokenizer: PreTrainedTokenizerBase,
@@ -671,6 +687,7 @@ class VisionArenaDataset(HuggingFaceDataset):
         "lmarena-ai/vision-arena-bench-v0.1":
         lambda x: x["turns"][0][0]["content"]
     }
+    IS_MULTIMODAL = True
 
     def sample(
         self,
@@ -801,3 +818,80 @@ class AIMODataset(HuggingFaceDataset):
                 ))
         self.maybe_oversample_requests(sampled_requests, num_requests)
         return sampled_requests
+
+
+# -----------------------------------------------------------------------------
+# ASR Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class ASRDataset(HuggingFaceDataset):
+    """
+    Dataset class for processing a ASR dataset for transcription.
+    Tested on the following set:
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | Dataset        | Domain                                 | Speaking Style           | hf-subset                   |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    | TED-LIUM       | TED talks                              | Oratory                  | release1, release2, release3|
+    |                |                                        |                          | release3-speaker-adaptation |
+    | VoxPopuli      | European Parliament                    | Oratory                  | en, de, it, fr, ...         |
+    | LibriSpeech    | Audiobook                              | Narrated                 | "LIUM/tedlium"              |
+    | GigaSpeech     | Audiobook, podcast, YouTube            | Narrated, spontaneous    | xs, s, m, l, xl, dev, test  |
+    | SPGISpeech     | Financial meetings                     | Oratory, spontaneous     | S, M, L, dev, test          |
+    | AMI            | Meetings                               | Spontaneous              | ihm, sdm                    |
+    +----------------+----------------------------------------+--------------------------+-----------------------------+
+    """  # noqa: E501
+    SUPPORTED_DATASET_PATHS = {
+        "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium",
+        "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech"
+    }
+    DEFAULT_OUTPUT_LEN = 128
+    IS_MULTIMODAL = True
+
+    # TODO Whisper-specific. Abstract interface when more models are supported.
+    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\
+                             "<|notimestamps|>"
+    skip_long_audios: bool = True
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        **kwargs,
+    ) -> list:
+        import librosa
+        output_len = (output_len
+                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
+        prompt_len = len(tokenizer(prompt).input_ids)
+        sampled_requests = []
+        skipped = 0
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            audio = item["audio"]
+            y, sr = audio["array"], audio["sampling_rate"]
+            duration_s = librosa.get_duration(y=y, sr=sr)
+            # Whisper max supported duration
+            if self.skip_long_audios and duration_s > 30:
+                skipped += 1
+                continue
+
+            mm_content = {"audio": (y, sr)}
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                    multi_modal_data=mm_content,
+                ))
+        if skipped:
+            logger.warning("%d samples discarded from dataset due to" \
+                           " their length being greater than" \
+                           " what Whisper supports.", skipped)
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+        return sampled_requests
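
The `RandomDataset` hunk above changes length sampling from the one-sided range `[X * range_ratio, X]` to the symmetric range `[X * (1 - b), X * (1 + b)]`, and the new default `b = 0.0` produces fixed-length requests. A self-contained sketch of the new behavior:

```python
import numpy as np

def sample_lengths(base_len: int, range_ratio: float, n: int) -> np.ndarray:
    """Draw n lengths from [base_len*(1-b), base_len*(1+b)], as above."""
    assert 0.0 <= range_ratio < 1.0, "range_ratio must be in [0, 1)"
    low = int(base_len * (1 - range_ratio))
    high = int(base_len * (1 + range_ratio))
    return np.random.randint(low, high + 1, size=n)

print(sample_lengths(1024, 0.0, 3))   # always [1024 1024 1024]
print(sample_lengths(1024, 0.25, 3))  # values within [768, 1280]
```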

View File

@ -34,7 +34,8 @@ from datetime import datetime
from typing import Any, Optional from typing import Any, Optional
import numpy as np import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, from backend_request_func import (ASYNC_REQUEST_FUNCS,
OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
RequestFuncOutput) RequestFuncOutput)
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
@ -49,7 +50,7 @@ try:
except ImportError: except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_dataset import (AIMODataset, BurstGPTDataset, from benchmark_dataset import (AIMODataset, ASRDataset, BurstGPTDataset,
ConversationDataset, HuggingFaceDataset, ConversationDataset, HuggingFaceDataset,
InstructCoderDataset, RandomDataset, InstructCoderDataset, RandomDataset,
SampleRequest, ShareGPTDataset, SonnetDataset, SampleRequest, ShareGPTDataset, SonnetDataset,
@ -155,7 +156,7 @@ def calculate_metrics(
if outputs[i].success: if outputs[i].success:
output_len = outputs[i].output_tokens output_len = outputs[i].output_tokens
if output_len is None: if not output_len:
# We use the tokenizer to count the number of output tokens # We use the tokenizer to count the number of output tokens
# for some serving backends instead of looking at # for some serving backends instead of looking at
# len(outputs[i].itl) since multiple output tokens may be # len(outputs[i].itl) since multiple output tokens may be
@ -260,6 +261,7 @@ async def benchmark(
goodput_config_dict: dict[str, float], goodput_config_dict: dict[str, float],
max_concurrency: Optional[int], max_concurrency: Optional[int],
lora_modules: Optional[Iterable[str]], lora_modules: Optional[Iterable[str]],
extra_body: Optional[dict],
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend] request_func = ASYNC_REQUEST_FUNCS[backend]
@ -272,10 +274,6 @@ async def benchmark(
input_requests[0].expected_output_len, \ input_requests[0].expected_output_len, \
input_requests[0].multi_modal_data input_requests[0].multi_modal_data
if backend != "openai-chat" and test_mm_content is not None:
# multi-modal benchmark is only available on OpenAI Chat backend.
raise ValueError(
"Multi-modal content is only supported on 'openai-chat' backend.")
assert test_mm_content is None or isinstance(test_mm_content, dict) assert test_mm_content is None or isinstance(test_mm_content, dict)
test_input = RequestFuncInput( test_input = RequestFuncInput(
model=model_id, model=model_id,
@ -287,6 +285,7 @@ async def benchmark(
logprobs=logprobs, logprobs=logprobs,
multi_modal_content=test_mm_content, multi_modal_content=test_mm_content,
ignore_eos=ignore_eos, ignore_eos=ignore_eos,
extra_body=extra_body,
) )
test_output = await request_func(request_func_input=test_input) test_output = await request_func(request_func_input=test_input)
@ -313,7 +312,8 @@ async def benchmark(
output_len=test_output_len, output_len=test_output_len,
logprobs=logprobs, logprobs=logprobs,
multi_modal_content=test_mm_content, multi_modal_content=test_mm_content,
ignore_eos=ignore_eos) ignore_eos=ignore_eos,
extra_body=extra_body)
profile_output = await request_func(request_func_input=profile_input) profile_output = await request_func(request_func_input=profile_input)
if profile_output.success: if profile_output.success:
print("Profiler started") print("Profiler started")
@ -363,7 +363,8 @@ async def benchmark(
output_len=output_len, output_len=output_len,
logprobs=logprobs, logprobs=logprobs,
multi_modal_content=mm_content, multi_modal_content=mm_content,
ignore_eos=ignore_eos) ignore_eos=ignore_eos,
extra_body=extra_body)
tasks.append( tasks.append(
asyncio.create_task( asyncio.create_task(
limited_request_func(request_func_input=request_func_input, limited_request_func(request_func_input=request_func_input,
@ -599,6 +600,9 @@ def main(args: argparse.Namespace):
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS: elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_class = AIMODataset dataset_class = AIMODataset
args.hf_split = "train" args.hf_split = "train"
elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ASRDataset
args.hf_split = "train"
else: else:
supported_datasets = set([ supported_datasets = set([
dataset_name for cls in HuggingFaceDataset.__subclasses__() dataset_name for cls in HuggingFaceDataset.__subclasses__()
@ -610,6 +614,13 @@ def main(args: argparse.Namespace):
f" from one of following: {supported_datasets}. " f" from one of following: {supported_datasets}. "
"Please consider contributing if you would " "Please consider contributing if you would "
"like to add support for additional dataset formats.") "like to add support for additional dataset formats.")
if (dataset_class.IS_MULTIMODAL and backend not in \
["openai-chat", "openai-audio"]):
# multi-modal benchmark is only available on OpenAI Chat backend.
raise ValueError(
"Multi-modal content is only supported on 'openai-chat' and " \
"'openai-audio' backend.")
input_requests = dataset_class( input_requests = dataset_class(
dataset_path=args.dataset_path, dataset_path=args.dataset_path,
dataset_subset=args.hf_subset, dataset_subset=args.hf_subset,
@ -652,6 +663,26 @@ def main(args: argparse.Namespace):
raise ValueError(f"Unknown dataset: {args.dataset_name}") from err raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
goodput_config_dict = check_goodput_args(args) goodput_config_dict = check_goodput_args(args)
# Collect the sampling parameters.
sampling_params = {
k: v
for k, v in {
"top_p": args.top_p,
"top_k": args.top_k,
"min_p": args.min_p,
"temperature": args.temperature
}.items() if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
raise ValueError(
"Sampling parameters are only supported by openai-compatible "
"backends.")
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
# Avoid GC processing "static" data - reduce pause times. # Avoid GC processing "static" data - reduce pause times.
gc.collect() gc.collect()
gc.freeze() gc.freeze()
@ -678,6 +709,7 @@ def main(args: argparse.Namespace):
goodput_config_dict=goodput_config_dict, goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency, max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules, lora_modules=args.lora_modules,
extra_body=sampling_params,
)) ))
# Save config and results to json # Save config and results to json
@ -711,6 +743,8 @@ def main(args: argparse.Namespace):
]: ]:
if field in result_json: if field in result_json:
del result_json[field] del result_json[field]
if field in benchmark_result:
del benchmark_result[field]
# Traffic # Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate result_json["request_rate"] = (args.request_rate if args.request_rate
@ -730,7 +764,10 @@ def main(args: argparse.Namespace):
file_name = args.result_filename file_name = args.result_filename
if args.result_dir: if args.result_dir:
file_name = os.path.join(args.result_dir, file_name) file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w", encoding='utf-8') as outfile: with open(file_name, mode="a+", encoding='utf-8') as outfile:
# Append a newline.
if outfile.tell() != 0:
outfile.write("\n")
json.dump(result_json, outfile) json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name) save_to_pytorch_benchmark_format(args, result_json, file_name)
@ -895,7 +932,7 @@ if __name__ == "__main__":
"--percentile-metrics", "--percentile-metrics",
type=str, type=str,
default="ttft,tpot,itl", default="ttft,tpot,itl",
help="Comma-seperated list of selected metrics to report percentils. " help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. " "This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".") "Default value is \"ttft,tpot,itl\".")
@@ -903,7 +940,7 @@ if __name__ == "__main__":
"--metric-percentiles",
type=str,
default="99",
- help="Comma-seperated list of percentiles for selected metrics. "
+ help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
@@ -970,18 +1007,23 @@ if __name__ == "__main__":
random_group.add_argument(
"--random-range-ratio",
type=float,
- default=1.0,
- help="Range of sampled ratio of input/output length, "
- "used only for random sampling.",
+ default=0.0,
+ help="Range ratio for sampling input/output length, "
+ "used only for random sampling. Must be in the range [0, 1) to define "
+ "a symmetric sampling range"
+ "[length * (1 - range_ratio), length * (1 + range_ratio)].",
)
random_group.add_argument(
"--random-prefix-len",
type=int,
default=0,
- help="Number of fixed prefix tokens before random "
- " context. The length range of context in a random "
- " request is [random-prefix-len, "
- " random-prefix-len + random-prefix-len * random-range-ratio).")
+ help=("Number of fixed prefix tokens before the random context "
+ "in a request. "
+ "The total input length is the sum of `random-prefix-len` and "
+ "a random "
+ "context length sampled from [input_len * (1 - range_ratio), "
+ "input_len * (1 + range_ratio)]."),
+ )
hf_group = parser.add_argument_group("hf dataset options")
hf_group.add_argument("--hf-subset",
@@ -1000,6 +1042,33 @@ if __name__ == "__main__":
"from the sampled HF dataset.",
)
sampling_group = parser.add_argument_group("sampling parameters")
sampling_group.add_argument(
"--top-p",
type=float,
default=None,
help="Top-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
help="Top-k sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
help="Min-p sampling parameter. Only has effect on openai-compatible "
"backends.")
sampling_group.add_argument(
"--temperature",
type=float,
default=None,
help="Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
"decoding (i.e. temperature==0.0).")
parser.add_argument(
'--tokenizer-mode',
type=str,
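The sampling flags added above are only forwarded to OpenAI-compatible backends; the script folds them into one dict and sends it as the request's extra body. A minimal sketch of that behaviour, with illustrative flag values rather than the script's defaults:

# Minimal sketch of how the new sampling flags become the request's extra body
# (mirrors the diff above; the flag values here are illustrative).
args_top_p, args_top_k, args_min_p, args_temperature = 0.9, None, None, None

sampling_params = {
    k: v
    for k, v in {
        "top_p": args_top_p,
        "top_k": args_top_k,
        "min_p": args_min_p,
        "temperature": args_temperature,
    }.items() if v is not None
}
if "temperature" not in sampling_params:
    sampling_params["temperature"] = 0.0  # default to greedy decoding

print(sampling_params)  # {'top_p': 0.9, 'temperature': 0.0}

With no temperature given, the script falls back to greedy decoding (temperature 0.0), as the --temperature help text states.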

View File

@@ -11,7 +11,7 @@ On the client side, run:
--model <your_model> \
--dataset json \
--structured-output-ratio 1.0 \
- --structured-output-backend xgrammar \
+ --structured-output-backend auto \
--request-rate 10 \
--num-prompts 1000
@@ -51,7 +51,7 @@ try:
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
- from vllm.v1.structured_output.utils import (
+ from vllm.v1.structured_output.backend_xgrammar import (
has_xgrammar_unsupported_json_features)
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -130,10 +130,11 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
"description":
"An unique optional field to avoid cached schemas"
}
+ else:
+ json_schemas = [schema] * args.num_prompts
def gen_prompt(index: int):
- schema = json_schemas[index % len(json_schemas)]
- return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
+ return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501
def get_schema(index: int):
return json_schemas[index % len(json_schemas)]
@@ -963,7 +964,7 @@ if __name__ == "__main__":
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
- help="Comma-seperated list of selected metrics to report percentils. "
+ help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
@@ -971,7 +972,7 @@ if __name__ == "__main__":
"--metric-percentiles",
type=str,
default="99",
- help="Comma-seperated list of percentiles for selected metrics. "
+ help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
@@ -996,12 +997,14 @@ if __name__ == "__main__":
type=float,
default=1.0,
help="Ratio of Structured Outputs requests")
- parser.add_argument(
- "--structured-output-backend",
- type=str,
- choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
- default="xgrammar",
- help="Backend to use for structured outputs")
+ parser.add_argument("--structured-output-backend",
+ type=str,
+ choices=[
+ "outlines", "lm-format-enforcer", "xgrammar",
+ "guidance", "auto"
+ ],
+ default="auto",
+ help="Backend to use for structured outputs")
args = parser.parse_args()
main(args)

View File

@@ -213,14 +213,17 @@ def run_hf(
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
- prompt, prompt_len, output_len = requests[i]
+ prompt = requests[i].prompt
+ prompt_len = requests[i].prompt_len
+ output_len = requests[i].expected_output_len
# Add the prompt to the batch.
batch.append(prompt)
max_prompt_len = max(max_prompt_len, prompt_len)
max_output_len = max(max_output_len, output_len)
if len(batch) < max_batch_size and i != len(requests) - 1:
# Check if we can add more requests to the batch.
- _, next_prompt_len, next_output_len = requests[i + 1]
+ next_prompt_len = requests[i + 1].prompt_len
+ next_output_len = requests[i + 1].expected_output_len
if (max(max_prompt_len, next_prompt_len) +
max(max_output_len, next_output_len)) <= 2048:
# We can add more requests to the batch.
@@ -520,6 +523,13 @@ def validate_args(args):
raise ValueError(
"Tokenizer must be the same as the model for MII backend.")
+ # --data-parallel is not supported currently.
+ # https://github.com/vllm-project/vllm/issues/16222
+ if args.data_parallel_size > 1:
+ raise ValueError(
+ "Data parallel is not supported in offline benchmark, \
+ please use benchmark serving instead")
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
@@ -591,18 +601,30 @@ if __name__ == "__main__":
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
- parser.add_argument("--prefix-len",
- type=int,
- default=None,
- help="Number of prefix tokens per request."
- "This is for the RandomDataset and SonnetDataset")
+ parser.add_argument(
+ "--prefix-len",
+ type=int,
+ default=None,
+ help=f"Number of prefix tokens to be used in RandomDataset "
+ "and SonnetDataset. For RandomDataset, the total input "
+ "length is the sum of prefix-len (default: "
+ f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
+ "sampled from [input_len * (1 - range_ratio), "
+ "input_len * (1 + range_ratio)]. For SonnetDataset, "
+ f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
+ "controls how much of the input is fixed lines versus "
+ "random lines, but the total input length remains approximately "
+ "input_len tokens.")
# random dataset
parser.add_argument(
"--random-range-ratio",
type=float,
default=None,
- help="Range of sampled ratio of input/output length, "
- "used only for RandomDataSet.",
+ help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
+ "for sampling input/output length, "
+ "used only for RandomDataset. Must be in the range [0, 1) to "
+ "define a symmetric sampling range "
+ "[length * (1 - range_ratio), length * (1 + range_ratio)].",
)
# hf dtaset

View File

@@ -0,0 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
from argparse import ArgumentParser
import pandas as pd
parser = ArgumentParser()
parser.add_argument("--input-path", type=str, required=True)
parser.add_argument("--output-path", type=str, required=True)
if __name__ == "__main__":
args = parser.parse_args()
df = pd.read_json(args.input_path, lines=True)
df.to_csv(args.output_path)
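This converter assumes the input is JSON Lines, which is what benchmark_serving.py now produces by appending one JSON object per run. A self-contained sketch of the same round trip; file name and record keys are illustrative, not part of the script:

import pandas as pd

# Two records shaped like a subset of the benchmark's result_json output.
records = [
    {"request_rate": 1, "total_token_throughput": 1234.5},
    {"request_rate": 10, "total_token_throughput": 9876.5},
]
pd.DataFrame(records).to_json("results.json", orient="records", lines=True)

# Same calls the script above makes.
df = pd.read_json("results.json", lines=True)
df.to_csv("results.csv")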

View File

@@ -0,0 +1,236 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
MINIMUM_BITBLAS_VERSION)
try:
import bitblas
if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
raise ImportError("bitblas version is wrong. Please "
f"install bitblas>={MINIMUM_BITBLAS_VERSION}")
except ImportError as e:
bitblas_import_exception = e
raise ValueError("Trying to use the bitblas backend, but could not import"
f"with the following error: {bitblas_import_exception}. "
"Please install bitblas through the following command: "
f"`pip install bitblas>={MINIMUM_BITBLAS_VERSION}`"
) from bitblas_import_exception
from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target
from vllm.utils import FlexibleArgumentParser
parser = FlexibleArgumentParser(
description="Benchmark BitBLAS int4 on a specific target.")
# Add arguments to the parser
parser.add_argument(
"--target",
type=str,
default=auto_detect_nvidia_target(),
help="Specify the target device for benchmarking.",
)
parser.add_argument("--group_size",
type=int,
default=None,
help="Group size for grouped quantization.")
parser.add_argument(
"--A_dtype",
type=str,
default="float16",
choices=["float16", "float32", "float64", "int32", "int8"],
help="Data type of activation A.",
)
parser.add_argument(
"--W_dtype",
type=str,
default="int4",
choices=[
"float16",
"float32",
"float64",
"int32",
"int8",
"int4",
"int2",
"int1",
"nf4",
"fp4_e2m1",
],
help="Data type of weight W.",
)
parser.add_argument(
"--accum_dtype",
type=str,
default="float16",
choices=["float16", "int32"],
help="Data type for accumulation.",
)
parser.add_argument(
"--out_dtype",
type=str,
default="float16",
choices=["float16", "float32", "int32", "int8"],
help="Data type for output.",
)
parser.add_argument(
"--layout",
type=str,
default="nt",
choices=["nt", "nn"],
help="Matrix layout, 'nt' for non-transpose A and transpose W.",
)
parser.add_argument("--with_bias",
action="store_true",
help="Include bias in the benchmark.")
parser.add_argument(
"--with_scaling",
action="store_true",
help="Include scaling factor in the quantization.",
)
parser.add_argument("--with_zeros",
action="store_true",
help="Include zeros in the quantization.")
parser.add_argument(
"--zeros_mode",
type=str,
default=None,
choices=["original", "rescale", "quantized"],
help="Specify the mode for calculating zeros.",
)
# Parse the arguments
args = parser.parse_args()
# Assign arguments to variables
target = args.target
A_dtype = args.A_dtype
W_dtype = args.W_dtype
accum_dtype = args.accum_dtype
out_dtype = args.out_dtype
layout = args.layout
with_bias = args.with_bias
group_size = args.group_size
with_scaling = args.with_scaling
with_zeros = args.with_zeros
zeros_mode = args.zeros_mode
# Define a list of shared arguments that repeat in every config
shared_args = [
A_dtype,
W_dtype,
out_dtype,
accum_dtype,
layout,
with_bias,
group_size,
with_scaling,
with_zeros,
zeros_mode,
]
# Define just the (M, K, N) shapes in a more compact list
shapes = [
# square test
(1, 16384, 16384),
# BLOOM-176B
(1, 43008, 14336),
(1, 14336, 14336),
(1, 57344, 14336),
(1, 14336, 57344),
# OPT-65B
(1, 9216, 9216),
(1, 36864, 9216),
(1, 9216, 36864),
(1, 22016, 8192),
# LLAMA-70B/65B
(1, 8192, 22016),
(1, 8192, 8192),
(1, 28672, 8192),
(1, 8192, 28672),
# square test
(16384, 16384, 16384),
# BLOOM-176B
(8192, 43008, 14336),
(8192, 14336, 14336),
(8192, 57344, 14336),
(8192, 14336, 57344),
# OPT-65B
(8192, 9216, 9216),
(8192, 36864, 9216),
(8192, 9216, 36864),
(8192, 22016, 8192),
# LLAMA-70B/65B
(8192, 8192, 22016),
(8192, 8192, 8192),
(8192, 28672, 8192),
(8192, 8192, 28672),
]
# Build test shapes with all the shared arguments
test_shapes = [(MatmulConfig, Matmul, (*shape, *shared_args))
for shape in shapes]
benchmark_sets = []
benchmark_sets.extend(test_shapes)
benchmark_results = {}
for config_class, operator, input_args in benchmark_sets:
config = config_class(*input_args)
matmul = operator(config, target=target, enable_tuning=True)
kernel_latency = matmul.profile_latency()
print("Time cost is: {:.3f} ms".format(kernel_latency))
profile_config = {
f"{operator.__name__}-{'-'.join([str(i) for i in input_args])}": {
"BitBLAS_top20_latency": kernel_latency,
}
}
benchmark_results.update(profile_config)
# Define headers for the table
headers = [
"PrimFunc",
"Input Arguments",
"BitBLAS Top20 Latency",
]
# Calculate column widths for pretty printing
col_widths = [0, 0, 0]
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
col_widths[0] = max(col_widths[0], len(func_name) + 2, len(headers[0]) + 2)
col_widths[1] = max(col_widths[1],
len(input_args_str) + 2,
len(headers[1]) + 2)
col_widths[2] = max(col_widths[2],
len(f"{values['BitBLAS_top20_latency']:.3f} ms") + 2,
len(headers[2]) + 2)
# break only if you want to measure widths from a single example;
# otherwise, let it loop over all items.
# Print header
for i, header in enumerate(headers):
headers[i] = header.ljust(col_widths[i])
print("".join(headers))
print("-" * sum(col_widths))
# Print rows
for config_key, values in benchmark_results.items():
args_split = config_key.split("-")
func_name = args_split[0]
input_args_str = "-".join(args_split[1:])
row = [
func_name,
input_args_str,
f"{values['BitBLAS_top20_latency']:.3f} ms",
]
row_str = "".join(
[str(cell).ljust(col_widths[idx]) for idx, cell in enumerate(row)])
print(row_str)

View File

@@ -553,6 +553,9 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
+ if not hasattr(config, "hidden_size"):
+ # Support for llama4
+ config = config.text_config
# Default: Mixtral.
E = config.num_local_experts
topk = config.num_experts_per_tok

benchmarks/sweep.sh Normal file
View File

@@ -0,0 +1,27 @@
MODEL=meta-llama/Llama-3.1-8B-Instruct
REQUEST_RATES=(1 10 15 20)
INPUT_LEN=1000
OUTPUT_LEN=100
TOTAL_SECONDS=120
for REQUEST_RATE in "${REQUEST_RATES[@]}";
do
NUM_PROMPTS=$(($TOTAL_SECONDS * $REQUEST_RATE))
echo ""
echo "===== RUNNING $MODEL FOR $NUM_PROMPTS PROMPTS WITH $REQUEST_RATE QPS ====="
echo ""
python3 vllm/benchmarks/benchmark_serving.py \
--model $MODEL \
--dataset-name random \
--random-input-len $INPUT_LEN \
--random-output-len $OUTPUT_LEN \
--request-rate $REQUEST_RATE \
--num-prompts $NUM_PROMPTS \
--seed $REQUEST_RATE \
--ignore-eos \
--result-filename "results.json" \
--save-result
done
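Since benchmark_serving.py now opens the result file in append mode, each iteration of this sweep adds one JSON line to results.json instead of overwriting it. A small sketch for inspecting the accumulated file afterwards; the column names are illustrative and depend on which metrics were saved:

import pandas as pd

df = pd.read_json("results.json", lines=True)  # one row per request rate
print(df[["request_rate", "num_prompts"]])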

View File

@@ -33,8 +33,6 @@ endif()
if(MACOSX_FOUND)
list(APPEND CXX_COMPILE_FLAGS
- "-Xpreprocessor"
- "-fopenmp"
"-DVLLM_CPU_EXTENSION")
else()
list(APPEND CXX_COMPILE_FLAGS

View File

@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG dc9d410b3e2d6534a4c70724c2515f4def670a22
+ GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@@ -0,0 +1,178 @@
#include <optional>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <algorithm>
#include "attention_dtypes.h"
#include "attention_utils.cuh"
namespace vllm {
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case)
template <typename scalar_t, const uint NUM_THREADS>
__global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
const uint head_size) {
using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
const uint global_idx = blockIdx.x * NUM_THREADS + threadIdx.x;
const uint token_head_threads = num_tokens * num_heads * threads_per_head;
if (global_idx >= token_head_threads) return;
// global_idx -> token_idx + head_idx + pack_idx
const uint token_head_idx = global_idx / threads_per_head;
const uint pack_idx = global_idx % threads_per_head;
const uint token_idx = token_head_idx / num_heads;
const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
const uint head_offset =
token_idx * num_heads * head_size + head_idx * head_size;
const scalar_t* prefix_head_ptr = prefix_output + head_offset;
const scalar_t* suffix_head_ptr = suffix_output + head_offset;
scalar_t* output_head_ptr = output + head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
p_lse = std::isinf(p_lse) ? -std::numeric_limits<float>::infinity() : p_lse;
s_lse = std::isinf(s_lse) ? -std::numeric_limits<float>::infinity() : s_lse;
const float max_lse = fmaxf(p_lse, s_lse);
p_lse = p_lse - max_lse;
s_lse = s_lse - max_lse;
const float p_se = expf(p_lse);
const float s_se = expf(s_lse);
const float out_se = p_se + s_se;
const float p_scale = p_se / out_se;
const float s_scale = s_se / out_se;
if (pack_offset < head_size) {
// Pack 128b load
pack_128b_t p_out_pack = reinterpret_cast<const pack_128b_t*>(
prefix_head_ptr)[pack_offset / pack_size];
pack_128b_t s_out_pack = reinterpret_cast<const pack_128b_t*>(
suffix_head_ptr)[pack_offset / pack_size];
pack_128b_t o_out_pack;
#pragma unroll
for (uint i = 0; i < pack_size; ++i) {
// Always use float for FMA to keep high precision.
// half(uint16_t), bfloat16, float -> float.
const float p_out_f =
vllm::to_float(reinterpret_cast<const scalar_t*>(&p_out_pack)[i]);
const float s_out_f =
vllm::to_float(reinterpret_cast<const scalar_t*>(&s_out_pack)[i]);
// fma: a * b + c = p_out_f * p_scale + (s_out_f * s_scale)
const float o_out_f = p_out_f * p_scale + (s_out_f * s_scale);
// float -> half(uint16_t), bfloat16, float.
vllm::from_float(reinterpret_cast<scalar_t*>(&o_out_pack)[i], o_out_f);
}
// Pack 128b storage
reinterpret_cast<pack_128b_t*>(output_head_ptr)[pack_offset / pack_size] =
o_out_pack;
}
// We only need to write to output_lse once per head.
if (output_lse != nullptr && pack_idx == 0) {
float out_lse = logf(out_se) + max_lse;
output_lse[head_idx * num_tokens + token_idx] = out_lse;
}
}
} // namespace vllm
// The following macro is used to dispatch the conversion function based on
// the output data type. The FN is a macro that calls a function with
// template<typename scalar_t>.
#define DISPATCH_BY_SCALAR_DTYPE(scalar_dtype, fn) \
{ \
if (scalar_dtype == at::ScalarType::Float) { \
fn(float); \
} else if (scalar_dtype == at::ScalarType::Half) { \
fn(uint16_t); \
} else if (scalar_dtype == at::ScalarType::BFloat16) { \
fn(__nv_bfloat16); \
} else { \
TORCH_CHECK(false, "Unsupported data type of O: ", scalar_dtype); \
} \
}
#define LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS) \
{ \
vllm::merge_attn_states_kernel<scalar_t, NUM_THREADS> \
<<<grid, block, 0, stream>>>( \
reinterpret_cast<scalar_t*>(output.data_ptr()), output_lse_ptr, \
reinterpret_cast<scalar_t*>(prefix_output.data_ptr()), \
reinterpret_cast<float*>(prefix_lse.data_ptr()), \
reinterpret_cast<scalar_t*>(suffix_output.data_ptr()), \
reinterpret_cast<float*>(suffix_lse.data_ptr()), num_tokens, \
num_heads, head_size); \
}
/*@brief Merges the attention states from prefix and suffix
* into the output tensor. NUM_TOKENS: n, NUM_HEADS: h, HEAD_SIZE: d
*
* @param output [n,h,d] The output tensor to store the merged attention states.
* @param output_lse [h,d] Optional tensor to store the log-sum-exp values.
* @param prefix_output [n,h,d] The prefix attention states.
* @param prefix_lse [h,n] The log-sum-exp values for the prefix attention
* states.
* @param suffix_output [n,h,d] The suffix attention states.
* @param suffix_lse [h,n] The log-sum-exp values for the suffix attention
* states.
*/
template <typename scalar_t>
void merge_attn_states_launcher(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse) {
constexpr uint NUM_THREADS = 128;
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>();
}
// Process one pack elements per thread. for float, the
// pack_size is 4 for half/bf16, the pack_size is 8.
const uint threads_per_head = head_size / pack_size;
const uint total_threads = num_tokens * num_heads * threads_per_head;
dim3 block(NUM_THREADS);
dim3 grid((total_threads + NUM_THREADS - 1) / NUM_THREADS);
const c10::cuda::OptionalCUDAGuard device_guard(prefix_output.device());
auto stream = at::cuda::getCurrentCUDAStream();
LAUNCH_MERGE_ATTN_STATES(scalar_t, NUM_THREADS);
}
#define CALL_MERGE_ATTN_STATES_LAUNCHER(scalar_t) \
{ \
merge_attn_states_launcher<scalar_t>(output, output_lse, prefix_output, \
prefix_lse, suffix_output, \
suffix_lse); \
}
void merge_attn_states(torch::Tensor& output,
std::optional<torch::Tensor> output_lse,
const torch::Tensor& prefix_output,
const torch::Tensor& prefix_lse,
const torch::Tensor& suffix_output,
const torch::Tensor& suffix_lse) {
DISPATCH_BY_SCALAR_DTYPE(output.dtype(), CALL_MERGE_ATTN_STATES_LAUNCHER);
}
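For readers who want to sanity-check the math, a NumPy reference sketch of the per-(token, head) log-sum-exp merge the kernel performs; this is an illustrative model of the computation, not the production code path:

import numpy as np

def merge_attn_states(prefix_out, prefix_lse, suffix_out, suffix_lse):
    # prefix_out / suffix_out: [num_tokens, num_heads, head_size]
    # prefix_lse / suffix_lse: [num_heads, num_tokens], as in the kernel
    p_lse = np.where(np.isinf(prefix_lse), -np.inf, prefix_lse)
    s_lse = np.where(np.isinf(suffix_lse), -np.inf, suffix_lse)
    max_lse = np.maximum(p_lse, s_lse)
    p_se = np.exp(p_lse - max_lse)
    s_se = np.exp(s_lse - max_lse)
    out_se = p_se + s_se
    p_scale = (p_se / out_se).T[..., None]  # -> [tokens, heads, 1]
    s_scale = (s_se / out_se).T[..., None]
    out = prefix_out * p_scale + suffix_out * s_scale
    out_lse = np.log(out_se) + max_lse      # [num_heads, num_tokens]
    return out, out_lse

T, H, D = 4, 2, 8
rng = np.random.default_rng(0)
out, lse = merge_attn_states(rng.normal(size=(T, H, D)), rng.normal(size=(H, T)),
                             rng.normal(size=(T, H, D)), rng.normal(size=(H, T)))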

View File

@@ -4,6 +4,11 @@
#include <string>
#include <sched.h>
#endif
+ #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
+ #include <unistd.h>
+ #include <sys/syscall.h>
+ #define gettid() syscall(SYS_gettid)
+ #endif
#include "cpu_types.hpp"

View File

@@ -375,7 +375,7 @@ class CustomAllreduce {
bool fully_connected_;
RankSignals sg_;
- // Stores an map from a pointer to its peer pointers from all ranks.
+ // Stores a map from a pointer to its peer pointers from all ranks.
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;

View File

@@ -422,7 +422,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
// in case the final state is separated between the last "smem_exchange" and
// and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
- // (which occurs when `final_state_position` is a non-positivie index)
+ // (which occurs when `final_state_position` is a non-positive index)
// we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
input_t vals_load[kNElts] = {0};

View File

@@ -0,0 +1,103 @@
# SPDX-License-Identifier: Apache-2.0
import glob
import itertools
import os
import subprocess
import jinja2
FILE_HEAD = """
// auto generated by generate.py
// clang-format off
#include "kernel.h"
#include "marlin_template.h"
namespace MARLIN_NAMESPACE_NAME {
""".strip()
TEMPLATE = ("template __global__ void Marlin<"
"{{scalar_t}}, "
"{{w_type_id}}, "
"{{threads}}, "
"{{thread_m_blocks}}, "
"{{thread_n_blocks}}, "
"{{thread_k_blocks}}, "
"{{'true' if m_block_size_8 else 'false'}}, "
"{{stages}}, "
"{{'true' if has_act_order else 'false'}}, "
"{{'true' if has_zp else 'false'}}, "
"{{group_blocks}}, "
"{{'true' if is_zp_float else 'false'}}>"
"( MARLIN_KERNEL_PARAMS );")
# int8 with zero point case (vllm::kU8) is also supported,
# we don't add it to reduce wheel size.
SCALAR_TYPES = ["vllm::kU4", "vllm::kU4B8", "vllm::kU8B128"]
THREAD_CONFIGS = [(128, 128, 256), (64, 256, 256), (64, 128, 128)]
THREAD_M_BLOCKS = [0.5, 1, 2, 3, 4]
# group_blocks:
# = 0 : act order case
# = -1 : channelwise quantization
# > 0 : group_size=16*group_blocks
GROUP_BLOCKS = [0, -1, 2, 4, 8]
DTYPES = ["fp16", "bf16"]
def remove_old_kernels():
for filename in glob.glob(os.path.dirname(__file__) + "/kernel_*.cu"):
subprocess.call(["rm", "-f", filename])
def generate_new_kernels():
for scalar_type, dtype in itertools.product(SCALAR_TYPES, DTYPES):
has_zp = "B" not in scalar_type
all_template_str_list = []
for group_blocks, m_blocks, thread_configs in itertools.product(
GROUP_BLOCKS, THREAD_M_BLOCKS, THREAD_CONFIGS):
has_act_order = group_blocks == 0
if has_zp and has_act_order:
continue
if thread_configs[2] == 256:
if m_blocks <= 1 and thread_configs[0] != 128:
continue
if m_blocks > 1 and thread_configs[0] != 64:
continue
k_blocks = thread_configs[0] // 16
n_blocks = thread_configs[1] // 16
threads = thread_configs[2]
c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
template_str = jinja2.Template(TEMPLATE).render(
scalar_t=c_dtype,
w_type_id=scalar_type + ".id()",
threads=threads,
thread_m_blocks=max(m_blocks, 1),
thread_n_blocks=n_blocks,
thread_k_blocks=k_blocks,
m_block_size_8=m_blocks == 0.5,
stages="pipe_stages",
has_act_order=has_act_order,
has_zp=has_zp,
group_blocks=group_blocks,
is_zp_float=False,
)
all_template_str_list.append(template_str)
file_content = FILE_HEAD + "\n\n"
file_content += "\n\n".join(all_template_str_list) + "\n\n}\n"
filename = f"kernel_{dtype}_{scalar_type[6:].lower()}.cu"
with open(os.path.join(os.path.dirname(__file__), filename), "w") as f:
f.write(file_content)
if __name__ == "__main__":
remove_old_kernels()
generate_new_kernels()
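To make the generator's output concrete, here is a hypothetical rendering of a single instantiation; TEMPLATE is copied from the script above, and the keyword values mirror one fp16 / kU4B8 configuration from its loops:

import jinja2

TEMPLATE = ("template __global__ void Marlin<"
            "{{scalar_t}}, "
            "{{w_type_id}}, "
            "{{threads}}, "
            "{{thread_m_blocks}}, "
            "{{thread_n_blocks}}, "
            "{{thread_k_blocks}}, "
            "{{'true' if m_block_size_8 else 'false'}}, "
            "{{stages}}, "
            "{{'true' if has_act_order else 'false'}}, "
            "{{'true' if has_zp else 'false'}}, "
            "{{group_blocks}}, "
            "{{'true' if is_zp_float else 'false'}}>"
            "( MARLIN_KERNEL_PARAMS );")

print(jinja2.Template(TEMPLATE).render(
    scalar_t="half",               # dtype == "fp16"
    w_type_id="vllm::kU4B8.id()",
    threads=256,                   # from THREAD_CONFIGS (128, 128, 256)
    thread_m_blocks=1,
    thread_n_blocks=8,             # 128 // 16
    thread_k_blocks=8,             # 128 // 16
    m_block_size_8=False,          # m_blocks != 0.5
    stages="pipe_stages",
    has_act_order=True,            # group_blocks == 0
    has_zp=False,                  # "B" in "kU4B8"
    group_blocks=0,
    is_zp_float=False,
))
# -> template __global__ void Marlin<half, vllm::kU4B8.id(), 256, 1, 8, 8,
#    false, pipe_stages, true, false, 0, false>( MARLIN_KERNEL_PARAMS );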

View File

@@ -0,0 +1,44 @@
#ifndef MARLIN_NAMESPACE_NAME
#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
#endif
#include "quantization/gptq_marlin/marlin.cuh"
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
#include "core/scalar_type.hpp"
#define MARLIN_KERNEL_PARAMS \
const int4 *__restrict__ A, const int4 *__restrict__ B, \
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
const int4 *__restrict__ scales_ptr, const int4 *__restrict__ zp_ptr, \
const int *__restrict__ g_idx, \
const int32_t *__restrict__ sorted_token_ids_ptr, \
const int32_t *__restrict__ expert_ids_ptr, \
const int32_t *__restrict__ num_tokens_past_padded_ptr, \
const float *__restrict__ topk_weights_ptr, int top_k, \
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
int prob_n, int prob_k, int *locks, bool use_atomic_add, \
bool use_fp32_reduce
namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t, // compute dtype, half or nv_float16
const vllm::ScalarTypeId w_type_id, // weight ScalarType id
const int threads, // number of threads in a threadblock
const int thread_m_blocks, // number of 16x16 blocks in the m
// dimension (batchsize) of the
// threadblock
const int thread_n_blocks, // same for n dimension (output)
const int thread_k_blocks, // same for k dimension (reduction)
const bool m_block_size_8, // whether m_block_size == 8
// only works when thread_m_blocks == 1
const int stages, // number of stages for the async global->shared
// fetch pipeline
const bool has_act_order, // whether act_order is enabled
const bool has_zp, // whether zero-points are enabled
const int group_blocks, // number of consecutive 16x16 blocks
// with a separate quantization scale
const bool is_zp_float // is zero point of float16 type?
>
__global__ void Marlin(MARLIN_KERNEL_PARAMS);
}

File diff suppressed because it is too large.

View File

@@ -0,0 +1,927 @@
/*
* Modified by Neural Magic
* Copyright (C) Marlin.2024 Elias Frantar
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* Adapted from https://github.com/IST-DASLab/marlin
*/
#ifndef MARLIN_NAMESPACE_NAME
#define MARLIN_NAMESPACE_NAME marlin_moe_wna16
#endif
#include "kernel.h"
#include "core/registration.h"
#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \
static_assert(std::is_same<scalar_t, half>::value || \
std::is_same<scalar_t, nv_bfloat16>::value, \
"only float16 and bfloat16 is supported");
namespace MARLIN_NAMESPACE_NAME {
__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){};
using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
template <int moe_block_size>
__global__ void permute_cols_kernel(
int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr,
int4* __restrict__ out_int4_ptr,
const int32_t* __restrict__ sorted_token_ids_ptr,
const int32_t* __restrict__ expert_ids_ptr,
const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m,
int size_k, int top_k) {};
} // namespace marlin
torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) {
TORCH_CHECK_NOT_IMPLEMENTED(false,
"marlin_gemm(..) requires CUDA_ARCH >= 8.0");
return torch::empty({1, 1});
}
#else
// For a given "a" of size [M,K] performs a permutation of the K columns based
// on the given "perm" indices.
template <int moe_block_size>
__global__ void permute_cols_kernel(
int4 const* __restrict__ a_int4_ptr, int const* __restrict__ perm_int_ptr,
int4* __restrict__ out_int4_ptr,
const int32_t* __restrict__ sorted_token_ids_ptr,
const int32_t* __restrict__ expert_ids_ptr,
const int32_t* __restrict__ num_tokens_past_padded_ptr, int size_m,
int size_k, int top_k) {
int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
int num_moe_blocks = div_ceil(num_tokens_past_padded, moe_block_size);
int32_t block_sorted_ids[moe_block_size];
int block_num_valid_tokens = 0;
int64_t old_expert_id = 0;
int64_t expert_id = 0;
int row_stride = size_k * sizeof(half) / 16;
auto read_moe_block_data = [&](int block_id) {
block_num_valid_tokens = moe_block_size;
int4* tmp_block_sorted_ids = reinterpret_cast<int4*>(block_sorted_ids);
for (int i = 0; i < moe_block_size / 4; i++) {
tmp_block_sorted_ids[i] =
((int4*)sorted_token_ids_ptr)[block_id * moe_block_size / 4 + i];
}
for (int i = 0; i < moe_block_size; i++) {
if (block_sorted_ids[i] >= size_m * top_k) {
block_num_valid_tokens = i;
break;
};
}
};
auto permute_row = [&](int row) {
int iters = size_k / default_threads;
int rest = size_k % default_threads;
int in_offset = (row / top_k) * row_stride;
int out_offset = row * row_stride;
half const* a_row_half =
reinterpret_cast<half const*>(a_int4_ptr + in_offset);
half* out_half = reinterpret_cast<half*>(out_int4_ptr + out_offset);
int base_k = 0;
for (int i = 0; i < iters; i++) {
int cur_k = base_k + threadIdx.x;
int src_pos = perm_int_ptr[cur_k];
out_half[cur_k] = a_row_half[src_pos];
base_k += default_threads;
}
if (rest) {
if (threadIdx.x < rest) {
int cur_k = base_k + threadIdx.x;
int src_pos = perm_int_ptr[cur_k];
out_half[cur_k] = a_row_half[src_pos];
}
}
};
for (int index = blockIdx.x; index < num_moe_blocks; index += gridDim.x) {
old_expert_id = expert_id;
int tmp_expert_id = expert_ids_ptr[index];
if (tmp_expert_id == -1) continue;
expert_id = tmp_expert_id;
perm_int_ptr += (expert_id - old_expert_id) * size_k;
read_moe_block_data(index);
for (int i = 0; i < block_num_valid_tokens; i++)
permute_row(block_sorted_ids[i]);
}
}
typedef struct {
int thread_k;
int thread_n;
int num_threads;
} thread_config_t;
thread_config_t small_batch_thread_configs[] = {
// Ordered by priority
// thread_k, thread_n, num_threads
{128, 128, 256},
{64, 128, 128}};
thread_config_t large_batch_thread_configs[] = {
// Ordered by priority
// thread_k, thread_n, num_threads
{64, 256, 256},
{64, 128, 128}};
typedef struct {
int blocks_per_sm;
thread_config_t tb_cfg;
} exec_config_t;
int get_scales_cache_size(thread_config_t const& th_config, int prob_m,
int prob_n, int prob_k, int num_bits, int group_size,
bool has_act_order, bool is_k_full) {
bool cache_scales_chunk = has_act_order && !is_k_full;
int tb_n = th_config.thread_n;
int tb_k = th_config.thread_k;
// Get max scale groups per thread-block
int tb_groups;
if (group_size == -1) {
tb_groups = 1;
} else if (group_size == 0) {
tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size
} else {
tb_groups = div_ceil(tb_k, group_size);
}
if (cache_scales_chunk) {
int load_groups =
tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K
load_groups = max(load_groups, 32); // We load at least 32 scale groups
return load_groups * tb_n * 2;
} else {
int tb_scales = tb_groups * tb_n * 2;
return tb_scales * pipe_stages;
}
}
int get_kernel_cache_size(thread_config_t const& th_config, int thread_m_blocks,
int prob_m, int prob_n, int prob_k, int num_bits,
int group_size, bool has_act_order, bool is_k_full,
int has_zp, int is_zp_float) {
int pack_factor = 32 / num_bits;
// Get B size
int tb_k = th_config.thread_k;
int tb_n = th_config.thread_n;
int tb_m = thread_m_blocks * 16;
// shm size for block_sorted_ids/block_topk_weights
// both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
int sh_block_meta_size = tb_m * 4 * 2;
int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
int sh_s_size =
get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
group_size, has_act_order, is_k_full);
int sh_g_idx_size = has_act_order && !is_k_full ? pipe_stages * tb_k / 4 : 0;
int sh_zp_size = 0;
if (has_zp) {
if (is_zp_float)
sh_zp_size = sh_s_size;
else if (num_bits == 4)
sh_zp_size = sh_s_size / 4;
else if (num_bits == 8)
sh_zp_size = sh_s_size / 2;
}
int total_size = sh_a_size + sh_b_size + sh_s_size + sh_zp_size +
sh_g_idx_size + sh_block_meta_size;
return total_size;
}
bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
int prob_m, int prob_n, int prob_k, int num_bits,
int group_size, bool has_act_order, bool is_k_full,
int has_zp, int is_zp_float, int max_shared_mem) {
// Sanity
if (th_config.thread_k == -1 || th_config.thread_n == -1 ||
th_config.num_threads == -1) {
return false;
}
// Verify K/N are divisible by thread K/N
if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) {
return false;
}
// Verify min for thread K/N
if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) {
return false;
}
// num_threads must be at least 128 (= 4 warps)
if (th_config.num_threads < 128) {
return false;
}
// Check that pipeline fits into cache
int cache_size = get_kernel_cache_size(
th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits, group_size,
has_act_order, is_k_full, has_zp, is_zp_float);
return cache_size <= max_shared_mem;
}
#define __GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
M_BLOCK_SIZE_8, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, \
NUM_THREADS, IS_ZP_FLOAT) \
else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \
thread_n_blocks == THREAD_N_BLOCKS && \
thread_k_blocks == THREAD_K_BLOCKS && \
m_block_size_8 == M_BLOCK_SIZE_8 && \
has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \
is_zp_float == IS_ZP_FLOAT) { \
kernel = Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, \
THREAD_N_BLOCKS, THREAD_K_BLOCKS, M_BLOCK_SIZE_8, \
pipe_stages, HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, \
IS_ZP_FLOAT>; \
}
#define GPTQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, true, false, 0, NUM_THREADS, \
false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, false, 0, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, false, 8, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, false, 8, \
NUM_THREADS, false)
#define GPTQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, false, 0, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, false, 0, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, false, 0, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, false, 8, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, false, 8, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, false, 8, \
NUM_THREADS, false)
#define AWQ_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 2, NUM_THREADS, \
false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, \
false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 8, NUM_THREADS, \
false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 8, \
NUM_THREADS, false)
#define AWQ_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 8, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 8, \
NUM_THREADS, false) \
\
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, -1, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 2, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, false) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 8, \
NUM_THREADS, false)
// We currently have 4-bit models only with group_blocks == 4
#define HQQ_GET_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, true, 4, NUM_THREADS, \
true) \
__GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, true) \
__GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, true) \
__GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, true) \
__GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, true, 4, \
NUM_THREADS, true)
template <typename scalar_t>
MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
int thread_m_blocks, int thread_n_blocks,
int thread_k_blocks, bool m_block_size_8,
bool has_act_order, bool has_zp,
int group_blocks, int num_threads,
bool is_zp_float) {
int num_bits = q_type.size_bits();
auto kernel = MarlinDefault;
if (false) {
}
GPTQ_GET_IF_M1(vllm::kU4B8, 8, 8, 256)
GPTQ_GET_IF_M1(vllm::kU4B8, 8, 4, 128)
GPTQ_GET_IF_M234(vllm::kU4B8, 16, 4, 256)
GPTQ_GET_IF_M234(vllm::kU4B8, 8, 4, 128)
GPTQ_GET_IF_M1(vllm::kU8B128, 8, 8, 256)
GPTQ_GET_IF_M1(vllm::kU8B128, 8, 4, 128)
GPTQ_GET_IF_M234(vllm::kU8B128, 16, 4, 256)
GPTQ_GET_IF_M234(vllm::kU8B128, 8, 4, 128)
AWQ_GET_IF_M1(vllm::kU4, 8, 8, 256)
AWQ_GET_IF_M1(vllm::kU4, 8, 4, 128)
AWQ_GET_IF_M234(vllm::kU4, 16, 4, 256)
AWQ_GET_IF_M234(vllm::kU4, 8, 4, 128)
return kernel;
}
template <typename scalar_t>
exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
int prob_n, int prob_k, int thread_m_blocks,
bool m_block_size_8, int num_bits,
int group_size, bool has_act_order,
bool is_k_full, bool has_zp,
bool is_zp_float, int max_shared_mem) {
exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}};
thread_config_t* thread_configs = thread_m_blocks > 1
? large_batch_thread_configs
: small_batch_thread_configs;
int thread_configs_size =
thread_m_blocks > 1
? sizeof(large_batch_thread_configs) / sizeof(thread_config_t)
: sizeof(small_batch_thread_configs) / sizeof(thread_config_t);
int count = 0;
constexpr int device_max_reg_size = 255 * 1024;
for (int i = 0; i < thread_configs_size; i++) {
thread_config_t th_config = thread_configs[i];
if (!is_valid_config(th_config, thread_m_blocks, prob_m, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp,
is_zp_float, max_shared_mem)) {
continue;
}
int cache_size = get_kernel_cache_size(
th_config, thread_m_blocks, prob_m, prob_n, prob_k, num_bits,
group_size, has_act_order, is_k_full, has_zp, is_zp_float);
int group_blocks = 0;
if (!has_act_order) {
group_blocks = group_size == -1 ? -1 : group_size / 16;
}
auto kernel = get_marlin_kernel<scalar_t>(
q_type, thread_m_blocks, th_config.thread_n / 16,
th_config.thread_k / 16, m_block_size_8, has_act_order, has_zp,
group_blocks, th_config.num_threads, is_zp_float);
if (kernel == MarlinDefault) continue;
if (thread_m_blocks > 1) {
exec_cfg = {1, th_config};
break;
} else {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, kernel);
int reg_size = max(attr.numRegs, 1) * th_config.num_threads * 4;
int allow_count = min(device_max_reg_size / reg_size,
max_shared_mem / (cache_size + 1024));
allow_count = max(min(allow_count, 4), 1);
if (allow_count > count) {
count = allow_count;
exec_cfg = {count, th_config};
};
}
}
return exec_cfg;
}
template <typename scalar_t>
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
void* zp, void* g_idx, void* perm, void* a_tmp,
void* sorted_token_ids, void* expert_ids,
void* num_tokens_past_padded, void* topk_weights,
int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep,
int prob_m, int prob_n, int prob_k, void* workspace,
vllm::ScalarType const& q_type, bool has_act_order,
bool is_k_full, bool has_zp, int num_groups, int group_size,
int dev, cudaStream_t stream, int thread_k, int thread_n,
int sms, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) {
int thread_m_blocks = div_ceil(moe_block_size, 16);
bool m_block_size_8 = moe_block_size == 8;
if (has_zp) {
TORCH_CHECK(
q_type == vllm::kU4 || q_type == vllm::kU8,
"q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str());
} else {
TORCH_CHECK(
q_type == vllm::kU4B8 || q_type == vllm::kU8B128,
"q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ",
q_type.str());
}
TORCH_CHECK(prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m,
", ", prob_n, ", ", prob_k, "]");
int group_blocks = 0;
if (has_act_order) {
if (is_k_full) {
TORCH_CHECK(group_size != -1);
group_blocks = group_size / 16;
TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
" is not divisible by group_blocks = ", group_blocks);
} else {
TORCH_CHECK(group_size == 0);
group_blocks = 0;
}
} else {
if (group_size == -1) {
group_blocks = -1;
} else {
group_blocks = group_size / 16;
TORCH_CHECK(prob_k % group_blocks == 0, "prob_k = ", prob_k,
" is not divisible by group_blocks = ", group_blocks);
}
}
int num_bits = q_type.size_bits();
const int4* A_ptr = (const int4*)A;
const int4* B_ptr = (const int4*)B;
int4* C_ptr = (int4*)C;
int4* C_tmp_ptr = (int4*)C_tmp;
const int4* s_ptr = (const int4*)s;
const int4* zp_ptr = (const int4*)zp;
const int* g_idx_ptr = (const int*)g_idx;
const int* perm_ptr = (const int*)perm;
int4* a_tmp_ptr = (int4*)a_tmp;
const int32_t* sorted_token_ids_ptr = (const int32_t*)sorted_token_ids;
const int32_t* expert_ids_ptr = (const int32_t*)expert_ids;
const int32_t* num_tokens_past_padded_ptr =
(const int32_t*)num_tokens_past_padded;
const float* topk_weights_ptr = (const float*)topk_weights;
int* locks = (int*)workspace;
if (has_act_order) {
// Permute A columns
auto kernel = permute_cols_kernel<8>;
if (moe_block_size == 8) {
} else if (moe_block_size == 16)
kernel = permute_cols_kernel<16>;
else if (moe_block_size == 32)
kernel = permute_cols_kernel<32>;
else if (moe_block_size == 48)
kernel = permute_cols_kernel<48>;
else if (moe_block_size == 64)
kernel = permute_cols_kernel<64>;
else
TORCH_CHECK(false, "unsupported moe_block_size ", moe_block_size);
// avoid ">>>" being formatted to "> > >"
// clang-format off
kernel<<<sms, default_threads, 0, stream>>>(
A_ptr, perm_ptr, a_tmp_ptr, sorted_token_ids_ptr, expert_ids_ptr,
num_tokens_past_padded_ptr, prob_m, prob_k, top_k);
// clang-format on
A_ptr = a_tmp_ptr;
prob_m = prob_m * top_k;
top_k = 1;
// If we have a full K, then we can run the non-act-order version of Marlin
// (since the weight rows are reordered by increasing group ids, and by
// having a full K, we have full original groups)
if (is_k_full) has_act_order = false;
}
int max_shared_mem = 0;
cudaDeviceGetAttribute(&max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
TORCH_CHECK(max_shared_mem > 0);
// Set thread config
exec_config_t exec_cfg;
thread_config_t thread_tfg;
if (thread_k != -1 && thread_n != -1) {
thread_tfg = thread_config_t{thread_k, thread_n, default_threads};
exec_cfg = exec_config_t{1, thread_tfg};
TORCH_CHECK(prob_n % thread_n == 0, "prob_n = ", prob_n,
" is not divisible by thread_n = ", thread_n);
TORCH_CHECK(prob_k % thread_k == 0, "prob_k = ", prob_k,
" is not divisible by thread_k = ", thread_k);
} else {
// Auto config
exec_cfg = determine_exec_config<scalar_t>(
q_type, prob_m, prob_n, prob_k, thread_m_blocks, m_block_size_8,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float,
max_shared_mem);
thread_tfg = exec_cfg.tb_cfg;
}
int num_threads = thread_tfg.num_threads;
thread_k = thread_tfg.thread_k;
thread_n = thread_tfg.thread_n;
int blocks = sms * exec_cfg.blocks_per_sm;
if (exec_cfg.blocks_per_sm > 1)
max_shared_mem = max_shared_mem / exec_cfg.blocks_per_sm - 1024;
int thread_k_blocks = thread_k / 16;
int thread_n_blocks = thread_n / 16;
TORCH_CHECK(is_valid_config(thread_tfg, thread_m_blocks, prob_m, prob_n,
prob_k, num_bits, group_size, has_act_order,
is_k_full, has_zp, is_zp_float, max_shared_mem),
"Invalid thread config: thread_m_blocks = ", thread_m_blocks,
", thread_k = ", thread_tfg.thread_k,
", thread_n = ", thread_tfg.thread_n,
", num_threads = ", thread_tfg.num_threads, " for MKN = [",
prob_m, ", ", prob_k, ", ", prob_n, "] and num_bits = ", num_bits,
", group_size = ", group_size,
", has_act_order = ", has_act_order, ", is_k_full = ", is_k_full,
", has_zp = ", has_zp, ", is_zp_float = ", is_zp_float,
", max_shared_mem = ", max_shared_mem);
auto kernel = get_marlin_kernel<scalar_t>(
q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks, m_block_size_8,
has_act_order, has_zp, group_blocks, num_threads, is_zp_float);
if (kernel == MarlinDefault) {
TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n,
", ", prob_k, "]", ", has_act_order = ", has_act_order,
", num_groups = ", num_groups, ", group_size = ", group_size,
", thread_m_blocks = ", thread_m_blocks,
", thread_n_blocks = ", thread_n_blocks,
", thread_k_blocks = ", thread_k_blocks,
", num_bits = ", num_bits);
}
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
max_shared_mem);
// avoid ">>>" being formatted to "> > >"
// clang-format off
kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,
sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce);
// clang-format on
}
} // namespace MARLIN_NAMESPACE_NAME
torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
torch::Tensor& sorted_token_ids, torch::Tensor& expert_ids,
torch::Tensor& num_tokens_past_padded, torch::Tensor& topk_weights,
int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep,
vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n,
int64_t size_k, bool is_k_full, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) {
vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
int pack_factor = 32 / b_q_type.size_bits();
if (moe_block_size != 8) {
TORCH_CHECK(moe_block_size % 16 == 0,
"unsupported moe_block_size=", moe_block_size);
TORCH_CHECK(moe_block_size >= 16 && moe_block_size <= 64,
"unsupported moe_block_size=", moe_block_size);
}
// Verify A
TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0),
", size_m = ", size_m);
TORCH_CHECK(a.size(1) == size_k, "Shape mismatch: a.size(1) = ", a.size(1),
", size_k = ", size_k);
// Verify B
TORCH_CHECK(
size_k % MARLIN_NAMESPACE_NAME::tile_size == 0, "size_k = ", size_k,
" is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
TORCH_CHECK((size_k / MARLIN_NAMESPACE_NAME::tile_size) == b_q_weight.size(1),
"Shape mismatch: b_q_weight.size(1) = ", b_q_weight.size(1),
", size_k = ", size_k,
", tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
TORCH_CHECK(
b_q_weight.size(2) % MARLIN_NAMESPACE_NAME::tile_size == 0,
"b_q_weight.size(2) = ", b_q_weight.size(2),
" is not divisible by tile_size = ", MARLIN_NAMESPACE_NAME::tile_size);
int actual_size_n =
(b_q_weight.size(2) / MARLIN_NAMESPACE_NAME::tile_size) * pack_factor;
TORCH_CHECK(size_n == actual_size_n, "size_n = ", size_n,
", actual_size_n = ", actual_size_n);
// Verify device and strides
TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
// thread_k: `k` size of a thread_tile in `weights` (can usually be left as
// auto -1)
int thread_k = -1;
// thread_n: `n` size of a thread_tile in `weights` (can usually be left as
// auto -1)
int thread_n = -1;
// sms: number of SMs to use for the kernel
int sms = -1;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, a.get_device());
// Alloc buffers
const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
torch::Tensor c;
if (c_or_none.has_value()) {
c = c_or_none.value();
TORCH_CHECK(c.device().is_cuda(), "c is not on GPU");
TORCH_CHECK(c.is_contiguous(), "c is not contiguous");
TORCH_CHECK(c.size(0) == size_m * top_k,
"Shape mismatch: c.size(0) = ", c.size(0),
", size_m * topk = ", size_m * top_k);
TORCH_CHECK(c.size(1) == size_n, "Shape mismatch: c.size(1) = ", c.size(1),
", size_n = ", size_n);
} else {
c = torch::empty({size_m * top_k, size_n}, options);
}
// Alloc C tmp buffer that is going to be used for the global reduce
torch::Tensor c_tmp;
auto options_fp32 =
torch::TensorOptions().dtype(at::kFloat).device(a.device());
if (use_fp32_reduce && !use_atomic_add) {
// max num of threadblocks is sms * 4
long max_c_tmp_size = min(
(long)size_n * sorted_token_ids.size(0),
(long)sms * 4 * moe_block_size * MARLIN_NAMESPACE_NAME::max_thread_n);
if (moe_block_size == 8) max_c_tmp_size *= 2;
c_tmp = torch::empty({max_c_tmp_size}, options_fp32);
} else {
c_tmp = torch::empty({0}, options_fp32);
}
// Detect groupsize and act_order
int num_groups = -1;
int group_size = -1;
int rank = b_scales.sizes().size();
TORCH_CHECK(rank == 3, "b_scales rank = ", rank, " is not 3");
TORCH_CHECK(b_scales.size(2) == size_n, "b_scales dim 2 = ", b_scales.size(2),
" is not size_n = ", size_n);
num_groups = b_scales.size(1);
torch::Tensor g_idx, perm, a_tmp;
if (g_idx_or_none.has_value() && perm_or_none.has_value()) {
g_idx = g_idx_or_none.value();
perm = perm_or_none.value();
TORCH_CHECK(g_idx.device().is_cuda(), "g_idx is not on GPU");
TORCH_CHECK(g_idx.is_contiguous(), "g_idx is not contiguous");
TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
// Verify g_idx and perm
TORCH_CHECK((g_idx.size(-1) == 0 && perm.size(-1) == 0) ||
(g_idx.size(-1) == size_k && perm.size(-1) == size_k),
"Unexpected g_idx.size(-1) = ", g_idx.size(-1),
" and perm.size(-1) = ", perm.size(-1),
", where size_k = ", size_k);
} else {
g_idx = torch::empty({0}, options);
perm = torch::empty({0}, options);
a_tmp = torch::empty({0}, options);
}
bool has_act_order = g_idx.size(-1) > 0 && perm.size(-1) > 0;
if (has_act_order) {
a_tmp = torch::empty({size_m * top_k, size_k}, options);
if (is_k_full) {
TORCH_CHECK(num_groups > 1, "For act_order, num_groups must be > 1");
TORCH_CHECK(size_k % num_groups == 0, "size_k = ", size_k,
", is not divisible by num_groups = ", num_groups);
group_size = size_k / num_groups;
} else {
group_size = 0;
}
} else {
a_tmp = torch::empty({0}, options);
if (num_groups > 1) {
TORCH_CHECK(
size_k % num_groups == 0, "size_k = ", size_k,
", is not divisible by b_scales.size(1) = ", b_scales.size(1));
group_size = size_k / num_groups;
} else {
group_size = -1;
}
}
torch::Tensor b_zeros;
if (b_zeros_or_none.has_value()) {
b_zeros = b_zeros_or_none.value();
TORCH_CHECK(b_zeros.device().is_cuda(), "b_zeros is not on GPU");
TORCH_CHECK(b_zeros.is_contiguous(), "b_zeros is not contiguous");
} else {
b_zeros = torch::empty({0}, options);
}
bool has_zp = b_zeros.size(-1) > 0;
if (has_zp) {
TORCH_CHECK(
b_q_type == vllm::kU4,
"b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str());
} else {
TORCH_CHECK(
b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128,
"b_q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ",
b_q_type.str());
}
if (has_zp && is_zp_float) {
TORCH_CHECK(a.scalar_type() == at::ScalarType::Half,
"Computation type must be float16 (half) when using float zero "
"points.");
}
// Verify b_zeros
if (has_zp) {
int rank = b_zeros.sizes().size();
TORCH_CHECK(rank == 3, "b_zeros rank = ", rank, " is not 3");
if (is_zp_float) {
TORCH_CHECK(b_zeros.size(2) == size_n,
"b_zeros dim 2 = ", b_zeros.size(2),
" is not size_n = ", size_n);
TORCH_CHECK(num_groups == b_zeros.size(1),
"b_zeros dim 1 = ", b_zeros.size(1),
" is not num_groups = ", num_groups);
TORCH_CHECK(num_groups != -1, "num_groups must be != -1");
} else {
TORCH_CHECK(b_zeros.size(1) == num_groups,
"b_zeros dim 1 = ", b_zeros.size(1),
" is not num_groups = ", num_groups);
TORCH_CHECK(b_zeros.size(2) == size_n / pack_factor,
"b_zeros dim 2 = ", b_zeros.size(2),
" is not size_n / pack_factor = ", size_n / pack_factor);
}
}
// Verify workspace size
TORCH_CHECK(size_n % MARLIN_NAMESPACE_NAME::min_thread_n == 0,
"size_n = ", size_n, ", is not divisible by min_thread_n = ",
MARLIN_NAMESPACE_NAME::min_thread_n);
int max_n_tiles = size_n / MARLIN_NAMESPACE_NAME::min_thread_n;
int min_workspace_size = min(
max_n_tiles * (int)(sorted_token_ids.size(0) / moe_block_size), sms * 4);
TORCH_CHECK(workspace.numel() >= min_workspace_size,
"workspace.numel = ", workspace.numel(),
" is below min_workspace_size = ", min_workspace_size);
int dev = a.get_device();
if (a.scalar_type() == at::ScalarType::Half) {
MARLIN_NAMESPACE_NAME::marlin_mm<half>(
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
c_tmp.data_ptr<float>(), b_scales.data_ptr<at::Half>(),
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
a_tmp.data_ptr<at::Half>(), sorted_token_ids.data_ptr(),
expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep,
size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order,
is_k_full, has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
MARLIN_NAMESPACE_NAME::marlin_mm<nv_bfloat16>(
a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
b_scales.data_ptr<at::BFloat16>(), b_zeros.data_ptr(), g_idx.data_ptr(),
perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float);
} else {
TORCH_CHECK(false,
"moe_wna16_marlin_gemm only supports bfloat16 and float16");
}
return c;
}
#endif
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("moe_wna16_marlin_gemm", &moe_wna16_marlin_gemm);
}
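
For reference, the shape checks in the wrapper above pin down the packed layout of `b_q_weight`. The following Python sketch (illustrative only, not part of this change; the helper name is made up) restates those relations, assuming the Marlin `tile_size` of 16 and `pack_factor = 32 / size_bits` used in the wrapper:

# Minimal sketch of the shape relations checked by moe_wna16_marlin_gemm above.
# Assumptions: tile_size = 16 (the Marlin tile size), pack_factor = 32 // num_bits.
# The function name and structure are illustrative, not part of vLLM's API.

def infer_marlin_shapes(b_q_weight_shape, num_bits, tile_size=16):
    """Recover (size_k, size_n) from a packed Marlin MoE weight shape.

    Assumed layout: (num_experts, size_k // tile_size,
                     size_n * tile_size // pack_factor)
    """
    pack_factor = 32 // num_bits
    num_experts, dim1, dim2 = b_q_weight_shape
    size_k = dim1 * tile_size                 # size_k / tile_size == b_q_weight.size(1)
    size_n = dim2 // tile_size * pack_factor  # actual_size_n in the wrapper above
    return size_k, size_n


# Example: a 4-bit weight (pack_factor = 8) for 8 experts with k=4096, n=14336
shape = (8, 4096 // 16, 14336 * 16 // 8)
assert infer_marlin_shapes(shape, num_bits=4) == (4096, 14336)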

View File

@ -13,7 +13,6 @@
 template <typename scalar_t, int bit, int GROUPS>
 __global__ void moe_wna16_gemm_kernel(
     const scalar_t* __restrict__ input, scalar_t* __restrict__ output,
     const uint32_t* __restrict__ qweight, const scalar_t* __restrict__ scales,
     const uint32_t* __restrict__ qzeros,

@ -54,8 +53,6 @@ __global__ void moe_wna16_gemm_kernel(
       if (token_index / top_k >= size_m) break;
       num_valid_tokens = m + 1;
-      if (blockIdx.z == 0 && offset_n < size_n)
-        output[token_index * size_n + offset_n] = Dtype::int2num(0);
       if (expert_id != -1) {
         int k_per_thread = DIVIDE(BLOCK_SIZE_K, BLOCK_SIZE_N);

@ -284,8 +281,7 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
                              int64_t BLOCK_SIZE_K, int64_t bit) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
-  auto options =
-      torch::TensorOptions().dtype(input.dtype()).device(input.device());
+  output.zero_();
   const int num_experts = b_qweight.size(0);
   const int size_m = input.size(0);

@ -302,9 +298,9 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
   const uint32_t* b_qzeros_ptr;
   if (b_qzeros.has_value())
     b_qzeros_ptr = (const uint32_t*)b_qzeros.value().data_ptr<uint8_t>();
-  const float* topk_weights_ptr;
+  const float* topk_weights_ptr = nullptr;
   if (topk_weights.has_value())
-    topk_weights_ptr = (const float*)topk_weights.value().data_ptr();
+    topk_weights_ptr = (const float*)topk_weights.value().data_ptr<float>();
   int groups_per_block_row = BLOCK_SIZE_K / group_size;
   TORCH_CHECK(bit == 4 || bit == 8, "bit must be 4 or 8");

View File

@ -43,14 +43,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm); m.impl("moe_wna16_gemm", torch::kCUDA, &moe_wna16_gemm);
m.def( m.def(
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none,"
"Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " "Tensor! b_q_weight, Tensor! b_scales, Tensor? b_zeros_or_none,"
"b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, " "Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace,"
"int b_q_type, SymInt size_m, " "Tensor sorted_token_ids,"
"SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int " "Tensor! expert_ids, Tensor! num_tokens_past_padded,"
"topk, " "Tensor! topk_weights, int moe_block_size, int top_k, "
"int moe_block_size, bool replicate_input, bool apply_weights)" "bool mul_topk_weights, bool is_ep, int b_q_type_id,"
" -> Tensor"); "int size_m, int size_n, int size_k,"
"bool is_full_k, bool use_atomic_add,"
"bool use_fp32_reduce, bool is_zp_float) -> Tensor");
// conditionally compiled so impl registration is in source file // conditionally compiled so impl registration is in source file
#endif #endif

View File

@ -52,6 +52,15 @@ void paged_attention_v2(
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
     const int64_t blocksparse_head_sliding_step);

+#ifndef USE_ROCM
+void merge_attn_states(torch::Tensor& output,
+                       std::optional<torch::Tensor> output_lse,
+                       const torch::Tensor& prefix_output,
+                       const torch::Tensor& prefix_lse,
+                       const torch::Tensor& suffix_output,
+                       const torch::Tensor& suffix_lse);
+#endif

 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
               double epsilon);

View File

@ -46,14 +46,26 @@ __global__ void compute_expert_offsets(
 }

 __global__ void compute_arg_sorts(const int* __restrict__ topk_ids,
+                                  const int32_t* __restrict__ expert_offsets,
                                   int32_t* input_permutation,
                                   int32_t* output_permutation,
                                   int32_t* atomic_buffer, const int topk_length,
                                   const int topk) {
-  int expert_id = blockIdx.x;
+  int const blk_expert_id = blockIdx.x;
+  int const num_experts = gridDim.x;
+  int32_t const num_tokens = expert_offsets[num_experts];

   for (int i = threadIdx.x; i < topk_length; i += THREADS_PER_EXPERT) {
-    if (topk_ids[i] == expert_id) {
+    int const expert_id = topk_ids[i];
+    if (expert_id == -1 && blockIdx.x == 0) {
+      // output_permutation is used to re-order the moe outputs. It is
+      // used as c2 = c2[c_map], where c2 is a torch.tensor that is the
+      // output of the cutlass kernels and c_map is the output_permutation.
+      // c2 is initialized to zeros, therefore by setting the output_permutation
+      // to num_tokens, we are guaranteed to fill the moe outputs to zero
+      // for "invalid" topk_ids.
+      output_permutation[i] = num_tokens;
+    } else if (expert_id == blk_expert_id) {
       int start = atomicAdd(&atomic_buffer[expert_id], 1);
       input_permutation[start] = i / topk;
       output_permutation[i] = start;

@ -83,6 +95,7 @@ void get_cutlass_moe_mm_data_caller(
       static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
       static_cast<const int32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(expert_offsets.data_ptr()),
       static_cast<int32_t*>(input_permutation.data_ptr()),
       static_cast<int32_t*>(output_permutation.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(),
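
The comment in the hunk above describes how "invalid" topk entries are zeroed through the gather index. A minimal Python sketch of that trick (illustrative only, not vLLM code; it assumes c2 has one zero-initialized row per topk entry and that only the first num_tokens rows are written by the grouped GEMM):

# Illustrative sketch of the zero-fill trick described above: invalid topk
# entries (expert_id == -1) have their gather index set to num_tokens, a row
# of c2 that the grouped GEMM never writes, so it stays zero and the gathered
# output for those entries is zero.
import torch

topk_ids = torch.tensor([0, 1, -1, 0])    # one "invalid" entry
hidden = 4
num_tokens = int((topk_ids != -1).sum())  # plays the role of expert_offsets[num_experts]

# c2 has one row per topk entry and is zero-initialized; only the first
# num_tokens rows are filled by the (simulated) grouped GEMM.
c2 = torch.zeros(len(topk_ids), hidden)
c2[:num_tokens] = torch.randn(num_tokens, hidden)

# c_map plays the role of output_permutation: valid entries get 0..num_tokens-1,
# the invalid entry points at row num_tokens, which is still all zeros.
c_map = torch.tensor([0, 1, num_tokens, 2])

out = c2[c_map]
assert torch.all(out[2] == 0)  # the invalid entry is zeroed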

View File

@ -129,7 +129,7 @@ static __device__ __forceinline__ void moe_q(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q4_0 64
+#define MOE_X_Q4_0 8
 #define MOE_Y_Q4_0 128
 #define NWARPS_Q4_0 8
 #else

@ -190,7 +190,7 @@ static void ggml_moe_q4_0_q8_1_cuda(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q4_1 64
+#define MOE_X_Q4_1 8
 #define MOE_Y_Q4_1 128
 #define NWARPS_Q4_1 8
 #else

@ -251,7 +251,7 @@ static void ggml_moe_q4_1_q8_1_cuda(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q5_0 64
+#define MOE_X_Q5_0 8
 #define MOE_Y_Q5_0 128
 #define NWARPS_Q5_0 8
 #else

@ -312,7 +312,7 @@ static void ggml_moe_q5_0_q8_1_cuda(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q5_1 64
+#define MOE_X_Q5_1 8
 #define MOE_Y_Q5_1 128
 #define NWARPS_Q5_1 8
 #else

@ -373,7 +373,7 @@ static void ggml_moe_q5_1_q8_1_cuda(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q8_0 64
+#define MOE_X_Q8_0 8
 #define MOE_Y_Q8_0 128
 #define NWARPS_Q8_0 8
 #else

@ -434,7 +434,7 @@ static void ggml_moe_q8_0_q8_1_cuda(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q2_K 64
+#define MOE_X_Q2_K 8
 #define MOE_Y_Q2_K 128
 #define NWARPS_Q2_K 8
 #else

@ -495,7 +495,7 @@ static void ggml_moe_q2_K_q8_1_cuda(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q3_K 64
+#define MOE_X_Q3_K 8
 #define MOE_Y_Q3_K 128
 #define NWARPS_Q3_K 8
 #else

@ -556,7 +556,7 @@ static void ggml_moe_q3_K_q8_1_cuda(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q4_K 64
+#define MOE_X_Q4_K 8
 #define MOE_Y_Q4_K 128
 #define NWARPS_Q4_K 8
 #else

@ -617,7 +617,7 @@ static void ggml_moe_q4_K_q8_1_cuda(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q5_K 64
+#define MOE_X_Q5_K 8
 #define MOE_Y_Q5_K 128
 #define NWARPS_Q5_K 8
 #else

@ -678,7 +678,7 @@ static void ggml_moe_q5_K_q8_1_cuda(
 }
 #if defined(USE_ROCM)
-#define MOE_X_Q6_K 64
+#define MOE_X_Q6_K 8
 #define MOE_Y_Q6_K 128
 #define NWARPS_Q6_K 8
 #else

View File

@ -1785,7 +1785,7 @@ __global__ void Marlin(
       <<<blocks, NUM_THREADS, max_shared_mem, stream>>>( \
           A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \
           num_groups, prob_m, prob_n, prob_k, lda, locks, \
-          use_atomic_add, use_fp32_reduce); \
+          part_use_atomic_add, use_fp32_reduce); \
     } \
   }

@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
     thread_m_blocks = exec_cfg.max_m_blocks;
   }

+  // atomic add reduce have better performance only when m * n is small
+  bool part_use_atomic_add =
+      use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
+
   if (false) {
   }
   GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)
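
The new `part_use_atomic_add` flag keeps the atomic-add reduction only for small outputs. A small sketch of the same condition with a couple of worked shapes (illustrative only, not vLLM code):

# Worked example of the heuristic added above: the atomic-add reduction is
# only kept when ceil(prob_m / 64) * prob_n <= 2048, i.e. when the output
# footprint is small.
def div_ceil(a, b):
    return (a + b - 1) // b

def part_use_atomic_add(use_atomic_add, prob_m, prob_n):
    return use_atomic_add and div_ceil(prob_m, 64) * prob_n <= 2048

assert part_use_atomic_add(True, 16, 2048) is True    # 1 * 2048 <= 2048
assert part_use_atomic_add(True, 128, 2048) is False  # 2 * 2048 > 2048
assert part_use_atomic_add(False, 16, 128) is False   # caller disabled it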

View File

@ -9,7 +9,11 @@
 #include <cuda_runtime.h>
 #include <iostream>

-namespace marlin {
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+namespace MARLIN_NAMESPACE_NAME {

 // Marlin params

@ -23,6 +27,7 @@ static constexpr int pipe_stages =
 static constexpr int min_thread_n = 64;
 static constexpr int min_thread_k = 64;

+static constexpr int max_thread_n = 256;
 static constexpr int tile_size = 16;
 static constexpr int max_par = 16;

@ -84,4 +89,4 @@ __device__ inline void cp_async_wait() {
 #endif

-}  // namespace marlin
+}  // namespace MARLIN_NAMESPACE_NAME

View File

@ -5,7 +5,11 @@
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>

-namespace marlin {
+#ifndef MARLIN_NAMESPACE_NAME
+#define MARLIN_NAMESPACE_NAME marlin
+#endif
+
+namespace MARLIN_NAMESPACE_NAME {

 template <typename scalar_t>
 class ScalarType {};

@ -54,7 +58,7 @@ class ScalarType<nv_bfloat16> {
   using FragS = Vec<nv_bfloat162, 1>;
   using FragZP = Vec<nv_bfloat162, 4>;

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
   static __device__ float inline num2float(const nv_bfloat16 x) {
     return __bfloat162float(x);
   }

@ -74,6 +78,6 @@ class ScalarType<nv_bfloat16> {
 #endif
 };

-}  // namespace marlin
+}  // namespace MARLIN_NAMESPACE_NAME

 #endif

View File

@ -2,6 +2,15 @@
 #include <torch/all.h>

+torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
+                    const int64_t rows_per_block);
+
+torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b,
+                       const int64_t CuCount);
+
+void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c,
+               at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount);
+
 void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums,
                      torch::Tensor& max_logits, torch::Tensor& tmp_out,
                      torch::Tensor& query, torch::Tensor& key_cache,

csrc/rocm/skinny_gemms.cu (new file, 1600 lines): diff suppressed because it is too large.

View File

@ -14,6 +14,24 @@
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
   // vLLM custom ops for rocm

+  // Custom gemm op for matrix-vector multiplication
+  rocm_ops.def(
+      "LLMM1(Tensor in_a, Tensor in_b, int rows_per_block) -> "
+      "Tensor");
+  rocm_ops.impl("LLMM1", torch::kCUDA, &LLMM1);
+
+  // Custom gemm op for skinny matrix-matrix multiplication
+  rocm_ops.def(
+      "wvSplitK(Tensor in_a, Tensor in_b, int CuCount) -> "
+      "Tensor");
+  rocm_ops.impl("wvSplitK", torch::kCUDA, &wvSplitK);
+
+  // wvSplitK for fp8
+  rocm_ops.def(
+      "wvSplitKQ(Tensor in_a, Tensor in_b, Tensor! out_c, Tensor scale_a, "
+      " Tensor scale_b, int CuCount) -> ()");
+  rocm_ops.impl("wvSplitKQ", torch::kCUDA, &wvSplitKQ);
+
   // Custom attention op
   // Compute the attention between an input query and the cached
   // keys/values using PagedAttention.

View File

@ -64,6 +64,21 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int blocksparse_head_sliding_step) -> ()"); " int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2); ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
#ifndef USE_ROCM
// Merge attn states
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case)
ops.def(
"merge_attn_states("
" Tensor! output,"
" Tensor!? output_lse,"
" Tensor prefix_output,"
" Tensor prefix_lse,"
" Tensor suffix_output,"
" Tensor suffix_lse) -> ()");
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
#endif
// Activation ops // Activation ops
// Activation function used in SwiGLU. // Activation function used in SwiGLU.
ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");

View File

@ -240,6 +240,8 @@ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
     uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
 fi
 COPY examples examples
+COPY benchmarks benchmarks
+COPY ./vllm/collect_env.py .

 # Although we build Flashinfer with AOT mode, there's still
 # some issues w.r.t. JIT compilation. Therefore we need to

View File

@ -18,6 +18,8 @@ WORKDIR /workspace/
 ARG PYTHON_VERSION=3.12
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"

+ENV LD_PRELOAD=""
+
 # Install minimal dependencies and uv
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
     --mount=type=cache,target=/var/lib/apt,sharing=locked \

@ -32,6 +34,7 @@ ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
 ENV PATH="/root/.local/bin:$PATH"
 ENV VIRTUAL_ENV="/opt/venv"
+ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python
 RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"

@ -118,6 +121,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 ADD ./tests/ ./tests/
 ADD ./examples/ ./examples/
 ADD ./benchmarks/ ./benchmarks/
+ADD ./vllm/collect_env.py .

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \

View File

@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.19.1/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest

 COPY ./ /workspace/vllm

View File

@ -1,6 +1,6 @@
 # default base image
 # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"

 FROM $BASE_IMAGE

@ -21,9 +21,9 @@ VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}/vllm
 RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
-RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
-RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
+RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
+RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install pytest

 # uninstall transformers-neuronx package explicitly to avoid version conflict

View File

@ -0,0 +1,307 @@
# The vLLM Dockerfile is used to construct vLLM image against torch nightly that can be directly used for testing
# for torch nightly, cuda >=12.6 is required,
# use 12.8 due to FlashAttention issue with cuda 12.6 (https://github.com/vllm-project/vllm/issues/15435#issuecomment-2775924628)
ARG CUDA_VERSION=12.8.0
#
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.8.0
ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version \
&& python3 -m pip --version
# Install uv for faster pip installs
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF
gcc --version
EOF
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace
# install build and runtime dependencies
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
# install build and runtime dependencies without stable torch version
RUN python3 use_existing_torch.py
# install torch nightly
ARG PINNED_TORCH_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
if [ -n "$PINNED_TORCH_VERSION" ]; then \
pkgs="$PINNED_TORCH_VERSION"; \
else \
pkgs="torch torchaudio torchvision"; \
fi && \
uv pip install --system $pkgs --index-url https://download.pytorch.org/whl/nightly/cu128
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system numba==0.61.2
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
# must be set before installing xformers, so it can install the correct version of xformers.
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build xformers with cuda and torch nightly
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
# todo(elainewy): cache xformers build result for faster build
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo 'git clone xformers...' \
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
&& cd xformers \
&& git checkout ${XFORMERS_COMMIT} \
&& git submodule update --init --recursive \
&& echo 'finish git clone xformers...' \
&& rm -rf build \
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
&& cd .. \
&& rm -rf xformers
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system xformers-dist/*.whl --verbose
# build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
RUN cat torch_build_versions.txt
# cuda arch list used by torch
# can be useful for `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
# Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
#################### BASE BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
COPY . .
RUN python3 use_existing_torch.py
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
# Max jobs used by Ninja to build extensions
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG nvcc_threads=2
ENV NVCC_THREADS=$nvcc_threads
ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
&& tar -xzf sccache.tar.gz \
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
&& export SCCACHE_IDLE_TIMEOUT=0 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
#################### WHEEL BUILD IMAGE ####################
################### VLLM INSTALLED IMAGE ####################
# Setup clean environment for vLLM and its dependencies for test and api server using ubuntu22.04 with AOT flashinfer
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
# prepare for environment starts
ARG CUDA_VERSION=12.8.0
ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
RUN --mount=type=cache,target=/root/.cache/uv \
python3 -m pip install uv
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# get the nightly torch version used in the build to make sure the version is the same
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128
# install the vllm wheel
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm-dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system vllm-dist/*.whl --verbose
# install xformers again for the new environment
RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
# install package for build flashinfer
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.post1
# build flashinfer for torch nightly from source around 10 mins
# release version: v0.2.2.post1
# todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo "git clone flashinfer..." \
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \
&& git checkout v0.2.2.post1 \
&& git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \
&& rm -rf build \
&& export TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} \
&& FLASHINFER_ENABLE_AOT=1 python3 setup.py bdist_wheel --dist-dir=../flashinfer-dist --verbose \
&& cd .. \
&& rm -rf flashinfer
# install flashinfer
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system flashinfer-dist/*.whl --verbose
# install common packages
COPY requirements/common.txt requirements/common.txt
COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
RUN python3 use_existing_torch.py
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt
################### VLLM INSTALLED IMAGE ####################
#################### UNITTEST IMAGE #############################
FROM vllm-base as test
COPY tests/ tests/
# install build and runtime dependencies without stable torch version
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt
#################### UNITTEST IMAGE #############################

View File

@ -126,13 +126,16 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 FROM base-builder AS cv-builder

 ARG MAX_JOBS
-ARG OPENCV_VERSION=84
+ARG OPENCV_VERSION=86
+# patch for version 4.11.0.86
+ARG OPENCV_PATCH=97f3f39
 ARG ENABLE_HEADLESS=1
 RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable && \
     git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
     cd opencv-python && \
-    sed -i 's/"setuptools==59.2.0",/"setuptools<70.0",/g' pyproject.toml && \
+    sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
+    cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
     python -m build --wheel --installer=uv --outdir /opencvwheels/

 ###############################################################

@ -148,9 +151,15 @@ COPY --from=arrow-builder /tmp/control /dev/null
 COPY --from=cv-builder /tmp/control /dev/null

 ARG VLLM_TARGET_DEVICE=cpu
+ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1

 # this step installs vllm and populates uv cache
 # with all the transitive dependencies
+RUN --mount=type=cache,target=/root/.cache/uv \
+    source /opt/rh/gcc-toolset-13/enable && \
+    git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
+    uv pip install maturin && \
+    uv build --wheel --out-dir /hf_wheels/
+
 RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
     --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \

@ -159,7 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     source /opt/rh/gcc-toolset-13/enable && \
     uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
     sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
-    uv pip install pandas pythran pybind11 && \
+    uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
     # sentencepiece.pc is in some pkgconfig inside uv cache
     export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
     uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \

@ -247,8 +256,9 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
     --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
     --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
+    --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
     --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
-    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /vllmwheel/*.whl
+    HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl

 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm

View File

@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="8970b25b"
+ARG AITER_BRANCH="7e1ed08"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"

 FROM ${BASE_IMAGE} AS base

View File

@ -58,7 +58,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     cd ../../python && \
     export PYARROW_PARALLEL=4 && \
     export ARROW_BUILD_TYPE=release && \
-    uv pip install -r requirements/build.txt && \
+    uv pip install -r requirements-build.txt && \
     python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel

 FROM python-install AS numa-build

@ -96,6 +96,22 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
     python setup.py bdist_wheel

+FROM python-install AS hf-xet-builder
+# Install hf-xet
+WORKDIR /tmp
+ENV CARGO_HOME=/root/.cargo
+ENV RUSTUP_HOME=/root/.rustup
+ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
+    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
+    git clone https://github.com/huggingface/xet-core.git && \
+    cd xet-core/hf_xet/ && \
+    uv pip install maturin patchelf && \
+    python -m maturin build --release --out dist && \
+    mkdir -p /tmp/hf-xet/dist && \
+    cp dist/*.whl /tmp/hf-xet/dist/
+
 # Final build stage
 FROM python-install AS vllm-cpu
 ARG PYTHON_VERSION

@ -120,12 +136,15 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
     --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
     --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
+    --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
     sed -i '/^torch/d' requirements/build.txt && \
     ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
     VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
+    HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
     uv pip install -v \
         $ARROW_WHL_FILE \
         $VISION_WHL_FILE \
+        $HF_XET_WHL_FILE \
         --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
         --index-strategy unsafe-best-match \
         -r requirements/build.txt \

@ -149,4 +168,5 @@ USER 2000
 WORKDIR /home/vllm

 # Set the default entrypoint
 ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

Binary image file (68 KiB) changed; content not shown.

View File

@ -4,6 +4,7 @@
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

+- [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day), April 3rd 2025. [[Slides]](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama), March 27th 2025. [[Slides]](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [The first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg), March 16th 2025. [[Slides]](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
 - [The East Coast vLLM Meetup](https://lu.ma/7mu4k4xx), March 11th 2025. [[Slides]](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0)

View File

@ -79,6 +79,17 @@ Further update the model as follows:
    return inputs_embeds
  ```
- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model.
```python
class YourModelForImage2Seq(nn.Module):
...
def get_language_model(self) -> torch.nn.Module:
# Change `language_model` according to your implementation.
return self.language_model
```
- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.

  ```diff
@ -110,17 +121,19 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None, "video": 1} return {"image": None, "video": 1}
``` ```
-### Maximum number of placeholder feature tokens
-
-Also, override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`
-to return the maximum number of placeholder feature tokens per input item for each modality.
-
-When calling the model, the output embeddings from the visual encoder are assigned to the input positions
-containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal
-to the size of the output embeddings.
-
-:::::{tab-set}
-::::{tab-item} Basic example: LLaVA
+## 3. Specify dummy inputs
+
+Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
+HF processing as well as memory profiling.
+
+### For memory profiling
+
+Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
+
+Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.
+
+::::{tab-set}
+:::{tab-item} Basic example: LLaVA
 :sync: llava

 Looking at the code of HF's `LlavaForConditionalGeneration`:
@ -229,41 +242,50 @@ def get_num_image_tokens(
 ```

 Notice that the number of image tokens doesn't depend on the image width and height.

-So, we can calculate the maximum number of image tokens using any image size:
+We can simply use a dummy `image_size` to calculate the multimodal profiling data:

 ```python
+# NOTE: In actuality, this is usually implemented as part of the
+# model's subclass of `BaseProcessingInfo`, but we show it as is
+# here for simplicity.
 def get_image_size_with_most_features(self) -> ImageSize:
     hf_config = self.get_hf_config()
     width = height = hf_config.image_size
     return ImageSize(width=width, height=height)

-def get_max_image_tokens(self) -> int:
-    target_width, target_height = self.get_image_size_with_most_features()
-
-    return self.get_num_image_tokens(
-        image_width=target_width,
-        image_height=target_height,
-    )
-```
-
-And thus, we can override the method as:
-
-```python
-def get_mm_max_tokens_per_item(
+def get_dummy_mm_data(
     self,
     seq_len: int,
     mm_counts: Mapping[str, int],
-) -> Mapping[str, int]:
-    return {"image": self.get_max_image_tokens()}
+) -> MultiModalDataDict:
+    num_images = mm_counts.get("image", 0)
+
+    target_width, target_height = \
+        self.info.get_image_size_with_most_features()
+
+    return {
+        "image":
+        self._get_dummy_images(width=target_width,
+                               height=target_height,
+                               num_images=num_images)
+    }
+```
+
+For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
+
+```python
+def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+    num_images = mm_counts.get("image", 0)
+
+    processor = self.info.get_hf_processor()
+    image_token = processor.image_token
+
+    return image_token * num_images
 ```
:::{note}
Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP.
 :::

-::::
-::::{tab-item} Non-consecutive feature tokens: Fuyu
+:::{tab-item} No input placeholders: Fuyu
 :sync: fuyu

 Looking at the code of HF's `FuyuForCausalLM`:
@ -383,213 +405,42 @@ num_patches_per_dim_w = image_width // patch_width
 num_patches = num_patches_per_dim_h * num_patches_per_dim_w
 ```

-We can calculate this in vLLM using this code:
+These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
+to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
```python
def get_num_image_patches(
self,
*,
image_width: int,
image_height: int,
) -> int:
image_processor = self.get_image_processor()
target_width = image_processor.size["width"]
target_height = image_processor.size["height"]
patch_width = image_processor.patch_size["width"]
patch_height = image_processor.patch_size["height"]
if not (image_width <= target_width and image_height <= target_height):
height_scale_factor = target_height / image_height
width_scale_factor = target_width / image_width
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
image_height = int(image_height * optimal_scale_factor)
image_width = int(image_width * optimal_scale_factor)
ncols = math.ceil(image_width / patch_width)
nrows = math.ceil(image_height / patch_height)
return ncols * nrows
```
These image patches correspond to placeholder tokens (`|SPEAKER|`). However, the processor also
inserts newline tokens (`|NEWLINE|`) as shown here:
```python
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L654-L670
tensor_of_image_ids = torch.full(
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
)
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
assert num_patches == patches.shape[0]
if variable_sized:
# Now terminate each line with |NEWLINE|.
tensor_of_image_ids = tensor_of_image_ids.reshape(-1, image_width // patch_width)
newline_ids = torch.full(
[tensor_of_image_ids.shape[0], 1],
image_newline_id,
dtype=torch.int32,
device=image_input.device,
)
tensor_of_image_ids = torch.cat([tensor_of_image_ids, newline_ids], dim=1)
tensor_of_image_ids = tensor_of_image_ids.reshape(-1)
```
So, the layout of tokens for an image is:
```
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
...
|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
```
This makes the placeholder tokens non-consecutive in the prompt.
Since vLLM requires the feature tokens to be consecutive, **we also treat the newline tokens as feature tokens**.
So overall, the total number of feature tokens is
```python
def get_num_image_tokens(
self,
*,
image_width: int,
image_height: int,
) -> int:
image_processor = self.get_image_processor()
target_width = image_processor.size["width"]
target_height = image_processor.size["height"]
patch_width = image_processor.patch_size["width"]
patch_height = image_processor.patch_size["height"]
if not (image_width <= target_width and image_height <= target_height):
height_scale_factor = target_height / image_height
width_scale_factor = target_width / image_width
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
image_height = int(image_height * optimal_scale_factor)
image_width = int(image_width * optimal_scale_factor)
ncols = math.ceil(image_width / patch_width)
nrows = math.ceil(image_height / patch_height)
return (ncols + 1) * nrows
```
To calculate the maximum number of image tokens, recall that input images are first resized
to fit within `image_processor.size`. The maximum possible dimensions of the image before
being converted into patches is therefore equal to `image_processor.size`.
```python
def get_image_size_with_most_features(self) -> ImageSize:
    image_processor = self.get_image_processor()
    return ImageSize(width=image_processor.size["width"],
                     height=image_processor.size["height"])
def get_max_image_tokens(self) -> int:
target_width, target_height = self.get_image_size_with_most_features()
return self.get_num_image_tokens(
image_width=target_width,
image_height=target_height,
)
 ```

-And thus, we can override the method as:
-
-```python
-def get_mm_max_tokens_per_item(
-    self,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-) -> Mapping[str, int]:
-    return {"image": self.get_max_image_tokens()}
+Fuyu does not expect image placeholders in the inputs to HF processor, so
+the dummy prompt text is empty regardless of the number of images.
+
+```python
+def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+    return ""
+```
+
+For the multimodal image profiling data, the logic is very similar to LLaVA:
+
+```python
+def get_dummy_mm_data(
+    self,
+    seq_len: int,
+    mm_counts: Mapping[str, int],
+) -> MultiModalDataDict:
+    target_width, target_height = \
+        self.info.get_image_size_with_most_features()
:::{note}
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) returns `ncols` and `nrows` directly instead of the total token count.
This is because `ncols` and `nrows` are used to specify the layout of the feature tokens (as shown in Step 4 of this guide).
:::
::::
:::::
## 3. Specify dummy inputs
Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
HF processing as well as memory profiling.
### For memory profiling
Override the abstract method {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_processor_inputs`
to construct dummy inputs for memory profiling. This dummy input should result in the worst-case memory usage of
the model so that vLLM can reserve the correct amount of memory for it.
Assuming that the memory usage increases with the number of tokens, the dummy input can be constructed based
on the code for {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_mm_max_tokens_per_item`.
::::{tab-set}
:::{tab-item} Basic example: LLaVA
:sync: llava
Making use of the `get_image_size_with_most_features` method implemented in Step 2:
```python
def get_dummy_processor_inputs(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> ProcessorInputs:
    num_images = mm_counts.get("image", 0)

    processor = self.info.get_hf_processor()
    image_token = processor.image_token

    hf_config = self.get_hf_config()
    target_width, target_height = self.info.get_image_size_with_most_features()

    mm_data = {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }

    return ProcessorInputs(
        prompt_text=image_token * num_images,
        mm_data=mm_data,
    )
```
:::
:::{tab-item} No input placeholders: Fuyu
:sync: fuyu
Fuyu does not expect image placeholders in the inputs to HF processor, so
the dummy prompt text is empty regardless of the number of images.
Otherwise, the logic of this method is very similar to LLaVA:
```python
def get_dummy_processor_inputs(
    self,
    seq_len: int,
    mm_counts: Mapping[str, int],
) -> ProcessorInputs:
    target_width, target_height = \
        self.info.get_image_size_with_most_features()
    num_images = mm_counts.get("image", 0)

    mm_data = {
        "image":
        self._get_dummy_images(width=target_width,
                               height=target_height,
                               num_images=num_images)
    }

    return ProcessorInputs(
        prompt_text="",
        mm_data=mm_data,
    )
```
:::
@@ -860,8 +711,8 @@ prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
)
```
To assign the vision embeddings to only the image tokens, instead of a string
you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:
```python
hf_config = self.info.get_hf_config()
@@ -879,9 +730,9 @@ def get_replacement_fuyu(item_idx: int):
        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                        [_NEWLINE_TOKEN_ID]) * nrows

        return PromptUpdateDetails.select_token_id(
            image_tokens + [bos_token_id],
            embed_token_id=_IMAGE_TOKEN_ID,
        )
```
@@ -914,9 +765,9 @@ def _get_prompt_updates(
        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
                        [_NEWLINE_TOKEN_ID]) * nrows

        return PromptUpdateDetails.select_token_id(
            image_tokens + [bos_token_id],
            embed_token_id=_IMAGE_TOKEN_ID,
        )

        return [
@@ -19,6 +19,18 @@ $ docker run --runtime nvidia --gpus all \
    --model mistralai/Mistral-7B-v0.1
```
This image can also be used with other container engines such as [Podman](https://podman.io/).
```console
$ podman run --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-p 8000:8000 \
--ipc=host \
vllm/vllm-openai:latest \
--model mistralai/Mistral-7B-v0.1
```
You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`).

:::{note}
@@ -34,11 +46,11 @@ If you need to use those dependencies (having accepted the license terms),
create a custom Dockerfile on top of the base image with an extra layer that installs them:

```Dockerfile
FROM vllm/vllm-openai:v0.8.3

# e.g. install the `audio` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install --system vllm[audio]==0.8.3
```
:::
@@ -9,6 +9,7 @@ dstack
helm
lws
modal
open-webui
skypilot
triton
:::
@@ -0,0 +1,29 @@
(deployment-open-webui)=
# Open WebUI
1. Install [Docker](https://docs.docker.com/engine/install/).
2. Start the vLLM server with a supported chat completion model, e.g.
```console
vllm serve qwen/Qwen1.5-0.5B-Chat
```
3. Start the [Open WebUI](https://github.com/open-webui/open-webui) Docker container (replace the vLLM serve host and port):
```console
docker run -d -p 3000:8080 \
--name open-webui \
-v open-webui:/app/backend/data \
-e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
--restart always \
ghcr.io/open-webui/open-webui:main
```
4. Open it in the browser: <http://open-webui-host:3000/>

At the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
:::{image} /assets/deployment/open_webui.png
:::
@@ -16,7 +16,7 @@ Ensure that you have a running Kubernetes environment with GPU (you can follow [
## Deployment using vLLM production stack

The standard vLLM production stack is installed using a Helm chart. You can run this [bash script](https://github.com/vllm-project/production-stack/blob/main/utils/install-helm.sh) to install Helm on your GPU server.

To install the vLLM production stack, run the following commands on your desktop:
@@ -46,6 +46,7 @@ metadata:
type: Opaque
data:
  token: $(HF_TOKEN)
EOF
```
Next, start the vLLM server as a Kubernetes Deployment and Service:
@@ -8,7 +8,7 @@ Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModal
## Prompt Update Detection

One of the main responsibilities of the HF processor is to update the prompt with placeholder tokens. For example:

- Insert feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals the feature size) at the start of the string.
- Replace existing input placeholder tokens (e.g. `<image>` for a single image) with feature placeholder tokens (e.g. `<image><image>...<image>`, the number of which equals the feature size).
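As a rough sketch of these two update styles (plain string manipulation only; the `<image>` token and a feature size of 576 are assumptions for illustration):

```python
feature_size = 576  # assumed number of feature placeholder tokens per image
prompt = "USER: <image>\nWhat is in this picture?"

# Style 1: insert feature placeholder tokens at the start of the string.
inserted = "<image>" * feature_size + prompt

# Style 2: replace each input placeholder with the expanded feature placeholders.
replaced = prompt.replace("<image>", "<image>" * feature_size)
```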
@@ -47,7 +47,7 @@ Moreover, since the tokenized text has not passed through the HF processor, we h
### Dummy text

We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
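For example, a model whose HF processor expects one `<image>` placeholder per image could implement this as follows (a minimal sketch; the `<image>` token string is an assumption):

```python
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
    num_images = mm_counts.get("image", 0)
    return "<image>" * num_images
```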
(mm-automatic-prompt-updating)=
@@ -66,8 +66,8 @@ vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:

- `vllm:e2e_request_latency_seconds_bucket` - End to end request latency measured in seconds
- `vllm:prompt_tokens_total` - Prompt Tokens
- `vllm:generation_tokens_total` - Generation Tokens
- `vllm:time_per_output_token_seconds` - Inter token latency (Time Per Output Token, TPOT) in seconds.
- `vllm:time_to_first_token_seconds` - Time to First Token (TTFT) latency in seconds.
- `vllm:num_requests_running` (also, `_swapped` and `_waiting`) - Number of requests in RUNNING, WAITING, and SWAPPED state
@@ -86,6 +86,17 @@ See [the PR which added this Dashboard](gh-pr:2316) for interesting and useful b
Prometheus support was initially added [using the aioprometheus library](gh-pr:1890), but a switch was made quickly to [prometheus_client](gh-pr:2730). The rationale is discussed in both linked PRs.

With the switch away from `aioprometheus`, we lost a `MetricsMiddleware` to track HTTP metrics, but this was reinstated [using prometheus_fastapi_instrumentator](gh-pr:15657):
```bash
$ curl http://0.0.0.0:8000/metrics 2>/dev/null | grep -P '^http_(?!.*(_bucket|_created|_sum)).*'
http_requests_total{handler="/v1/completions",method="POST",status="2xx"} 201.0
http_request_size_bytes_count{handler="/v1/completions"} 201.0
http_response_size_bytes_count{handler="/v1/completions"} 201.0
http_request_duration_highr_seconds_count 201.0
http_request_duration_seconds_count{handler="/v1/completions",method="POST"} 201.0
```
### Multi-process Mode

In v0, metrics are collected in the engine core process and we use multi-process mode to make them available in the API server process. See <gh-pr:7279>.
@@ -99,7 +99,7 @@ This time, Inductor compilation is completely bypassed, and we will load from di
The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example:

`vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"`

Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
@@ -126,7 +126,7 @@ Unfortunately, because auto-tuning takes quite a long time (from seconds to minu
## Cudagraph Capture

vLLM's V1 architecture uses piecewise cudagraph. The full computation graph is split as mentioned above, and we only capture the cudagraph for the piece of graph between attention operations (including the first graph before any attention operation, and the last graph after all the attention operation). This is based on a common observation: computation between attentions are usually token-wise and easy to deal with for cudagraph; while the attention operation is non-trivial to be cudagraph compatible. Thus, by running the attention operation in eager mode while the rest operations in cudagraph, we keep the flexibility of the attention operation.

The piecewise cudagraph also has fine-grained memory management. The purpose is to only exclude the attention kernel from cudagraph, while keeping all the rest modules and the memory allocation operations in the cudagraph. This is why the attention operation in V1 has the output tensor as the input of the attention.
@@ -134,6 +134,6 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`:

`vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"`

Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
@@ -106,19 +106,18 @@ curl http://localhost:8000/v1/completions \
## Dynamically serving LoRA Adapters

In addition to serving LoRA adapters at server startup, the vLLM server supports dynamically configuring LoRA adapters at runtime through dedicated API endpoints and plugins. This feature can be particularly useful when the flexibility to change models on-the-fly is needed.

Note: Enabling this feature in production environments is risky as users may participate in model adapter management.

To enable dynamic LoRA configuration, ensure that the environment variable `VLLM_ALLOW_RUNTIME_LORA_UPDATING`
is set to `True`.

```bash
export VLLM_ALLOW_RUNTIME_LORA_UPDATING=True
```
### Using API Endpoints
Loading a LoRA Adapter:

To dynamically load a LoRA adapter, send a POST request to the `/v1/load_lora_adapter` endpoint with the necessary
@@ -153,6 +152,58 @@ curl -X POST http://localhost:8000/v1/unload_lora_adapter \
  }'
```
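For reference, the corresponding load request can also be issued from Python; a minimal sketch using `requests` (the adapter name and path are placeholders):

```python
import requests

response = requests.post(
    "http://localhost:8000/v1/load_lora_adapter",
    json={
        "lora_name": "sql_adapter",        # placeholder adapter name
        "lora_path": "/path/to/sql-lora",  # placeholder local path
    },
)
response.raise_for_status()
```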
### Using Plugins
Alternatively, you can use the LoRAResolver plugin to dynamically load LoRA adapters. LoRAResolver plugins enable you to load LoRA adapters from both local and remote sources such as local file system and S3. On every request, when there's a new model name that hasn't been loaded yet, the LoRAResolver will try to resolve and load the corresponding LoRA adapter.
You can set up multiple LoRAResolver plugins if you want to load LoRA adapters from different sources. For example, you might have one resolver for local files and another for S3 storage. vLLM will load the first LoRA adapter that it finds.
You can either install existing plugins or implement your own.
Steps to implement your own LoRAResolver plugin:
1. Implement the LoRAResolver interface.
Example of a simple S3 LoRAResolver implementation:
```python
import os
import s3fs
from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver


class S3LoRAResolver(LoRAResolver):
    def __init__(self):
        self.s3 = s3fs.S3FileSystem()
        self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
        self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")

    async def resolve_lora(self, base_model_name, lora_name):
        s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
        local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)

        # Download the LoRA from S3 to the local path
        await self.s3._get(
            s3_path, local_path, recursive=True, maxdepth=1
        )

        lora_request = LoRARequest(
            lora_name=lora_name,
            lora_path=local_path,
            lora_int_id=abs(hash(lora_name))
        )

        return lora_request
```
2. Register LoRAResolver plugin.
```python
from vllm.lora.resolver import LoRAResolverRegistry
s3_resolver = S3LoRAResolver()
LoRAResolverRegistry.register_resolver("s3_resolver", s3_resolver)
```
For more details, refer to [vLLM's Plugins System](../design/plugin_system.md).
## New format for `--lora-modules`

In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. For example:
@@ -0,0 +1,48 @@
(bitblas)=
# BitBLAS
vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations.
:::{note}
Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`).
Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper.
For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html).
:::
Below are the steps to utilize BitBLAS with vLLM.
```console
pip install "bitblas>=0.1.0"
```
vLLM reads the model's config file and supports pre-quantized checkpoints.
You can find pre-quantized models on:
- [Hugging Face (BitBLAS)](https://huggingface.co/models?other=bitblas)
- [Hugging Face (GPTQ)](https://huggingface.co/models?other=gptq)
Usually, these repositories have a `quantize_config.json` file that includes a `quantization_config` section.
## Read bitblas format checkpoint
```python
from vllm import LLM
import torch
# "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint.
model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas")
```
## Read gptq format checkpoint
```python
from vllm import LLM
import torch
# "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
model_id = "hxbgsyxh/llama-13b-4bit-g-1"
llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024)
```
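To sanity-check a checkpoint loaded this way, generation works the same as with any other vLLM model; a minimal sketch reusing the `llm` object from the snippet above:

```python
from vllm import SamplingParams

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=32))
print(outputs[0].outputs[0].text)
```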
@@ -19,17 +19,20 @@ And usually, these repositories have a config.json file that includes a quantiza
## Read quantized checkpoint

For pre-quantized checkpoints, vLLM will try to infer the quantization method from the config file, so you don't need to explicitly specify the quantization argument.

```python
from vllm import LLM
import torch
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True)
```
## Inflight quantization: load as 4bit quantization

For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify the quantization argument.

```python
from vllm import LLM
import torch
@@ -40,7 +43,7 @@ quantization="bitsandbytes")
## OpenAI Compatible Server

Append the following to your model arguments for 4bit inflight quantization:

```console
--quantization bitsandbytes
@@ -29,7 +29,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlam
We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with a large vocab size.
:::

GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path

```console
# If your model is not supported by huggingface you can manually provide a huggingface compatible config path
@@ -11,6 +11,7 @@ Quantization trades off model precision for smaller memory footprint, allowing l
supported_hardware
auto_awq
bnb
bitblas
gguf
gptqmodel
int4
@@ -18,4 +19,5 @@ int8
fp8
quark
quantized_kvcache
torchao
:::
@@ -62,7 +62,7 @@ The table below shows the compatibility of various quantization implementations
  *
  * ✅︎
  *
  * ✅︎
- * FP8 (W8A8)
  *
  *
@@ -74,6 +74,17 @@ The table below shows the compatibility of various quantization implementations
  *
  *
  *
- * BitBLAS (GPTQ)
  * ✅︎
  * ✅︎
  * ✅︎
  * ✅︎
  * ✅︎
  * ✅︎
  *
  *
  *
  *
- * AQLM
  * ✅︎
  * ✅︎
@@ -0,0 +1,34 @@
# TorchAO
TorchAO is an architecture optimization library for PyTorch. It provides high performance dtypes, optimization techniques and kernels for inference and training, featuring composability with native PyTorch features like torch.compile and FSDP. Some benchmark numbers can be found [here](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks).
We recommend installing the latest torchao nightly with
```console
# Install the latest TorchAO nightly build
# Choose the CUDA version that matches your system (cu126, cu128, etc.)
pip install --pre "torchao>=0.10.0" --index-url https://download.pytorch.org/whl/nightly/cu126
```
## Quantizing HuggingFace Models
You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:
```Python
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8WeightOnlyConfig
model_name = "meta-llama/Meta-Llama-3-8B"
quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
hub_repo = "YOUR_HUB_REPO_ID"  # replace with your Hugging Face Hub repo id
tokenizer.push_to_hub(hub_repo)
quantized_model.push_to_hub(hub_repo, safe_serialization=False)
```
Alternatively, you can use the TorchAO Quantization space for quantizing models with a simple UI.
See: https://huggingface.co/spaces/medmekk/TorchAO_Quantization
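Once pushed, the checkpoint can be loaded back into vLLM; a minimal sketch using the example repository linked above (passing `quantization="torchao"` explicitly is an assumption here, since vLLM may also infer the method from the checkpoint config):

```python
from vllm import LLM

# "jerryzh168/llama3-8b-int8wo" is the example checkpoint linked above.
llm = LLM(model="jerryzh168/llama3-8b-int8wo",
          dtype="bfloat16",
          quantization="torchao")
print(llm.generate("What are we having for dinner?")[0].outputs[0].text)
```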
@@ -152,12 +152,13 @@ Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_cha
Supported models:

All Llama 3.1 and 3.2 models should be supported.

* `meta-llama/Llama-3.1-*`
* `meta-llama/Llama-3.2-*`

The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below.

Other tool calling formats like the built-in python tool calling or custom tool calling are not supported.

Known issues:
@@ -166,10 +167,14 @@ Known issues:
2. The model can generate parameters with a wrong format, such as generating
   an array serialized as string instead of an array.

vLLM provides two JSON based chat templates for Llama 3.1 and 3.2:

* `examples/tool_chat_template_llama3.1_json.jinja` - this is the "official" chat template for the Llama 3.1
  models, but tweaked so that it works better with vLLM.
* `examples/tool_chat_template_llama3.2_json.jinja` - this extends upon the Llama 3.1 chat template by adding support for
  images.

Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`

#### IBM Granite

@@ -245,6 +250,8 @@ Example supported models:
* `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`)
* `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`)
* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`)
* `meta-llama/Llama-4-Scout-17B-16E-Instruct`\* (use with `examples/tool_chat_template_llama4_pythonic.jinja`)
* `meta-llama/Llama-4-Maverick-17B-128E-Instruct`\* (use with `examples/tool_chat_template_llama4_pythonic.jinja`)

Flags: `--tool-call-parser pythonic --chat-template {see_above}`
@@ -17,6 +17,7 @@ def fix_case(text: str) -> str:
    "cli": "CLI",
    "cpu": "CPU",
    "llm": "LLM",
    "mae": "MAE",
    "tpu": "TPU",
    "aqlm": "AQLM",
    "gguf": "GGUF",
@@ -24,6 +25,7 @@ def fix_case(text: str) -> str:
    "rlhf": "RLHF",
    "vllm": "vLLM",
    "openai": "OpenAI",
    "lmcache": "LMCache",
    "multilora": "MultiLoRA",
    "mlpspeculator": "MLPSpeculator",
    r"fp\d+": lambda x: x.group(0).upper(),  # e.g. fp16, fp32
@@ -13,11 +13,11 @@ There are no pre-built wheels or images for this device, so you must build vLLM
- Intel Gaudi accelerator
- Intel Gaudi software version 1.18.0

Please follow the instructions provided in the
[Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html)
to set up the execution environment. To achieve the best performance,
please follow the methods outlined in the
[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html).

## Configure a new environment
@@ -32,15 +32,13 @@ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloade
pip list | grep neural # verify that neural_compressor is installed
```

Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade)
for more details.

### Run Docker Image

It is highly recommended to use the latest Docker image from Intel Gaudi
vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers)
for more details.

Use the following commands to run a Docker image:
@@ -278,8 +276,9 @@ Lower value corresponds to less usable graph memory reserved for prefill stage,
:::

User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:

- `max_bs` - graph capture queue will be sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode
- `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt

When there's a large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy.
@@ -326,8 +325,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
- We recommend running inference on Gaudi 2 with `block_size` of 128
  for BF16 data type. Using default values (16, 32) might lead to
  sub-optimal performance due to Matrix Multiplication Engine
  under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)).
- For max throughput on Llama 7B, we recommend running with batch size
  of 128 or 256 and max context length of 2048 with HPU Graphs enabled.
  If you encounter out-of-memory issues, see troubleshooting section.
@@ -336,11 +334,11 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
**Diagnostic and profiling knobs:**

- `VLLM_PROFILER_ENABLED`: If `true`, enable the high level profiler. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). `false` by default.
- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: If `true`, log graph compilations for each vLLM engine step when any occurs. Highly recommended to use with `PT_HPU_METRICS_GC_DETAILS=1`. `false` by default.
- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: If `true`, always log graph compilations for each vLLM engine step even if none occurred. `false` by default.
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: If `true`, log CPU fallbacks for each vLLM engine step when any occurs. `false` by default.
- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: If `true`, always log CPU fallbacks for each vLLM engine step even if none occurred. `false` by default.

**Performance tuning knobs:**
@@ -381,7 +379,7 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi
Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:

- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used; if `1`, PyTorch Lazy backend for Gaudi will be used. `1` is default.
- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs

## Troubleshooting: tweaking HPU graphs
@@ -272,7 +272,7 @@ $ python examples/offline_inference/basic/basic.py
- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance.
- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, Tensor Parallel is an option for better performance.
- Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving:
@@ -12,7 +12,7 @@ There are no pre-built wheels or images for this device, so you must build vLLM
- OS: `macOS Sonoma` or later
- SDK: `XCode 15.4` or later with Command Line Tools
- Compiler: `Apple Clang >= 15.0.0`

## Set up using Python
@@ -51,14 +51,6 @@ If the build has error like the following snippet where standard C++ headers can
1 error generated.
```
If you run into an error like the following snippet, you need to check the clang version and install a compatible version.
```text
AttributeError: '_OpNamespace' '_C' object has no attribute 'silu_and_mul'
```
More information can be found in <gh-issue:15941>.
## Set up using Docker

### Pre-built images
@@ -2,7 +2,7 @@ First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as
```console
sudo apt-get update -y
sudo apt-get install -y gcc-12 g++-12 libnuma-dev python3-dev
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
```
@@ -26,3 +26,9 @@ Finally, build and install vLLM CPU backend:
```console
VLLM_TARGET_DEVICE=cpu python setup.py install
```

If you want to develop vLLM, install it in editable mode instead.
```console
VLLM_TARGET_DEVICE=cpu python setup.py develop
```
@@ -46,7 +46,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
##### Install the latest code using `pip`

```console
pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
```

`--pre` is required for `pip` to consider pre-released versions.
@@ -65,9 +65,11 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p
Another way to install the latest code is to use `uv`:

```console
uv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly
```

##### Install specific revisions using `uv`

If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:

```console
@@ -23,6 +23,8 @@ Currently, there are no pre-built XPU wheels.
- Second, install Python packages for vLLM XPU backend building:

```console
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install --upgrade pip
pip install -v -r requirements/xpu.txt
```
@@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form
## Out of memory

If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options](#reducing-memory-usage) to reduce the memory consumption.
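Those options mostly boil down to a handful of engine arguments; a minimal sketch with illustrative values (the model name is reused from earlier examples in this document):

```python
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.2-1B",
    max_model_len=4096,             # cap the context length
    gpu_memory_utilization=0.85,    # fraction of GPU memory vLLM may use
    max_num_seqs=64,                # limit the number of concurrent sequences
    enforce_eager=True,             # skip CUDA graph capture to save memory
)
```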
## Generation quality changed
@@ -156,10 +156,3 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring
cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).

For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
## Frequently Asked Questions
**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?**
The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`.
On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks.
@@ -1,5 +1,5 @@
Loading Model weights with fastsafetensors
===================================================================

Using fastsafetensors library enables loading model weights to GPU memory by leveraging GPU direct storage. See [their GitHub repository](https://github.com/foundation-model-stack/fastsafetensors) for more details.

To enable this feature, set the environment variable ``USE_FASTSAFETENSOR`` to ``true``
@@ -9,7 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
For more information on CoreWeave's Tensorizer, please refer to
[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see
the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html).

:::{note}
Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
@@ -141,3 +141,77 @@ Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints tha
- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models.
- [Score API](#score-api) is similar to `LLM.score` for cross-encoder models.
## Matryoshka Embeddings
[Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows the user to trade off between performance and cost.
:::{warning}
Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
For example, setting the `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error.
```json
{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400}
```
:::
### Manually enable Matryoshka Embeddings
There is currently no official interface for specifying support for Matryoshka Embeddings. In vLLM, we simply check the existence of the fields `is_matryoshka` or `matryoshka_dimensions` inside `config.json`.
For models that support Matryoshka Embeddings but are not recognized by vLLM, please manually override the config using `hf_overrides={"is_matryoshka": True}` (offline) or `--hf_overrides '{"is_matryoshka": true}'` (online).
Here is an example to serve a model with Matryoshka Embeddings enabled.
```text
vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"is_matryoshka":true}'
```
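The same override can be applied offline by passing `hf_overrides` to the constructor; a minimal sketch reusing the model from the command above:

```python
from vllm import LLM

llm = LLM(model="Snowflake/snowflake-arctic-embed-m-v1.5",
          task="embed",
          hf_overrides={"is_matryoshka": True})
```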
### Offline Inference
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in {class}`~vllm.PoolingParams`.
```python
from vllm import LLM, PoolingParams
model = LLM(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
outputs = model.embed(["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32))
print(outputs[0].outputs)
```
A code example can be found here: <gh-file:examples/offline_inference/embed_matryoshka_fy.py>
### Online Inference
Use the following command to start the vLLM server.
```text
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
```
You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
```text
curl http://127.0.0.1:8000/v1/embeddings \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"input": "Follow the white rabbit.",
"model": "jinaai/jina-embeddings-v3",
"encoding_format": "float",
"dimensions": 1
}'
```
Expected output:
```json
{"id":"embd-0aab28c384d348c3b8f0eb783109dc5f","object":"list","created":1744195454,"model":"jinaai/jina-embeddings-v3","data":[{"index":0,"object":"embedding","embedding":[-1.0]}],"usage":{"prompt_tokens":10,"total_tokens":10,"completion_tokens":0,"prompt_tokens_details":null}}
```
An OpenAI client example can be found here: <gh-file:examples/online_serving/openai_embedding_matryoshka_fy.py>
@@ -24,7 +24,7 @@ vLLM also supports model implementations that are available in Transformers. Thi
To check if the modeling backend is Transformers, you can simply do this:

```python
from vllm import LLM
llm = LLM(model=..., task="generate")  # Name or path of your model
llm.apply_model(lambda model: print(type(model)))
@@ -55,7 +55,11 @@ If your model is neither supported natively by vLLM or Transformers, you can sti
Simply set `trust_remote_code=True` and vLLM will run any model on the Model Hub that is compatible with Transformers.
Provided that the model writer implements their model in a compatible way, this means that you can run new models before they are officially supported in Transformers or vLLM!

:::{tip}
If you have not yet created your custom model, you can follow this guide on [customising models in Transformers](https://huggingface.co/docs/transformers/en/custom_models).
:::

```python
from vllm import LLM
llm = LLM(model=..., task="generate", trust_remote_code=True)  # Name or path of your model
llm.apply_model(lambda model: print(model.__class__))
```
@@ -129,7 +133,7 @@ class MyConfig(PretrainedConfig):
### Hugging Face Hub

By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome).

To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
@@ -160,6 +164,35 @@ If vLLM successfully returns text (for generative models) or hidden states (for
Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.

Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
#### Using a proxy
Here are some tips for loading/downloading models from Hugging Face using a proxy:
- Set the proxy globally for your session (or set it in the profile file):
```shell
export http_proxy=http://your.proxy.server:port
export https_proxy=http://your.proxy.server:port
```
- Set the proxy for just the current command:
```shell
https_proxy=http://your.proxy.server:port huggingface-cli download <model_name>
# or use vllm cmd directly
https_proxy=http://your.proxy.server:port vllm serve <model_name> --disable-log-requests
```
- Set the proxy in the Python interpreter:
```python
import os
os.environ['http_proxy'] = 'http://your.proxy.server:port'
os.environ['https_proxy'] = 'http://your.proxy.server:port'
```
### ModelScope
To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable:
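The hunk is truncated before the variable itself is shown; to the best of my knowledge the setting in question is `VLLM_USE_MODELSCOPE`, so the step would look roughly like:
```shell
export VLLM_USE_MODELSCOPE=True
```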
@@ -233,9 +266,9 @@ See [this page](#generative-models) for more information on how to use generativ
* `facebook/bart-base`, `facebook/bart-large-cnn`, etc.
*
*
- * `ChatGLMModel`, `ChatGLMForConditionalGeneration`
* ChatGLM
* `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc.
* ✅︎
* ✅︎
- * `CohereForCausalLM`, `Cohere2ForCausalLM`
@@ -303,6 +336,11 @@ See [this page](#generative-models) for more information on how to use generativ
* `THUDM/glm-4-9b-chat-hf`, etc.
* ✅︎
* ✅︎
- * `Glm4ForCausalLM`
* GLM-4-0414
* `THUDM/GLM-4-32B-0414`, etc.
* ✅︎
* ✅︎
- * `GPT2LMHeadModel`
* GPT-2
* `gpt2`, `gpt2-xl`, etc.
@@ -463,6 +501,11 @@ See [this page](#generative-models) for more information on how to use generativ
* `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.
*
* ✅︎
- * `Plamo2ForCausalLM`
* PLaMo2
* `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc.
*
*
- * `QWenLMHeadModel`
* Qwen
* `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.
@@ -478,6 +521,16 @@ See [this page](#generative-models) for more information on how to use generativ
* `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
*
* ✅︎
- * `Qwen3ForCausalLM`
* Qwen3
* `Qwen/Qwen3-8B`, etc.
* ✅︎
* ✅︎
- * `Qwen3MoeForCausalLM`
* Qwen3MoE
* `Qwen/Qwen3-MoE-15B-A2B`, etc.
* ✅︎
* ✅︎
- * `StableLmForCausalLM`
* StableLM
* `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.
@@ -691,6 +744,11 @@ If your model is not in the above list, we will try to automatically convert the
* `BAAI/bge-reranker-v2-m3`, etc.
*
*
- * `ModernBertForSequenceClassification`
* ModernBert-based
* `Alibaba-NLP/gte-reranker-modernbert-base`, etc.
*
*
:::
(supported-mm-models)=
@@ -715,7 +773,7 @@ On the other hand, modalities separated by `/` are mutually exclusive.
See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
:::{important}
**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
Offline inference:
@@ -730,9 +788,11 @@ llm = LLM(
Online serving:
```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}'
```
**This is no longer required if you are using vLLM V1.**
:::
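The offline-inference half of this hunk is cut off above at `llm = LLM(`; under the same settings as the serving command, the call would look roughly like this (a sketch, not the exact snippet from the page):
```python
from vllm import LLM

# Allow up to 4 images per text prompt (offline equivalent of the flag above).
llm = LLM(
    model="Qwen/Qwen2-VL-7B-Instruct",
    limit_mm_per_prompt={"image": 4},
)
```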
:::{note}
@@ -834,9 +894,23 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
- * `InternVLChatModel`
* InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0
* T + I<sup>E+</sup>
* `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
*
* ✅︎
* ✅︎
- * `KimiVLForConditionalGeneration`
* Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking
* T + I<sup>+</sup>
* `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`
*
*
* ✅︎
- * `Llama4ForConditionalGeneration`
* Llama 4
* T + I<sup>+</sup>
* `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc.
*
* ✅︎
* ✅︎
@@ -930,7 +1004,7 @@ See [this page](#generative-models) for more information on how to use generativ
* `microsoft/Phi-4-multimodal-instruct`, etc.
* ✅︎
*
* ✅︎
- * `PixtralForConditionalGeneration`
* Pixtral
* T + I<sup>+</sup>
@@ -966,6 +1040,13 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
* ✅︎
* ✅︎
- * `Qwen2_5OmniThinkerForConditionalGeneration`
* Qwen2.5-Omni
* T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup>
* `Qwen/Qwen2.5-Omni-7B`
*
* ✅︎
* ✅︎\*
- * `SkyworkR1VChatModel`
* Skywork-R1V-38B
* T + I
@@ -973,6 +1054,13 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `SmolVLMForConditionalGeneration`
* SmolVLM2
* T + I
* `SmolVLM2-2.2B-Instruct`
*
* ✅︎
* ✅︎
- * `UltravoxModel`
* Ultravox
* T + A<sup>E+</sup>
@@ -989,9 +1077,6 @@ See [this page](#generative-models) for more information on how to use generativ
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
:::{important}
To use Gemma3 series models, you have to install Hugging Face Transformers library from source via
`pip install git+https://github.com/huggingface/transformers`.
Pan-and-scan image pre-processing is currently supported on V0 (but not V1).
You can enable it by passing `--mm-processor-kwargs '{"do_pan_and_scan": True}'`.
:::
@@ -1031,6 +1116,14 @@ For more details, please see: <gh-pr:4087#issuecomment-2250397630>
Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
:::
:::{note}
To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
`pip install git+https://github.com/huggingface/transformers.git`.
Reading audio from video during pre-processing is currently supported on V0 (but not V1), because overlapping modalities are not yet supported in V1.
You can enable it by passing `--mm-processor-kwargs '{"use_audio_in_video": True}'`.
:::
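In offline inference the same processor argument can be passed like the `mm_processor_kwargs` examples elsewhere in this diff (a sketch; the model name is the one from the note above):
```python
from vllm import LLM

# V0 only: read the audio track from input videos during pre-processing.
llm = LLM(model="Qwen/Qwen2.5-Omni-7B",
          mm_processor_kwargs={"use_audio_in_video": True})
```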
### Pooling Models
See [this page](pooling-models) for more information on how to use pooling models.
@@ -1128,5 +1221,5 @@ We have the following levels of testing for models:
1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test.
2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test.
4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.


@@ -228,7 +228,7 @@ First, launch the OpenAI-compatible server:
```bash
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
  --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'
```
Then, you can use the OpenAI client as follows:
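The client snippet itself is not shown in this hunk; a minimal sketch with the official OpenAI Python client (the image URLs and prompt are placeholders) looks like:
```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_response = client.chat.completions.create(
    model="microsoft/Phi-3.5-vision-instruct",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What do these two images have in common?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/image_1.jpg"}},
            {"type": "image_url", "image_url": {"url": "https://example.com/image_2.jpg"}},
        ],
    }],
)
print(chat_response.choices[0].message.content)
```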


@@ -28,6 +28,8 @@ Please refer to the above pages for more details about each API.
[API Reference](/api/offline_inference/index)
:::
(configuration-options)=
## Configuration Options
This section lists the most common options for running the vLLM engine.
@@ -59,6 +61,8 @@ model = LLM(
Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM.
(reducing-memory-usage)=
### Reducing memory usage
Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.
@@ -81,6 +85,12 @@ before initializing vLLM. Otherwise, you may run into an error like `RuntimeErro
To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
:::
:::{note}
With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
:::
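As a rough sketch of how such a sharded checkpoint is consumed afterwards (the `load_format` value and the local path are assumptions based on the example script's name, not something stated in this hunk):
```python
from vllm import LLM

# Assumption: a "sharded_state" load format matches checkpoints written by
# examples/offline_inference/save_sharded_state.py.
llm = LLM(
    model="/path/to/sharded/checkpoint",  # hypothetical local path
    load_format="sharded_state",
    tensor_parallel_size=8,
)
```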
#### Quantization
Quantized models take less memory at the cost of lower precision.
@@ -103,6 +113,39 @@ llm = LLM(model="adept/fuyu-8b",
          max_num_seqs=2)
```
#### Reduce CUDA Graphs
By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU.
:::{important}
CUDA graph capture takes up more memory in V1 than in V0.
:::
You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
```python
from vllm import LLM
from vllm.config import CompilationConfig, CompilationLevel
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    compilation_config=CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        # By default, it goes up to max_num_seqs
        cudagraph_capture_sizes=[1, 2, 4, 8, 16],
    ),
)
```
You can disable graph capturing completely via the `enforce_eager` flag:
```python
from vllm import LLM
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
enforce_eager=True)
```
#### Adjust cache size
If you run out of CPU RAM, try the following options:
@@ -110,6 +153,62 @@ If you run out of CPU RAM, try the following options:
- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
#### Multi-modal input limits
You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model:
```python
from vllm import LLM
# Accept up to 3 images and 1 video per prompt
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
limit_mm_per_prompt={"image": 3, "video": 1})
```
You can go a step further and disable unused modalities completely by setting their limits to zero.
For example, if your application only accepts image input, there is no need to allocate any memory for videos.
```python
from vllm import LLM
# Accept any number of images but no videos
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
limit_mm_per_prompt={"video": 0})
```
You can even run a multi-modal model for text-only inference:
```python
from vllm import LLM
# Don't accept images. Just text.
llm = LLM(model="google/gemma-3-27b-it",
limit_mm_per_prompt={"image": 0})
```
#### Multi-modal processor arguments
For certain models, you can adjust the multi-modal processor arguments to
reduce the size of the processed multi-modal inputs, which in turn saves memory.
Here are some examples:
```python
from vllm import LLM
# Available for Qwen2-VL series models
llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
mm_processor_kwargs={
"max_pixels": 768 * 768, # Default is 1280 * 28 * 28
})
# Available for InternVL series models
llm = LLM(model="OpenGVLab/InternVL2-2B",
mm_processor_kwargs={
"max_dynamic_patch": 4, # Default is 12
})
```
### Performance optimization and tuning
You can potentially improve the performance of vLLM by finetuning various options.


@@ -2,15 +2,15 @@
# OpenAI-Compatible Server
vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! This functionality lets you serve models and interact with them using an HTTP client.
In your terminal, you can [install](../getting_started/installation.md) vLLM, then start the server with the [`vllm serve`](#vllm-serve) command. (You can also use our [Docker](#deployment-docker) image.)
```bash
vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
```
To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
```python
from openai import OpenAI
@@ -33,11 +33,13 @@ print(completion.choices[0].message)
vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`.
:::
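Concretely, such a request would look roughly like this (the prompt is a placeholder; the API key matches the serve command above):
```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

completion = client.chat.completions.create(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    extra_body={"top_k": 50},  # vLLM-specific parameter passed through to the server
)
print(completion.choices[0].message.content)
```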
:::{important}
By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
To disable this behavior, please pass `--generation-config vllm` when launching the server.
:::
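For instance (model name reused from the example at the top of this page):
```bash
vllm serve NousResearch/Meta-Llama-3-8B-Instruct --generation-config vllm
```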
## Supported APIs
We currently support the following OpenAI APIs:
@@ -172,6 +174,12 @@ print(completion._request_id)
The `vllm serve` command is used to launch the OpenAI-compatible server.
:::{tip}
The vast majority of command-line arguments are based on those for offline inference.
See [here](configuration-options) for some common options.
:::
:::{argparse}
:module: vllm.entrypoints.openai.cli_args
:func: create_parser_for_docs
@@ -394,9 +402,26 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`.
:::
Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
<!-- TODO: api enforced limits + uploading audios -->
#### Extra Parameters
The following [sampling parameters](#sampling-params) are supported.
:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-transcription-sampling-params
:end-before: end-transcription-sampling-params
:::
The following extra parameters are supported:
:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
:start-after: begin-transcription-extra-params
:end-before: end-transcription-extra-params
:::
(tokenizer-api)=


@@ -47,7 +47,7 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"audio": audio_count},
    )
@@ -89,7 +89,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=12800,
        max_num_seqs=2,
        enable_lora=True,
        max_lora_rank=320,
@@ -130,6 +130,36 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
    )


# Qwen2.5-Omni
def run_qwen2_5_omni(question: str, audio_count: int):
    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"audio": audio_count},
    )

    audio_in_prompt = "".join([
        "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
    ])

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech.")

    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              "<|im_start|>user\n"
              f"{audio_in_prompt}{question}<|im_end|>\n"
              "<|im_start|>assistant\n")

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
    )
# Ultravox 0.5-1B
def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
    model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
@@ -182,59 +212,13 @@ model_example_map = {
    "minicpmo": run_minicpmo,
    "phi4_mm": run_phi4mm,
    "qwen2_audio": run_qwen2_audio,
    "qwen2_5_omni": run_qwen2_5_omni,
    "ultravox": run_ultravox,
    "whisper": run_whisper,
}
def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
    req_data = model_example_map[model](question_per_audio_count[audio_count],
                                        audio_count)

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    # To maintain code compatibility in this script, we add LoRA here.
    # You can also add LoRA using:
    # llm.generate(prompts, lora_request=lora_request,...)
    if req_data.lora_requests:
        for lora_request in req_data.lora_requests:
            llm.llm_engine.add_lora(lora_request=lora_request)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    mm_data = {}
    if audio_count > 0:
        mm_data = {
            "audio": [
                asset.audio_and_sample_rate
                for asset in audio_assets[:audio_count]
            ]
        }

    assert args.num_prompts > 0
    inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":

def parse_args():
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'audio language models')
@@ -258,5 +242,61 @@ if __name__ == "__main__":
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")
    return parser.parse_args()


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    audio_count = args.num_audios
    req_data = model_example_map[model](question_per_audio_count[audio_count],
                                        audio_count)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {})

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    sampling_params = SamplingParams(temperature=0.2,
                                     max_tokens=64,
                                     stop_token_ids=req_data.stop_token_ids)

    mm_data = {}
    if audio_count > 0:
        mm_data = {
            "audio": [
                asset.audio_and_sample_rate
                for asset in audio_assets[:audio_count]
            ]
        }

    assert args.num_prompts > 0
    inputs = {"prompt": req_data.prompt, "multi_modal_data": mm_data}
    if args.num_prompts > 1:
        # Batch inference
        inputs = [inputs] * args.num_prompts

    # Add LoRA request if applicable
    lora_request = (req_data.lora_requests *
                    args.num_prompts if req_data.lora_requests else None)

    outputs = llm.generate(
        inputs,
        sampling_params=sampling_params,
        lora_request=lora_request,
    )

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    args = parse_args()
    main(args)


@@ -12,16 +12,23 @@ prompts = [
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}")
    print(f"Output: {generated_text!r}")
    print("-" * 60)


def main():
    # Create an LLM.
    llm = LLM(model="facebook/opt-125m")
    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}")
        print(f"Output: {generated_text!r}")
        print("-" * 60)


if __name__ == "__main__":
    main()


@@ -4,6 +4,24 @@ from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def create_parser():
    parser = FlexibleArgumentParser()
    # Add engine args
    engine_group = parser.add_argument_group("Engine arguments")
    EngineArgs.add_cli_args(engine_group)
    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)
    sampling_group.add_argument("--temperature", type=float)
    sampling_group.add_argument("--top-p", type=float)
    sampling_group.add_argument("--top-k", type=int)
    # Add example params
    parser.add_argument("--chat-template-path", type=str)
    return parser


def main(args: dict):
    # Pop arguments not used by LLM
    max_tokens = args.pop("max_tokens")
@@ -82,18 +100,6 @@ def main(args: dict):
if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    # Add engine args
    engine_group = parser.add_argument_group("Engine arguments")
    EngineArgs.add_cli_args(engine_group)
    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)
    sampling_group.add_argument("--temperature", type=float)
    sampling_group.add_argument("--top-p", type=float)
    sampling_group.add_argument("--top-k", type=int)
    # Add example params
    parser.add_argument("--chat-template-path", type=str)
    parser = create_parser()
    args: dict = vars(parser.parse_args())
    main(args)


@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser


def parse_args():
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
                        task="classify",
                        enforce_eager=True)
    return parser.parse_args()


def main(args: Namespace):
    # Sample prompts.
    prompts = [
@@ -34,11 +44,5 @@ def main(args: Namespace):
if __name__ == "__main__":
    parser = FlexibleArgumentParser()
    parser = EngineArgs.add_cli_args(parser)
    # Set example specific arguments
    parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
                        task="classify",
                        enforce_eager=True)
    args = parser.parse_args()
    args = parse_args()
    main(args)

Some files were not shown because too many files have changed in this diff.