Merge branch 'main' into optimize-prefix-caching-scheduling

simplify code
format
2025-11-05 18:25:13 +08:00 · 2024-06-04 00:20:15 +00:00 · 2024-06-03 03:36:38 +00:00 · 2024-06-02 00:02:54 +00:00 · 2024-06-02 00:01:30 +00:00
554 changed files with 10439 additions and 38836 deletions
--- a/.buildkite/download-images.sh
+++ b/.buildkite/download-images.sh
@ -8,6 +8,10 @@ set -o pipefail
 # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
 mkdir -p images
 cd images
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
-model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.892
-  - name: "exact_match,flexible-extract"
-    value: 0.892
-limit: 250
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
-model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.756
-  - name: "exact_match,flexible-extract"
-    value: 0.752
-limit: 250
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
-model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.756
-  - name: "exact_match,flexible-extract"
-    value: 0.752
-limit: 250
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x22B-Instruct-v0.1-FP8-Dynamic.yaml
@ -1,11 +0,0 @@
-# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
-model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.86
-  - name: "exact_match,flexible-extract"
-    value: 0.86
-limit: 250
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@ -1,11 +0,0 @@
-# bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
-model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.624
-  - name: "exact_match,flexible-extract"
-    value: 0.624
-limit: 250
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@ -1,11 +0,0 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
-model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.616
-  - name: "exact_match,flexible-extract"
-    value: 0.632
-limit: 250
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-57B-A14-Instruct.yaml
@ -1,11 +0,0 @@
-# bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
-model_name: "Qwen/Qwen2-57B-A14B-Instruct"
-tasks:
- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.792
-  - name: "exact_match,flexible-extract"
-    value: 0.824
-limit: 250
-num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@ -1,3 +0,0 @@
-Meta-Llama-3-70B-Instruct.yaml
-Mixtral-8x7B-Instruct-v0.1.yaml
-Qwen2-57B-A14-Instruct.yaml
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@ -1,2 +0,0 @@
-Meta-Llama-3-8B-Instruct.yaml
-Meta-Llama-3-8B-Instruct-FP8.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@ -1,46 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on GSM for transformers.
-#
-# Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on GSM8k using huggingface transformers."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our automated nm-test-accuracy workflow"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -m    - huggingface stub or local directory of the model"
-    echo "  -b    - batch size to run the evaluation at"
-    echo "  -l    - limit number of samples to run"
-    echo "  -f    - number of fewshot samples to use"
-    echo
-}
-
-while getopts "m:b:l:f:" OPT; do
-  case ${OPT} in
-    m ) 
-        MODEL="$OPTARG"
-        ;;
-    b ) 
-        BATCH_SIZE="$OPTARG"
-        ;;
-    l ) 
-        LIMIT="$OPTARG"
-        ;;
-    f ) 
-        FEWSHOT="$OPTARG"
-        ;;
-    \? ) 
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-lm_eval --model hf \
-  --model_args pretrained=$MODEL,parallelize=True \
-  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
-  --batch_size $BATCH_SIZE
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -1,51 +0,0 @@
-#!/bin/bash
-# We can use this script to compute baseline accuracy on GSM for vllm.
-# We use this for fp8, which HF does not support.
-#
-# Make sure you have lm-eval-harness installed:
-#   pip install lm-eval==0.4.2
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on GSM8k using huggingface transformers."
-    echo "This pathway is intended to be used to create baselines for "
-    echo "our automated nm-test-accuracy workflow"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -m    - huggingface stub or local directory of the model"
-    echo "  -b    - batch size to run the evaluation at"
-    echo "  -l    - limit number of samples to run"
-    echo "  -f    - number of fewshot samples to use"
-    echo "  -t    - tensor parallel size to run at"
-    echo
-}
-
-while getopts "m:b:l:f:t:" OPT; do
-  case ${OPT} in
-    m ) 
-        MODEL="$OPTARG"
-        ;;
-    b ) 
-        BATCH_SIZE="$OPTARG"
-        ;;
-    l ) 
-        LIMIT="$OPTARG"
-        ;;
-    f ) 
-        FEWSHOT="$OPTARG"
-        ;;
-    t )
-        TP_SIZE="$OPTARG"
-        ;;
-    \? ) 
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-lm_eval --model vllm \
-  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
-  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
-  --batch_size $BATCH_SIZE
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@ -1,59 +0,0 @@
-#!/bin/bash
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on GSM8k using vllm and compares to "
-    echo "precomputed baseline (measured by HF transformers.)"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
-    echo "  -t    - tensor parallel size"
-    echo
-}
-
-SUCCESS=0
-
-while getopts "c:t:" OPT; do
-  case ${OPT} in
-    c ) 
-        CONFIG="$OPTARG"
-        ;;
-    t )
-        TP_SIZE="$OPTARG"
-        ;;
-    \? )
-        usage
-        exit 1
-        ;;
-  esac
-done
-
-# Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
-
-for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
-do
-    LOCAL_SUCCESS=0
-    
-    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
-
-    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
-    export LM_EVAL_TP_SIZE=$TP_SIZE
-    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
-
-    if [[ $LOCAL_SUCCESS == 0 ]]; then
-        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
-    else
-        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
-    fi
-
-    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
-
-done
-
-if [ "${SUCCESS}" -eq "0" ]; then
-    exit 0
-else
-    exit 1
-fi
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@ -1,54 +0,0 @@
-"""
-LM eval harness on model to compare vs HF baseline computed offline.
-Configs are found in configs/$MODEL.yaml
-
-* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
-* export LM_EVAL_TP_SIZE=4 
-* pytest -s test_lm_eval_correctness.py
-"""
-
-import os
-from pathlib import Path
-
-import lm_eval
-import numpy
-import yaml
-
-RTOL = 0.02
-TEST_DATA_FILE = os.environ.get(
-    "LM_EVAL_TEST_DATA_FILE",
-    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
-
-TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
-
-
-def launch_lm_eval(eval_config):
-    model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={TP_SIZE}"
-
-    results = lm_eval.simple_evaluate(
-        model="vllm",
-        model_args=model_args,
-        tasks=[task["name"] for task in eval_config["tasks"]],
-        num_fewshot=eval_config["num_fewshot"],
-        limit=eval_config["limit"],
-        batch_size="auto")
-
-    return results
-
-
-def test_lm_eval_correctness():
-    eval_config = yaml.safe_load(
-        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
-
-    # Launch eval requests.
-    results = launch_lm_eval(eval_config)
-
-    # Confirm scores match ground truth.
-    for task in eval_config["tasks"]:
-        for metric in task["metrics"]:
-            ground_truth = metric["value"]
-            measured_value = results["results"][task["name"]][metric["name"]]
-            print(f'{task["name"]} | {metric["name"]}: '
-                  f'ground_truth={ground_truth} | measured={measured_value}')
-            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -1,103 +0,0 @@
-# vLLM benchmark suite
-
-## Introduction
-
-This directory contains the performance benchmarking CI for vllm.
-The goal is to help developers know the impact of their PRs on the performance of vllm.
-
-This benchmark will be *triggered* upon:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label.
-
-**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for more GPUs is coming later), with different models.
-
-**Benchmarking Duration**: about 1hr.
-
-**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run.
-
-
-## Configuring the workload
-
-The benchmarking workload contains three parts:
- Latency tests in `latency-tests.json`.
- Throughput tests in `throughput-tests.json`.
- Serving tests in `serving-tests.json`.
-
-See [descriptions.md](tests/descriptions.md) for detailed descriptions. 
-
-### Latency test
-
-Here is an example of one test inside `latency-tests.json`:
-
-```json
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    },
-]
-```
-
-In this example:
-  The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
-  The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-benchmarks-suite.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
-
-Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
-
-WARNING: The benchmarking script will save json results by itself, so please do not configure the `--output-json` parameter in the json file.
-
-
-### Throughput test
-The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters will be fed forward to `benchmark_throughput.py`.
-
-The number reported by this test is also stable, so even a slight change in its value may indicate a large difference in performance.
-
-### Serving test
-We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
-
-```
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tensor_parallel_size": 1,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-]
-```
-
-Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server_parameters` includes the command line arguments for vLLM server.
- The `client_parameters` includes the command line arguments for `benchmark_serving.py`.
- The `qps_list` controls the list of qps for test. It will be used to configure the `--request-rate` parameter in `benchmark_serving.py`.
-
-The number reported by this test is less stable than those of the latency and throughput benchmarks (due to randomized sharegpt dataset sampling inside `benchmark_serving.py`), but a large change in this number (e.g. a 5% change) still indicates a significant difference in performance.
-
-WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-result` or other results-saving-related parameters in `serving-tests.json`.
-
-## Visualizing the results
-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
-You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
-If you do not see the table, please wait until the benchmark finishes running.
-The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
-The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@ -1,62 +0,0 @@
-steps:
-  - label: "Wait for container to be ready"
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          - image: badouralix/curl-jq
-            command:
-            - sh
-            - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - wait
-  - label: "A100 Benchmark"
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-            command:
-            - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  # - label: "H100: NVIDIA SMI"
-  #   agents:
-  #     queue: H100
-  #   plugins:
-  #   - docker#v5.11.0:
-  #       image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-  #       command:
-  #       - bash
-  #       - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
-  #       mount-buildkite-agent: true
-  #       propagate-environment: true
-  #       propagate-uid-gid: false
-  #       ipc: host
-  #       gpus: all
-  #       environment:
-  #       - VLLM_USAGE_SOURCE
-  #       - HF_TOKEN
-
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@ -1,27 +0,0 @@
-#!/usr/bin/env bash
-
-# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
-set -euo pipefail
-
-# Install system packages
-apt update
-apt install -y curl jq
-
-# Install minijinja for templating
-curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
-source $HOME/.cargo/env
-
-# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
-if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
-  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
-
-  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
-    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
-  else
-    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
-    exit 0
-  fi
-fi
-
-# Upload sample.yaml
-buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
--- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@ -1,358 +0,0 @@
-#!/bin/bash
-
-# This script should be run inside the CI process
-# This script assumes that we are already inside the vllm/ directory
-# Benchmarking results will be available inside vllm/benchmarks/results/
-
-# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
-# and we still want to see other benchmarking results even when mixtral crashes.
-set -o pipefail
-
-check_gpus() {
-  # check the number of GPUs and GPU type.
-  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
-  if [[ $gpu_count -gt 0 ]]; then
-    echo "GPU found."
-  else
-    echo "Need at least 1 GPU to run benchmarking."
-    exit 1
-  fi
-  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
-  echo "GPU type is $gpu_type"
-}
-
-check_hf_token() {
-  # check if HF_TOKEN is available and valid
-  if [[ -z "$HF_TOKEN" ]]; then
-    echo "Error: HF_TOKEN is not set."
-    exit 1
-  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
-    echo "Error: HF_TOKEN does not start with 'hf_'."
-    exit 1
-  else
-    echo "HF_TOKEN is set and valid."
-  fi
-}
-
-json2args() {
-  # transforms the JSON string to command line args, and '_' is replaced to '-'
-  # example:
-  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
-  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
-  local json_string=$1
-  local args=$(
-    echo "$json_string" | jq -r '
-      to_entries |
-      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
-      join(" ")
-    '
-  )
-  echo "$args"
-}
-
-wait_for_server() {
-  # wait for vllm server to start
-  # return 1 if vllm server crashes
-  timeout 1200 bash -c '
-    until curl localhost:8000/v1/completions; do
-      sleep 1
-    done' && return 0 || return 1
-}
-
-kill_gpu_processes() {
-  # kill all processes on GPU.
-  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
-  if [ -z "$pids" ]; then
-      echo "No GPU processes found."
-  else
-      for pid in $pids; do
-          kill -9 "$pid"
-          echo "Killed process with PID: $pid"
-      done
-
-      echo "All GPU processes have been killed."
-  fi
-
-  # waiting for GPU processes to be fully killed
-  sleep 10
-
-  # remove vllm config file
-  rm -rf ~/.config/vllm
-
-  # Print the GPU memory usage
-  # so that we know if all GPU processes are killed.
-  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
-  # The memory usage should be 0 MB.
-  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
-}
-
-upload_to_buildkite() {
-  # upload the benchmarking results to buildkite
-
-  # if the agent binary is not found, skip uploading the results, exit 0
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip uploading the results."
-    return 0
-  fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
-  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
-}
-
-run_latency_tests() {
-  # run latency tests using `benchmark_latency.py`
-  # $1: a json file specifying latency test cases
-
-  local latency_test_file
-  latency_test_file=$1
-
-  # Iterate over latency tests
-  jq -c '.[]' "$latency_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^latency_ ]]; then
-      echo "In latency-test.json, test_name must start with \"latency_\"."
-      exit 1
-    fi
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # get arguments
-    latency_params=$(echo "$params" | jq -r '.parameters')
-    latency_args=$(json2args "$latency_params")
-
-    # check if there is enough GPU to run the test
-    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
-      continue
-    fi
-
-    latency_command="python3 benchmark_latency.py \
-      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $latency_args"
-
-    echo "Running test case $test_name"
-    echo "Latency command: $latency_command"
-
-    # recoding benchmarking command ang GPU command
-    jq_output=$(jq -n \
-      --arg latency "$latency_command" \
-      --arg gpu "$gpu_type" \
-      '{
-        latency_command: $latency,
-        gpu_type: $gpu
-      }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
-
-    # run the benchmark
-    eval "$latency_command"
-
-    kill_gpu_processes
-
-  done
-}
-
-
-run_throughput_tests() {
-  # run throughput tests using `benchmark_throughput.py`
-  # $1: a json file specifying throughput test cases
-
-  local throughput_test_file
-  throughput_test_file=$1
-
-  # Iterate over throughput tests
-  jq -c '.[]' "$throughput_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^throughput_ ]]; then
-      echo "In throughput-test.json, test_name must start with \"throughput_\"."
-      exit 1
-    fi
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-    # get arguments
-    throughput_params=$(echo "$params" | jq -r '.parameters')
-    throughput_args=$(json2args "$throughput_params")
-
-    # check if there is enough GPU to run the test
-    tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
-      continue
-    fi
-
-    throughput_command="python3 benchmark_throughput.py \
-      --output-json $RESULTS_FOLDER/${test_name}.json \
-      $throughput_args"
-
-    echo "Running test case $test_name"
-    echo "Throughput command: $throughput_command"
-    # recoding benchmarking command ang GPU command
-    jq_output=$(jq -n \
-      --arg command "$throughput_command" \
-      --arg gpu "$gpu_type" \
-      '{
-        throughput_command: $command,
-        gpu_type: $gpu
-      }')
-    echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands"
-
-    # run the benchmark
-    eval "$throughput_command"
-
-    kill_gpu_processes
-
-  done
-}
-
-run_serving_tests() {
-  # run serving tests using `benchmark_serving.py`
-  # $1: a json file specifying serving test cases
-
-  local serving_test_file
-  serving_test_file=$1
-
-  # Iterate over serving tests
-  jq -c '.[]' "$serving_test_file" | while read -r params; do
-    # get the test name, and append the GPU type back to it.
-    test_name=$(echo "$params" | jq -r '.test_name')
-    if [[ ! "$test_name" =~ ^serving_ ]]; then
-      echo "In serving-test.json, test_name must start with \"serving_\"."
-      exit 1
-    fi
-
-    # if TEST_SELECTOR is set, only run the test cases that match the selector
-    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
-      echo "Skip test case $test_name."
-      continue
-    fi
-
-
-    # get client and server arguments
-    server_params=$(echo "$params" | jq -r '.server_parameters')
-    client_params=$(echo "$params" | jq -r '.client_parameters')
-    server_args=$(json2args "$server_params")
-    client_args=$(json2args "$client_params")
-    qps_list=$(echo "$params" | jq -r '.qps_list')
-    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
-    echo "Running over qps list $qps_list"
-
-    # check if there is enough GPU to run the test
-    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
-    if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
-      continue
-    fi
-
-    # check if server model and client model is aligned
-    server_model=$(echo "$server_params" | jq -r '.model')
-    client_model=$(echo "$client_params" | jq -r '.model')
-    if [[ $server_model != "$client_model" ]]; then
-      echo "Server model and client model must be the same. Skip testcase $testname."
-      continue
-    fi
-
-    server_command="python3 \
-      -m vllm.entrypoints.openai.api_server \
-      $server_args"
-
-    # run the server
-    echo "Running test case $test_name"
-    echo "Server command: $server_command"
-    eval "$server_command" &
-
-    # wait until the server is alive
-    wait_for_server
-    if [ $? -eq 0 ]; then
-      echo ""
-      echo "vllm server is up and running."
-    else
-      echo ""
-      echo "vllm failed to start within the timeout period."
-    fi
-
-    # iterate over different QPS
-    for qps in $qps_list; do
-      # remove the surrounding single quote from qps
-      if [[ "$qps" == *"inf"* ]]; then
-        echo "qps was $qps"
-        qps="inf"
-        echo "now qps is $qps"
-      fi
-
-      new_test_name=$test_name"_qps_"$qps
-
-      client_command="python3 benchmark_serving.py \
-        --save-result \
-        --result-dir $RESULTS_FOLDER \
-        --result-filename ${new_test_name}.json \
-        --request-rate $qps \
-        $client_args"
-
-      echo "Running test case $test_name with qps $qps"
-      echo "Client command: $client_command"
-
-      eval "$client_command"
-
-      # record the benchmarking commands
-      jq_output=$(jq -n \
-        --arg server "$server_command" \
-        --arg client "$client_command" \
-        --arg gpu "$gpu_type" \
-        '{
-          server_command: $server,
-          client_command: $client,
-          gpu_type: $gpu
-        }')
-      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
-
-    done
-
-    # clean up
-    kill_gpu_processes
-  done
-}
-
-main() {
-  check_gpus
-  check_hf_token
-
-  # dependencies
-  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-  (which jq) || (apt-get update && apt-get -y install jq)
-
-  # get the current IP address, required by benchmark_serving.py
-  export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
-  # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOG_LEVEL="WARNING"
-
-  # prepare for benchmarking
-  cd benchmarks || exit 1
-  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-  declare -g RESULTS_FOLDER=results/
-  mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
-
-  # benchmarking
-  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
-  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
-  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
-
-
-  # postprocess benchmarking results
-  pip install tabulate pandas
-  python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py
-
-  upload_to_buildkite
-}
-
-main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@ -1,192 +0,0 @@
-import json
-import os
-from pathlib import Path
-
-import pandas as pd
-from tabulate import tabulate
-
-results_folder = Path("results/")
-
-# latency results and the keys that will be printed into markdown
-latency_results = []
-latency_column_mapping = {
-    "test_name": "Test name",
-    "gpu_type": "GPU",
-    "avg_latency": "Mean latency (ms)",
-    # "P10": "P10 (s)",
-    # "P25": "P25 (s)",
-    "P50": "Median latency (ms)",
-    # "P75": "P75 (s)",
-    # "P90": "P90 (s)",
-    "P99": "P99 latency (ms)",
-}
-
-# throughput tests and the keys that will be printed into markdown
-throughput_results = []
-throughput_results_column_mapping = {
-    "test_name": "Test name",
-    "gpu_type": "GPU",
-    # "num_requests": "# of req.",
-    # "total_num_tokens": "Total # of tokens",
-    # "elapsed_time": "Elapsed time (s)",
-    "requests_per_second": "Tput (req/s)",
-    # "tokens_per_second": "Tput (tok/s)",
-}
-
-# serving results and the keys that will be printed into markdown
-serving_results = []
-serving_column_mapping = {
-    "test_name": "Test name",
-    "gpu_type": "GPU",
-    # "completed": "# of req.",
-    "request_throughput": "Tput (req/s)",
-    # "input_throughput": "Input Tput (tok/s)",
-    # "output_throughput": "Output Tput (tok/s)",
-    "mean_ttft_ms": "Mean TTFT (ms)",
-    "median_ttft_ms": "Median TTFT (ms)",
-    "p99_ttft_ms": "P99 TTFT (ms)",
-    # "mean_tpot_ms": "Mean TPOT (ms)",
-    # "median_tpot_ms": "Median",
-    # "p99_tpot_ms": "P99",
-    "mean_itl_ms": "Mean ITL (ms)",
-    "median_itl_ms": "Median ITL (ms)",
-    "p99_itl_ms": "P99 ITL (ms)",
-}
-
-
-def read_markdown(file):
-    if os.path.exists(file):
-        with open(file, "r") as f:
-            return f.read() + "\n"
-    else:
-        return f"{file} not found.\n"
-
-
-def results_to_json(latency, throughput, serving):
-    return json.dumps({
-        'latency': latency.to_dict(),
-        'throughput': throughput.to_dict(),
-        'serving': serving.to_dict()
-    })
-
-
-if __name__ == "__main__":
-
-    # collect results
-    for test_file in results_folder.glob("*.json"):
-
-        with open(test_file, "r") as f:
-            raw_result = json.loads(f.read())
-
-        if "serving" in str(test_file):
-            # this result is generated via `benchmark_serving.py`
-
-            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
-                command = json.loads(f.read())
-            raw_result.update(command)
-
-            # update the test name of this result
-            raw_result.update({"test_name": test_file.stem})
-
-            # add the result to serving_results
-            serving_results.append(raw_result)
-            continue
-
-        elif "latency" in f.name:
-            # this result is generated via `benchmark_latency.py`
-
-            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
-                command = json.loads(f.read())
-            raw_result.update(command)
-
-            # update the test name of this result
-            raw_result.update({"test_name": test_file.stem})
-
-            # get different percentiles
-            for perc in [10, 25, 50, 75, 90, 99]:
-                # Multiply by 1000 to convert the time unit from s to ms
-                raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
-            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
-
-            # add the result to latency_results
-            latency_results.append(raw_result)
-            continue
-
-        elif "throughput" in f.name:
-            # this result is generated via `benchmark_throughput.py`
-
-            # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands"), "r") as f:
-                command = json.loads(f.read())
-            raw_result.update(command)
-
-            # update the test name of this result
-            raw_result.update({"test_name": test_file.stem})
-
-            # add the result to throughput_results
-            throughput_results.append(raw_result)
-            continue
-
-        print(f"Skipping {test_file}")
-
-    latency_results = pd.DataFrame.from_dict(latency_results)
-    serving_results = pd.DataFrame.from_dict(serving_results)
-    throughput_results = pd.DataFrame.from_dict(throughput_results)
-
-    raw_results_json = results_to_json(latency_results, throughput_results,
-                                       serving_results)
-
-    # remapping the key, for visualization purpose
-    if not latency_results.empty:
-        latency_results = latency_results[list(
-            latency_column_mapping.keys())].rename(
-                columns=latency_column_mapping)
-    if not serving_results.empty:
-        serving_results = serving_results[list(
-            serving_column_mapping.keys())].rename(
-                columns=serving_column_mapping)
-    if not throughput_results.empty:
-        throughput_results = throughput_results[list(
-            throughput_results_column_mapping.keys())].rename(
-                columns=throughput_results_column_mapping)
-
-    processed_results_json = results_to_json(latency_results,
-                                             throughput_results,
-                                             serving_results)
-
-    # get markdown tables
-    latency_md_table = tabulate(latency_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    serving_md_table = tabulate(serving_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    throughput_md_table = tabulate(throughput_results,
-                                   headers='keys',
-                                   tablefmt='pipe',
-                                   showindex=False)
-
-    # document the result
-    with open(results_folder / "benchmark_results.md", "w") as f:
-
-        results = read_markdown(
-            "../.buildkite/nightly-benchmarks/tests/descriptions.md")
-        results = results.format(
-            latency_tests_markdown_table=latency_md_table,
-            throughput_tests_markdown_table=throughput_md_table,
-            serving_tests_markdown_table=serving_md_table,
-            benchmarking_results_in_json_string=processed_results_json)
-        f.write(results)
-
-    # document benchmarking results in json
-    with open(results_folder / "benchmark_results.json", "w") as f:
-
-        results = latency_results.to_dict(
-            orient='records') + throughput_results.to_dict(
-                orient='records') + serving_results.to_dict(orient='records')
-        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@ -1,17 +0,0 @@
-#!/bin/sh
-TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
-URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
-
-retries=0
-while [ $retries -lt 1000 ]; do
-    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
-        exit 0
-    fi
-
-    echo "Waiting for image to be available..."
-
-    retries=$((retries + 1))
-    sleep 5
-done
-
-exit 1
--- a/.buildkite/nightly-benchmarks/tests/descriptions.md
+++ b/.buildkite/nightly-benchmarks/tests/descriptions.md
@ -1,67 +0,0 @@
-
-## Latency tests
-
-This test suite aims to test vllm's end-to-end latency under a controlled setup.
-
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
-
-### Latency benchmarking results
-
-{latency_tests_markdown_table}
-
-## Throughput tests
-
-This test suite aims to test vllm's throughput.
-
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.
-
-### Throughput benchmarking results
-
-{throughput_tests_markdown_table}
-
-## Serving tests
-
-This test suite aims to test vllm's real serving metrics.
-
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (queries per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
-
-### Serving benchmarking results
-
-{serving_tests_markdown_table}
-
-## json version of the benchmarking tables
-
-This section contains the data of the markdown tables above in JSON format. 
-You can load the benchmarking tables into pandas dataframes as follows:
-
-```python
-import json
-import pandas as pd
-
-benchmarking_results_json = """The json string"""
-benchmarking_results = json.loads(benchmarking_results_json)
-latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
-throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
-serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
-```
-
-The json string for all benchmarking tables:
-```json
-{benchmarking_results_in_json_string}
-```
-
-You can also check the raw experiment data in the Artifact tab of the Buildkite page.
-
--- a/.buildkite/nightly-benchmarks/tests/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
@ -1,32 +0,0 @@
-[
-    {
-        "test_name": "latency_llama8B_tp1",
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "num_iters_warmup": 5,
-            "num_iters": 15
-        }
-    },
-    {
-        "test_name": "latency_llama70B_tp4",
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15
-        }
-    },
-    {
-        "test_name": "latency_mixtral8x7B_tp2",
-        "parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "load_format": "dummy",
-            "num-iters-warmup": 5,
-            "num-iters": 15
-        }
-    }
-]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
@ -1,59 +0,0 @@
-[
-    {
-        "test_name": "serving_llama8B_tp1_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tensor_parallel_size": 1,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_llama70B_tp4_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    },
-    {
-        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
-        "qps_list": [1, 4, 16, "inf"],
-        "server_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "swap_space": 16,
-            "disable_log_stats": "",
-            "disable_log_requests": "",
-            "load_format": "dummy"
-        },
-        "client_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
-        }
-    }
-]
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
@ -1,35 +0,0 @@
-[
-    {
-        "test_name": "throughput_llama8B_tp1",
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tensor_parallel_size": 1,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    },
-    {
-        "test_name": "throughput_llama70B_tp4",
-        "parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tensor_parallel_size": 4,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    },
-    {
-        "test_name": "throughput_mixtral8x7B_tp2",
-        "parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
-            "load_format": "dummy",
-            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "backend": "vllm"
-        }
-    }
-]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,21 +0,0 @@
-steps:
-  - block: "Build wheels"
-
-  - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" 
-    agents:
-      queue: cpu_queue
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
-      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
-    matrix:
-      setup:
-        cuda_version:
-          - "11.8.0"
-          - "12.1.0"
-        python_version:
-          - "3.8"
-          - "3.9"
-          - "3.10"
-          - "3.11"
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@ -50,16 +50,16 @@ echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
 echo '```' >> benchmark_results.md
-tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
+tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
 echo '```' >> benchmark_results.md

 # if the agent binary is not found, skip uploading the results, exit 0
-if [ ! -f /usr/bin/buildkite-agent ]; then
+if [ ! -f /workspace/buildkite-agent ]; then
    exit 0
 fi

 # upload the results to buildkite
-buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
+/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

 # exit with the exit code of the benchmarks
 if [ $bench_latency_exit_code -ne 0 ]; then
@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
 fi

 rm ShareGPT_V3_unfiltered_cleaned_split.json
-buildkite-agent artifact upload "*.json"
+/workspace/buildkite-agent artifact upload "*.json"
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@ -4,25 +4,21 @@ set -ex

 # Try building the docker image
 docker build -t cpu-test -f Dockerfile.cpu .
-docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .

 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
+remove_docker_container() { docker rm -f cpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container

 # Run the image
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
-  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
-  --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2
+docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test

 # offline inference
 docker exec cpu-test bash -c "python3 examples/offline_inference.py"
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

 # Run basic model test
 docker exec cpu-test bash -c "cd tests;
  pip install pytest Pillow protobuf
+  bash ../.buildkite/download-images.sh
  cd ../
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py" # Mamba on CPU is not supported
+  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@ -1,14 +0,0 @@
-# This script builds the OpenVINO docker image and runs the offline inference inside the container.
-# It serves as a sanity check for compilation and basic model usage.
-set -ex
-
-# Try building the docker image
-docker build -t openvino-test -f Dockerfile.openvino .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f openvino-test || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@ -1,14 +0,0 @@
-# This script builds the XPU docker image and runs the offline inference inside the container.
-# It serves as a sanity check for compilation and basic model usage.
-set -ex
-
-# Try building the docker image
-docker build -t xpu-test -f Dockerfile.xpu .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f xpu-test || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -1,10 +1,7 @@
 # In this file, you can add more tests to run either by adding a new step or
 # adding a new command to an existing step. See different options here for examples.
-
-# This script will be fed into the Jinja template in `test-template-aws.j2` at
-# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 
-# to generate the final pipeline yaml file.
-
+# This script will be fed into the Jinja template in `test-template.j2` to generate
+# the final pipeline yaml file.

 steps:
 - label: Regression Test
@ -27,63 +24,35 @@ steps:

 - label: Core Test
  mirror_hardwares: [amd]
-  commands: 
-  - pytest -v -s core
-  - pytest -v -s distributed/test_parallel_state.py
+  command: pytest -v -s core

 - label: Distributed Comm Ops Test
  #mirror_hardwares: [amd]
+  command: pytest -v -s distributed/test_comm_ops.py
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
-  commands:
-  - pytest -v -s distributed/test_comm_ops.py
-  - pytest -v -s distributed/test_shm_broadcast.py

- label: Distributed Tests (2 GPUs)
+- label: Distributed Tests
  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  commands:
-  - bash ../.buildkite/download-images.sh
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist.py 

- label: Distributed Tests (4 GPUs)
+- label: Distributed Tests (Multiple Groups)
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  commands:
  - pytest -v -s distributed/test_pynccl.py
-  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
-  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
-
- label: Pipeline Parallelism Test
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  commands:
-  - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
-  - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
-  - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
-  - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
-  - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
-

 - label: Engine Test
  mirror_hardwares: [amd]
@ -93,8 +62,9 @@ steps:
  mirror_hardwares: [amd]

  commands:
-  - pytest -v -s entrypoints/llm
-  - pytest -v -s entrypoints/openai
+  - pytest -v -s test_inputs.py
+  - pytest -v -s entrypoints -m llm
+  - pytest -v -s entrypoints -m openai

 - label: Examples Test
  working_dir: "/vllm-workspace/examples"
@ -109,31 +79,22 @@ steps:
    - python3 llava_example.py
    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
-  #mirror_hardwares: [amd]
-  commands:
-    - bash ../.buildkite/download-images.sh
-    - pytest -v -s test_inputs.py
-    - pytest -v -s multimodal
-
 - label: Kernels Test %N
  #mirror_hardwares: [amd]
-  commands:
-    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-    - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

 - label: Models Test
  #mirror_hardwares: [amd]
  commands:
-    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-    - pytest -v -s models -m \"not vlm\"
+    - bash ../.buildkite/download-images.sh
+    - pytest -v -s models --ignore=models/test_llava.py

- label: Vision Language Models Test
+- label: Llava Test
  mirror_hardwares: [amd]
  commands:
    - bash ../.buildkite/download-images.sh
-    - pytest -v -s models -m vlm
+    - pytest -v -s models/test_llava.py

 - label: Prefix Caching Test
  mirror_hardwares: [amd]
@ -157,10 +118,7 @@ steps:

 - label: Speculative decoding tests
  #mirror_hardwares: [amd]
-  commands:
-    # See https://github.com/vllm-project/vllm/issues/5152
-    - export VLLM_ATTENTION_BACKEND=XFORMERS
-    - pytest -v -s spec_decode
+  command: pytest -v -s spec_decode

 - label: LoRA Test %N
  #mirror_hardwares: [amd]
@ -172,10 +130,14 @@ steps:
  num_gpus: 4
  # This test runs llama 13B, so it is required to run on 4 GPUs.
  commands:
-    # FIXIT: find out which code initialize cuda before running the test
-    # before the fix, we need to use spawn to test it
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s -x lora/test_long_context.py
+    # Temporarily run this way because we cannot clean up GPU mem usage
+    # for multi GPU tests.
+    # TODO(sang): Fix it.
+    - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
+    - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
+    - pytest -v -s lora/test_long_context.py::test_self_consistency
+    - pytest -v -s lora/test_long_context.py::test_quality
+    - pytest -v -s lora/test_long_context.py::test_max_len

 - label: Tensorizer Test
  #mirror_hardwares: [amd]
@ -189,15 +151,6 @@ steps:
  #mirror_hardwares: [amd]
  command: pytest -v -s quantization

- label: Tracing Test
-  commands: 
-    - "pip install \
-        opentelemetry-sdk \
-        opentelemetry-api \
-        opentelemetry-exporter-otlp \
-        opentelemetry-semantic-conventions-ai"
-    - pytest -v -s tracing
-
 - label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
@ -205,39 +158,9 @@ steps:
  - pip install aiohttp
  - bash run-benchmarks.sh

- label: LM Eval Small Models
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  commands:
-  - pip install lm-eval
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-small.txt -t 1
-
- label: LM Eval Large Models
-  gpu: a100
-  num_gpus: 4
-  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-  commands:
-  - pip install lm-eval
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-large.txt -t 4
-
 - label: Documentation Build
  working_dir: "/vllm-workspace/test_docs/docs"
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html
-
- label: Distributed Tests (A100)
-  gpu: a100
-  num_gpus: 4
-  commands: 
-  # NOTE: don't test llama model here, it seems hf implementation is buggy
-  # see https://github.com/vllm-project/vllm/pull/5689 for details
-  - pytest -v -s distributed/test_custom_all_reduce.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s -x lora/test_mixtral.py
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@ -0,0 +1,59 @@
+{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
+{% set default_working_dir = "/vllm-workspace/tests" %}
+
+steps:
+  - label: ":docker: build image"
+    agents:
+      queue: cpu_queue
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+      - "docker push {{ docker_image }}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+  - wait
+
+  {% for step in steps %}
+  - label: "{{ step.label }}"
+    agents:
+      {% if step.no_gpu %}
+      queue: cpu_queue
+      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
+      queue: gpu_4_queue
+      {% else %}
+      queue: gpu_1_queue
+      {% endif %}
+    soft_fail: true
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+    plugins:
+      - docker#v5.2.0:
+          image: {{ docker_image }}
+          always-pull: true
+          propagate-environment: true
+          {% if not step.no_gpu %}
+          gpus: all
+          {% endif %}
+          command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}"]
+          environment:
+            - VLLM_USAGE_SOURCE=ci-test
+            - HF_TOKEN
+            {% if step.label == "Speculative decoding tests" %}
+            - VLLM_ATTENTION_BACKEND=XFORMERS
+            {% endif %}
+          volumes:
+            - /dev/shm:/dev/shm
+  {% endfor %}
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@ -0,0 +1,95 @@
+{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
+{% set default_num_gpu = 1 %}
+{% set default_working_dir = "/vllm-workspace/tests" %}
+
+steps:
+  - label: ":docker: build image"
+    commands: 
+      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+      - "docker push {{ docker_image }}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+  - wait
+
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+    {% for step in steps %}
+    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+      - label: "AMD: {{ step.label }}"
+        agents:
+          queue: amd
+        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
+        env:
+          DOCKER_BUILDKIT: "1"
+    {% endif %}
+    {% endfor %}
+
+  - label: "Neuron Test"
+    depends_on: ~
+    agents:
+      queue: neuron
+    command: bash .buildkite/run-neuron-test.sh
+    soft_fail: true
+
+  - label: "Intel Test"
+    depends_on: ~
+    agents:
+      queue: intel
+    command: bash .buildkite/run-cpu-test.sh
+
+  {% for step in steps %}
+  - label: "{{ step.label }}"
+    agents:
+      queue: kubernetes
+    soft_fail: {{ step.soft_fail or false }}
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+    plugins:
+      - kubernetes:
+          podSpec:
+            {% if step.num_gpus %}
+            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
+            {% endif %}
+            volumes:
+              - name: dshm
+                emptyDir:
+                  medium: Memory
+            containers:
+              - image: "{{ docker_image }}"
+                command: ["bash"]
+                args:
+                - '-c'
+                - "'cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}'"
+                {% if not step.no_gpu %}
+                resources:
+                  requests:
+                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
+                  limits:
+                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
+                {% endif %}
+                env:
+                  - name: VLLM_USAGE_SOURCE
+                    value: ci-test
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+                volumeMounts:
+                  - mountPath: /dev/shm
+                    name: dshm
+  {% endfor %}
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@ -47,5 +47,5 @@ jobs:
        mypy vllm/model_executor  --config-file pyproject.toml
        mypy vllm/lora --config-file pyproject.toml
        mypy vllm/logging --config-file pyproject.toml
-        mypy tests --config-file pyproject.toml
+        mypy vllm/model_executor --config-file pyproject.toml

--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@ -25,7 +25,7 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
+        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
    - name: Analysing the code with ruff
      run: |
        ruff .
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -2,8 +2,7 @@ cmake_minimum_required(VERSION 3.21)

 project(vllm_extensions LANGUAGES CXX)

-# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
-set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
+option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")

 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
@ -33,7 +32,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 # versions are derived from Dockerfile.rocm
 #
 set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
+set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
+set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")

 #
 # Try to find python package with an executable that exactly matches
@ -66,6 +66,19 @@ endif()
 #
 find_package(Torch REQUIRED)

+#
+# Normally `torch.utils.cpp_extension.CUDAExtension` would add
+# `libtorch_python.so` for linking against an extension. Torch's cmake
+# configuration does not include this library (presumably since the cmake
+# config is used for standalone C++ binaries that link against torch).
+# The `libtorch_python.so` library defines some of the glue code between
+# torch/python via pybind and is required by VLLM extensions for this
+# reason. So, add it manually with `find_library` using torch's
+# installed library path.
+#
+find_library(torch_python_LIBRARY torch_python PATHS
+  "${TORCH_INSTALL_PREFIX}/lib")
+
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@ -98,11 +111,18 @@ elseif(HIP_FOUND)
  # .hip extension automatically, HIP must be enabled explicitly.
  enable_language(HIP)

-  # ROCm 5.X and 6.X
-  if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
-      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
-    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} "
-      "expected for ROCm build, saw ${Torch_VERSION} instead.")
+  # ROCm 5.x
+  if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
+      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
+    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
+      "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
+  endif()
+
+  # ROCm 6.x
+  if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
+      NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
+    message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
+      "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
  endif()
 else()
  message(FATAL_ERROR "Can't find CUDA or HIP installation.")
@ -151,7 +171,7 @@ set(VLLM_EXT_SRC
  "csrc/quantization/fp8/common.cu"
  "csrc/cuda_utils_kernels.cu"
  "csrc/moe_align_block_size_kernels.cu"
-  "csrc/torch_bindings.cpp")
+  "csrc/pybind.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
  include(FetchContent)
@ -171,11 +191,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin.cu"
    "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-    "csrc/quantization/fp8/fp8_marlin.cu"
    "csrc/custom_all_reduce.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")

  #
  # The CUTLASS kernels for Hopper require sm90a to be enabled.
@ -183,7 +202,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
    set_source_files_properties(
-          "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
+          "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
          PROPERTIES
          COMPILE_FLAGS
          "-gencode arch=compute_90a,code=sm_90a")
@ -199,7 +218,6 @@ define_gpu_extension_target(
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
-  USE_SABI 3
  WITH_SOABI)

 #
@ -207,7 +225,7 @@ define_gpu_extension_target(
 #

 set(VLLM_MOE_EXT_SRC
-  "csrc/moe/torch_bindings.cpp"
+  "csrc/moe/moe_ops.cpp"
  "csrc/moe/topk_softmax_kernels.cu")

 define_gpu_extension_target(
@ -217,7 +235,6 @@ define_gpu_extension_target(
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
-  USE_SABI 3
  WITH_SOABI)

 #
@ -232,7 +249,7 @@ set(VLLM_PUNICA_EXT_SRC
  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
  "csrc/punica/punica_ops.cu"
-  "csrc/punica/torch_bindings.cpp")
+  "csrc/punica/punica_pybind.cpp")

 #
 # Copy GPU compilation flags+update for punica
@ -269,7 +286,6 @@ if (VLLM_PUNICA_GPU_ARCHES)
    SOURCES ${VLLM_PUNICA_EXT_SRC}
    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
-    USE_SABI 3
    WITH_SOABI)
 else()
  message(WARNING "Unable to create _punica_C target because none of the "
--- a/101
+++ b/101
@ -5,35 +5,18 @@
 # docs/source/dev/dockerfile/dockerfile.rst and
 # docs/source/assets/dev/dockerfile-stages-dependency.png

-ARG CUDA_VERSION=12.4.1
 #################### BASE BUILD IMAGE ####################
 # prepare basic build environment
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base
-
-ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
-    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
-    && apt-get update -y \
-    && apt-get install -y ccache software-properties-common \
-    && add-apt-repository ppa:deadsnakes/ppa \
-    && apt-get update -y \
-    && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \
-    && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \
-    && python3 --version \
-    && python3 -m pip --version
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev

 RUN apt-get update -y \
-    && apt-get install -y python3-pip git curl sudo
+    && apt-get install -y python3-pip git

 # Workaround for https://github.com/openai/triton/issues/2507 and
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/

 WORKDIR /workspace

@ -41,11 +24,12 @@ WORKDIR /workspace
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-cuda.txt
+    pip install -r requirements-cuda.txt

-COPY requirements-mamba.txt requirements-mamba.txt
-RUN python3 -m pip install packaging
-RUN python3 -m pip install -r requirements-mamba.txt
+# install development dependencies
+COPY requirements-dev.txt requirements-dev.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-dev.txt

 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@ -55,16 +39,14 @@ ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################

-#################### WHEEL BUILD IMAGE ####################
-FROM base AS build

-ARG PYTHON_VERSION=3
+#################### WHEEL BUILD IMAGE ####################
+FROM dev AS build

 # install build dependencies
 COPY requirements-build.txt requirements-build.txt
-
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-build.txt
+    pip install -r requirements-build.txt

 # install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache
@ -88,28 +70,10 @@ ENV NVCC_THREADS=$nvcc_threads
 # make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1

-ARG USE_SCCACHE
-# if USE_SCCACHE is set, use sccache to speed up compilation
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$USE_SCCACHE" = "1" ]; then \
-        echo "Installing sccache..." \
-        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
-        && tar -xzf sccache.tar.gz \
-        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
-        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
-        && export SCCACHE_BUCKET=vllm-build-sccache \
-        && export SCCACHE_REGION=us-west-2 \
-        && sccache --show-stats \
-        && python3 setup.py bdist_wheel --dist-dir=dist \
-        && sccache --show-stats; \
-    fi
-
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
-    if [ "$USE_SCCACHE" != "1" ]; then \
-        python3 setup.py bdist_wheel --dist-dir=dist; \
-    fi
+    python3 setup.py bdist_wheel --dist-dir=dist

 # check the size of the wheel, we cannot upload wheels larger than 100MB
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
@ -117,36 +81,9 @@ RUN python3 check-wheel-size.py dist

 #################### EXTENSION Build IMAGE ####################

-#################### DEV IMAGE ####################
-FROM base as dev
-
-COPY requirements-lint.txt requirements-lint.txt
-COPY requirements-test.txt requirements-test.txt
-COPY requirements-dev.txt requirements-dev.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-dev.txt
-
-#################### DEV IMAGE ####################
-#################### MAMBA Build IMAGE ####################
-FROM dev as mamba-builder
-# max jobs used for build
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-
-WORKDIR /usr/src/mamba
-
-COPY requirements-mamba.txt requirements-mamba.txt
-
-# Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel -r requirements-mamba.txt \
-    --no-build-isolation --no-deps --no-cache-dir
-
-#################### MAMBA Build IMAGE ####################
-
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
-FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
-ARG CUDA_VERSION=12.4.1
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
 WORKDIR /vllm-workspace

 RUN apt-get update -y \
@ -156,16 +93,12 @@ RUN apt-get update -y \
 # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
 # this won't be needed for future versions of this docker image
 # or future versions of triton.
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/

 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install dist/*.whl --verbose
-
-RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamba \
-    --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
+    pip install dist/*.whl --verbose
 #################### vLLM installation IMAGE ####################


@ -178,7 +111,7 @@ ADD . /vllm-workspace/

 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-dev.txt
+    pip install -r requirements-dev.txt

 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
@ -195,7 +128,7 @@ FROM vllm-base AS vllm-openai

 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+    pip install accelerate hf_transfer modelscope

 ENV VLLM_USAGE_SOURCE production-docker-image

--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@ -3,21 +3,11 @@
 FROM ubuntu:22.04 AS cpu-test-1

 RUN apt-get update  -y \
-    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
+    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

-# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
-# intel-openmp provides additional performance improvement vs. openmp
-# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
-RUN pip install intel-openmp
-
-ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
-
-
-RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
-
 RUN pip install --upgrade pip \
-    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
+    && pip install wheel packaging ninja setuptools>=49.4.0 numpy

 FROM cpu-test-1 AS build

@ -27,14 +17,10 @@ WORKDIR /workspace/vllm

 RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

-# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
-ARG VLLM_CPU_DISABLE_AVX512
-ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
-
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

 WORKDIR /workspace/

-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+RUN ln -s /workspace/vllm/tests  && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+CMD ["/bin/bash"]
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@ -28,7 +28,7 @@ COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
 RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt

-ENV VLLM_TARGET_DEVICE neuron
+ENV VLLM_BUILD_WITH_NEURON 1
 RUN cd /app/vllm \
    && pip install -e . \
    && cd ..
--- a/Dockerfile.openvino
+++ b/Dockerfile.openvino
@ -1,26 +0,0 @@
-# The vLLM Dockerfile is used to construct vLLM image that can be directly used
-# to run the OpenAI compatible server.
-
-FROM ubuntu:22.04 AS dev
-
-RUN apt-get update -y && \
-    apt-get install -y python3-pip git
-WORKDIR /workspace
-
-# copy requirements
-COPY requirements-build.txt /workspace/vllm/
-COPY requirements-common.txt /workspace/vllm/
-COPY requirements-openvino.txt /workspace/vllm/
-
-COPY vllm/ /workspace/vllm/vllm
-COPY setup.py /workspace/vllm/
-
-# install build requirements
-RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
-# build vLLM with OpenVINO backend
-RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
-
-COPY examples/ /workspace/vllm/examples
-COPY benchmarks/ /workspace/vllm/benchmarks
-
-CMD ["/bin/bash"]
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@ -1,22 +0,0 @@
-FROM mambaorg/micromamba
-ARG MAMBA_DOCKERFILE_ACTIVATE=1
-USER root
-
-RUN apt-get update  -y     && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
-
-# Some packages in requirements-cpu are installed here
-# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
-# Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults     python=3.10     pytorch-cpu=2.1.2     torchvision-cpu=0.16.2    &&     micromamba clean --all --yes
-
-COPY ./ /workspace/vllm
-
-WORKDIR /workspace/vllm
-
-# These packages will be in rocketce eventually
-RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
-
-RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
-
-WORKDIR /vllm-workspace
-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@ -1,35 +1,35 @@
-# Default ROCm 6.1 base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+# default base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

-# Tested and supported base rocm/pytorch images
-ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
-    ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
-    ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+FROM $BASE_IMAGE
+
+ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"
+
+RUN echo "Base image is $BASE_IMAGE"
+
+# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
+# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

-# Default ROCm ARCHes to build vLLM for.
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"

-# Whether to build CK-based flash-attention
-# If 0, will not build flash attention
-# This is useful for gfx target where flash-attention is not supported
-# (i.e. those that do not appear in `FA_GFX_ARCHS`)
-# Triton FA is used by default on ROCm now so this is unnecessary.
-ARG BUILD_FA="1"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
+RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
+
 ARG FA_BRANCH="ae7928c"
+RUN echo "FA_BRANCH is $FA_BRANCH"

-# Whether to build triton on rocm
+# Whether to build flash-attention.
+# If 0, flash-attention will not be built.
+# This is useful for gfx targets where flash-attention is not supported.
+# In that case, we need to use the Python reference attention implementation in vLLM.
+ARG BUILD_FA="1"
+
+# whether to build triton on rocm
 ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="0ef1848"
-
-### Base image build stage
-FROM $BASE_IMAGE AS base
-
-# Import arg(s) defined before this build stage
-ARG PYTORCH_ROCM_ARCH

 # Install some basic utilities
 RUN apt-get update && apt-get install python3 python3-pip -y
+
+# Install additional system packages and tools
 RUN apt-get update && apt-get install -y \
    curl \
    ca-certificates \
@ -40,165 +40,76 @@ RUN apt-get update && apt-get install -y \
    build-essential \
    wget \
    unzip \
+    nvidia-cuda-toolkit \
    tmux \
-    ccache \
 && rm -rf /var/lib/apt/lists/*

-# When launching the container, mount the code directory to /vllm-workspace
+### Mount Point ###
+# When launching the container, mount the code directory to ${APP_MOUNT} (default: /vllm-workspace)
 ARG APP_MOUNT=/vllm-workspace
+VOLUME [ ${APP_MOUNT} ]
 WORKDIR ${APP_MOUNT}

-RUN pip install --upgrade pip
-# Remove sccache so it doesn't interfere with ccache
-# TODO: implement sccache support across components
-RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
-# Install torch == 2.4.0 on ROCm
-RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-5.7"*) \
-            pip uninstall -y torch torchaudio torchvision \
-            && pip install --no-cache-dir --pre \
-                torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
-                torchvision==0.19.0.dev20240612 \
-               --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
-        *"rocm-6.0"*) \
-            pip uninstall -y torch torchaudio torchvision \
-            && pip install --no-cache-dir --pre \
-                torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
-                torchvision==0.19.0.dev20240612 \
-               --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
-        *"rocm-6.1"*) \
-            pip uninstall -y torch torchaudio torchvision \
-            && pip install --no-cache-dir --pre \
-                torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
-                torchvision==0.19.0.dev20240612 \
-               --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
-        *) ;; esac
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas

 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
 ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
 ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
 ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:

-ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
-ENV CCACHE_DIR=/root/.cache/ccache
-
-
-### AMD-SMI build stage
-FROM base AS build_amdsmi
-# Build amdsmi wheel always
-RUN cd /opt/rocm/share/amd_smi \
-    && pip wheel . --wheel-dir=/install
-
-
-### Flash-Attention wheel build stage
-FROM base AS build_fa
-ARG BUILD_FA
-ARG FA_GFX_ARCHS
-ARG FA_BRANCH
-# Build ROCm flash-attention wheel if `BUILD_FA = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    if [ "$BUILD_FA" = "1" ]; then \
-    mkdir -p libs \
+# Install ROCm flash-attention
+RUN if [ "$BUILD_FA" = "1" ]; then \
+    mkdir libs \
    && cd libs \
    && git clone https://github.com/ROCm/flash-attention.git \
    && cd flash-attention \
-    && git checkout "${FA_BRANCH}" \
+    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
-    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-5.7"*) \
-            export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \
-            && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \
-        *) ;; esac \
-    && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
-    # Create an empty directory otherwise as later build stages expect one
-    else mkdir -p /install; \
+    && export GPU_ARCHS=${FA_GFX_ARCHS} \
+    && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
+        patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
+    && python3 setup.py install \
+    && cd ..; \
    fi

-
-### Triton wheel build stage
-FROM base AS build_triton
-ARG BUILD_TRITON
-ARG TRITON_BRANCH
-# Build triton wheel if `BUILD_TRITON = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    if [ "$BUILD_TRITON" = "1" ]; then \
-    mkdir -p libs \
-    && cd libs \
-    && git clone https://github.com/OpenAI/triton.git \
-    && cd triton \
-    && git checkout "${TRITON_BRANCH}" \
-    && cd python \
-    && python3 setup.py bdist_wheel --dist-dir=/install; \
-    # Create an empty directory otherwise as later build stages expect one
-    else mkdir -p /install; \
-    fi
-
-
-### Final vLLM build stage
-FROM base AS final
-# Import the vLLM development directory from the build context
-COPY . .
-
 # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
-# Manually remove it so that later steps of numpy upgrade can continue
-RUN case "$(which python3)" in \
-        *"/opt/conda/envs/py_3.9"*) \
-            rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
-        *) ;; esac
+# Manually remove it so that later steps of the numpy upgrade can continue
+RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
+    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

-# Package upgrades for useful functionality or to avoid dependency issues
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install --upgrade numba scipy huggingface-hub[cli]
+# build triton
+RUN if [ "$BUILD_TRITON" = "1" ]; then \
+    mkdir -p libs \
+    && cd libs \
+    && pip uninstall -y triton \
+    && git clone https://github.com/ROCm/triton.git \
+    && cd triton/python \
+    && pip3 install . \
+    && cd ../..; \
+    fi

-# Make sure punica kernels are built (for LoRA)
+WORKDIR /vllm-workspace
+COPY . .
+
+#RUN python3 -m pip install pynvml # to be removed eventually
+RUN python3 -m pip install --upgrade pip numba
+
+# make sure punica kernels are built (for LoRA)
 ENV VLLM_INSTALL_PUNICA_KERNELS=1
 # Workaround for ray >= 2.10.0
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
-# Silences the HF Tokenizers warning
-ENV TOKENIZERS_PARALLELISM=false

-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install -U -r requirements-rocm.txt \
-    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.0"*) \
-            patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
-        *"rocm-6.1"*) \
-            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
-            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \
-            && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
-            # Prevent interference if torch bundles its own HIP runtime
-            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
-        *) ;; esac \
-    && python3 setup.py clean --all \
-    && python3 setup.py develop
+ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so

-# Copy amdsmi wheel into final image
-RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
-    mkdir -p libs \
-    && cp /install/*.whl libs \
-    # Preemptively uninstall to avoid same-version no-installs
-    && pip uninstall -y amdsmi;
-
-# Copy triton wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && pip uninstall -y triton; fi
-
-# Copy flash-attn wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && pip uninstall -y flash-attn; fi
-
-# Install wheels that were built to the final image
 RUN --mount=type=cache,target=/root/.cache/pip \
-    if ls libs/*.whl; then \
-    pip install libs/*.whl; fi
+    pip install -U -r requirements-rocm.txt \
+    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
+    && python3 setup.py install \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+    && cd ..
+

 CMD ["/bin/bash"]
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@ -1,19 +0,0 @@
-ARG NIGHTLY_DATE="20240601"
-ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
-
-FROM $BASE_IMAGE
-
-WORKDIR /workspace
-COPY . /workspace/vllm
-
-ENV VLLM_TARGET_DEVICE="tpu"
-# Install aiohttp separately to avoid build errors.
-RUN pip install aiohttp
-# Install the TPU and Pallas dependencies.
-RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-
-# Build vLLM.
-RUN cd /workspace/vllm && python setup.py develop
-
-CMD ["/bin/bash"]
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@ -1,22 +0,0 @@
-FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
-
-RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
-    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
-    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
-    rm /etc/apt/sources.list.d/intel-graphics.list && \
-    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
-    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
-    chmod 644 /usr/share/keyrings/intel-graphics.gpg
-
-RUN apt-get update  -y \
-&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
-
-COPY ./ /workspace/vllm
-
-WORKDIR /workspace/vllm
-
-RUN pip install -v -r requirements-xpu.txt
-
-RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
-
-CMD ["/bin/bash"]
--- a/README.md
+++ b/README.md
@ -16,17 +16,16 @@ Easy, fast, and cheap LLM serving for everyone

 ---

-**Ray Summit CPF is Open (June 4th to June 20th)!**
+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**

-There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
-If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
-This will be a great chance for everyone in the community to get together and learn.
-Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
+We are thrilled to announce our fourth vLLM Meetup!
+The vLLM team will share recent updates and roadmap.
+We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!

 ---

 *Latest News* 🔥
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
 - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
 - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
 - [2024/01] Added ROCm 6.0 support to vLLM.
@ -59,7 +58,7 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs
+- Support NVIDIA GPUs and AMD GPUs
 - (Experimental) Prefix caching support
 - (Experimental) Multi-lora support

@ -108,11 +107,9 @@ vLLM is a community project. Our compute resources for development and testing a
 - Replicate
 - Roblox
 - RunPod
- Sequoia Capital
 - Trainy
 - UC Berkeley
 - UC San Diego
- ZhenFund

 We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.

--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@ -4,13 +4,10 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import List, Optional

 import aiohttp
-import huggingface_hub.constants
 from tqdm.asyncio import tqdm
-from transformers import (AutoTokenizer, PreTrainedTokenizer,
-                          PreTrainedTokenizerFast)

 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

@ -71,13 +68,9 @@ async def async_request_tgi(
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue
-                        chunk_bytes = chunk_bytes.decode("utf-8")

-                        #NOTE: Sometimes TGI returns a ping response without
-                        # any data, we should skip it.
-                        if chunk_bytes.startswith(":"):
-                            continue
-                        chunk = remove_prefix(chunk_bytes, "data:")
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+                                              "data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
@ -225,8 +218,8 @@ async def async_request_openai_completions(
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
-        "completions"
-    ), "OpenAI Completions API URL must end with 'completions'."
+        "v1/completions"
+    ), "OpenAI Completions API URL must end with 'v1/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
@ -265,9 +258,6 @@ async def async_request_openai_completions(
                        else:
                            data = json.loads(chunk)

-                            # NOTE: Some completion API might have a last
-                            # usage summary response without a token so we
-                            # want to check a token was generated
                            if data["choices"][0]["text"]:
                                timestamp = time.perf_counter()
                                # First token
@ -276,8 +266,12 @@ async def async_request_openai_completions(
                                    output.ttft = ttft

                                # Decoding phase
-                                output.itl.append(timestamp -
-                                                  most_recent_timestamp)
+                                # NOTE: Some completion API might have a last
+                                # usage summary response without a token so we
+                                # do not want to include as inter-token-latency
+                                elif data.get("usage", None) is None:
+                                    output.itl.append(timestamp -
+                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += data["choices"][0]["text"]
@ -304,8 +298,8 @@ async def async_request_openai_chat_completions(
 ) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
-        "chat/completions"
-    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
+        "v1/chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
@ -390,30 +384,6 @@ def remove_prefix(text: str, prefix: str) -> str:
    return text


-def get_model(pretrained_model_name_or_path: str):
-    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
-        from modelscope import snapshot_download
-    else:
-        from huggingface_hub import snapshot_download
-
-    model_path = snapshot_download(
-        model_id=pretrained_model_name_or_path,
-        local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
-        ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
-    return model_path
-
-
-def get_tokenizer(
-    pretrained_model_name_or_path: str, trust_remote_code: bool
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    if pretrained_model_name_or_path is not None and not os.path.exists(
-            pretrained_model_name_or_path):
-        pretrained_model_name_or_path = get_model(
-            pretrained_model_name_or_path)
-    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
-                                         trust_remote_code=trust_remote_code)
-
-
 ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
@ -422,5 +392,4 @@ ASYNC_REQUEST_FUNCS = {
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
-    "scalellm": async_request_openai_completions,
 }
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -10,10 +10,8 @@ import torch
 from tqdm import tqdm

 from vllm import LLM, SamplingParams
-from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptStrictInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.utils import FlexibleArgumentParser


 def main(args: argparse.Namespace):
@ -21,33 +19,24 @@ def main(args: argparse.Namespace):

    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
-    llm = LLM(
-        model=args.model,
-        speculative_model=args.speculative_model,
-        num_speculative_tokens=args.num_speculative_tokens,
-        speculative_draft_tensor_parallel_size=\
-            args.speculative_draft_tensor_parallel_size,
-        tokenizer=args.tokenizer,
-        quantization=args.quantization,
-        tensor_parallel_size=args.tensor_parallel_size,
-        trust_remote_code=args.trust_remote_code,
-        dtype=args.dtype,
-        max_model_len=args.max_model_len,
-        enforce_eager=args.enforce_eager,
-        kv_cache_dtype=args.kv_cache_dtype,
-        quantization_param_path=args.quantization_param_path,
-        device=args.device,
-        ray_workers_use_nsight=args.ray_workers_use_nsight,
-        use_v2_block_manager=args.use_v2_block_manager,
-        enable_chunked_prefill=args.enable_chunked_prefill,
-        download_dir=args.download_dir,
-        block_size=args.block_size,
-        gpu_memory_utilization=args.gpu_memory_utilization,
-        load_format=args.load_format,
-        distributed_executor_backend=args.distributed_executor_backend,
-        otlp_traces_endpoint=args.otlp_traces_endpoint,
-        enable_prefix_caching=args.enable_prefix_caching,
-    )
+    llm = LLM(model=args.model,
+              speculative_model=args.speculative_model,
+              num_speculative_tokens=args.num_speculative_tokens,
+              tokenizer=args.tokenizer,
+              quantization=args.quantization,
+              tensor_parallel_size=args.tensor_parallel_size,
+              trust_remote_code=args.trust_remote_code,
+              dtype=args.dtype,
+              enforce_eager=args.enforce_eager,
+              kv_cache_dtype=args.kv_cache_dtype,
+              quantization_param_path=args.quantization_param_path,
+              device=args.device,
+              ray_workers_use_nsight=args.ray_workers_use_nsight,
+              use_v2_block_manager=args.use_v2_block_manager,
+              enable_chunked_prefill=args.enable_chunked_prefill,
+              download_dir=args.download_dir,
+              block_size=args.block_size,
+              gpu_memory_utilization=args.gpu_memory_utilization)

    sampling_params = SamplingParams(
        n=args.n,
@ -106,7 +95,7 @@ def main(args: argparse.Namespace):
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
-    percentages = [10, 25, 50, 75, 90, 99]
+    percentages = [10, 25, 50, 75, 90]
    percentiles = np.percentile(latencies, percentages)
    print(f'Avg latency: {np.mean(latencies)} seconds')
    for percentage, percentile in zip(percentages, percentiles):
@ -124,16 +113,12 @@ def main(args: argparse.Namespace):


 if __name__ == '__main__':
-    parser = FlexibleArgumentParser(
+    parser = argparse.ArgumentParser(
        description='Benchmark the latency of processing a single batch of '
        'requests till completion.')
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
    parser.add_argument('--speculative-model', type=str, default=None)
    parser.add_argument('--num-speculative-tokens', type=int, default=None)
-    parser.add_argument('--speculative-draft-tensor-parallel-size',
-                        '-spec-draft-tp',
-                        type=int,
-                        default=None)
    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
@ -159,12 +144,6 @@ if __name__ == '__main__':
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
-    parser.add_argument(
-        '--max-model-len',
-        type=int,
-        default=None,
-        help='Maximum length of a sequence (including prompt and output). '
-        'If None, will be derived from the model.')
    parser.add_argument(
        '--dtype',
        type=str,
@ -208,10 +187,9 @@ if __name__ == '__main__':
    parser.add_argument(
        "--device",
        type=str,
-        default="auto",
-        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
-        'CPU.')
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help='device type for vLLM execution, supporting CUDA and CPU.')
    parser.add_argument('--block-size',
                        type=int,
                        default=16,
@ -221,9 +199,6 @@ if __name__ == '__main__':
        action='store_true',
        help='If True, the prefill requests can be chunked based on the '
        'max_num_batched_tokens')
-    parser.add_argument("--enable-prefix-caching",
-                        action='store_true',
-                        help="Enable automatic prefix caching")
    parser.add_argument('--use-v2-block-manager', action='store_true')
    parser.add_argument(
        "--ray-workers-use-nsight",
@ -246,40 +221,5 @@ if __name__ == '__main__':
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1.'
                        'If unspecified, will use the default value of 0.9.')
-    parser.add_argument(
-        '--load-format',
-        type=str,
-        default=EngineArgs.load_format,
-        choices=[
-            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
-            'bitsandbytes'
-        ],
-        help='The format of the model weights to load.\n\n'
-        '* "auto" will try to load the weights in the safetensors format '
-        'and fall back to the pytorch bin format if safetensors format '
-        'is not available.\n'
-        '* "pt" will load the weights in the pytorch bin format.\n'
-        '* "safetensors" will load the weights in the safetensors format.\n'
-        '* "npcache" will load the weights in pytorch format and store '
-        'a numpy cache to speed up the loading.\n'
-        '* "dummy" will initialize the weights with random values, '
-        'which is mainly for profiling.\n'
-        '* "tensorizer" will load the weights using tensorizer from '
-        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
-        'section for more information.\n'
-        '* "bitsandbytes" will load the weights using bitsandbytes '
-        'quantization.\n')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
-    parser.add_argument(
-        '--otlp-traces-endpoint',
-        type=str,
-        default=None,
-        help='Target URL to which OpenTelemetry traces will be sent.')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@ -1,7 +1,7 @@
+import argparse
 import time

 from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser

 PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501

@ -44,7 +44,7 @@ def main(args):


 if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
+    parser = argparse.ArgumentParser(
        description='Benchmark the performance with or without automatic '
        'prefix caching.')
    parser.add_argument('--model',
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -31,7 +31,7 @@ import time
 import warnings
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
+from typing import AsyncGenerator, List, Optional, Tuple

 import numpy as np
 from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@ -39,15 +39,7 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
 from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase

-try:
-    from vllm.transformers_utils.tokenizer import get_tokenizer
-except ImportError:
-    from backend_request_func import get_tokenizer
-
-try:
-    from vllm.utils import FlexibleArgumentParser
-except ImportError:
-    from argparse import ArgumentParser as FlexibleArgumentParser
+from vllm.transformers_utils.tokenizer import get_tokenizer


@dataclass
@ -64,9 +56,6 @@ class BenchmarkMetrics:
    mean_tpot_ms: float
    median_tpot_ms: float
    p99_tpot_ms: float
-    mean_itl_ms: float
-    median_itl_ms: float
-    p99_itl_ms: float


 def sample_sharegpt_requests(
@ -208,27 +197,19 @@ def calculate_metrics(
    dur_s: float,
    tokenizer: PreTrainedTokenizerBase,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens: List[int] = []
+    actual_output_lens = []
    total_input = 0
    completed = 0
-    itls: List[float] = []
-    tpots: List[float] = []
-    ttfts: List[float] = []
+    tpots = []
+    ttfts = []
    for i in range(len(outputs)):
        if outputs[i].success:
-            # We use the tokenizer to count the number of output tokens for all
-            # serving backends instead of looking at len(outputs[i].itl) since
-            # multiple output tokens may be bundled together
-            # Note: this may inflate the output token count slightly
-            output_len = len(
-                tokenizer(outputs[i].generated_text,
-                          add_special_tokens=False).input_ids)
+            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
            actual_output_lens.append(output_len)
            total_input += input_requests[i][1]
            if output_len > 1:
                tpots.append(
                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
-            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
            completed += 1
        else:
@ -253,9 +234,6 @@ def calculate_metrics(
        mean_tpot_ms=np.mean(tpots or 0) * 1000,
        median_tpot_ms=np.median(tpots or 0) * 1000,
        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
-        mean_itl_ms=np.mean(itls or 0) * 1000,
-        median_itl_ms=np.median(itls or 0) * 1000,
-        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
    )

    return metrics, actual_output_lens
@ -273,7 +251,7 @@ async def benchmark(
    disable_tqdm: bool,
 ):
    if backend in ASYNC_REQUEST_FUNCS:
-        request_func = ASYNC_REQUEST_FUNCS[backend]
+        request_func = ASYNC_REQUEST_FUNCS.get(backend)
    else:
        raise ValueError(f"Unknown backend: {backend}")

@ -300,7 +278,7 @@ async def benchmark(
    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    benchmark_start_time = time.perf_counter()
-    tasks: List[asyncio.Task] = []
+    tasks = []
    async for request in get_request(input_requests, request_rate):
        prompt, prompt_len, output_len = request
        request_func_input = RequestFuncInput(
@ -318,7 +296,7 @@ async def benchmark(
                             pbar=pbar)))
    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)

-    if pbar is not None:
+    if not disable_tqdm:
        pbar.close()

    benchmark_duration = time.perf_counter() - benchmark_start_time
@ -355,10 +333,6 @@ async def benchmark(
    print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
                                    metrics.median_tpot_ms))
    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
-    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
-    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
-    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
-    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
    print("=" * 50)

    result = {
@ -375,9 +349,6 @@ async def benchmark(
        "mean_tpot_ms": metrics.mean_tpot_ms,
        "median_tpot_ms": metrics.median_tpot_ms,
        "p99_tpot_ms": metrics.p99_tpot_ms,
-        "mean_itl_ms": metrics.mean_itl_ms,
-        "median_itl_ms": metrics.median_itl_ms,
-        "p99_itl_ms": metrics.p99_itl_ms,
        "input_lens": [output.prompt_len for output in outputs],
        "output_lens": actual_output_lens,
        "ttfts": [output.ttft for output in outputs],
@ -474,7 +445,7 @@ def main(args: argparse.Namespace):

    # Save config and results to json
    if args.save_result:
-        result_json: Dict[str, Any] = {}
+        result_json = {}

        # Setup
        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@ -507,8 +478,6 @@ def main(args: argparse.Namespace):
        # Save to file
        base_model_id = model_id.split("/")[-1]
        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
-        if args.result_filename:
-            file_name = args.result_filename
        if args.result_dir:
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, "w") as outfile:
@ -516,7 +485,7 @@ def main(args: argparse.Namespace):


 if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
+    parser = argparse.ArgumentParser(
        description="Benchmark the online serving throughput.")
    parser.add_argument(
        "--backend",
@ -649,15 +618,6 @@ if __name__ == "__main__":
        help="Specify directory to save benchmark json results."
        "If not specified, results are saved in the current directory.",
    )
-    parser.add_argument(
-        "--result-filename",
-        type=str,
-        default=None,
-        help="Specify the filename to save benchmark json results."
-        "If not specified, results will be saved in "
-        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
-        " format.",
-    )

    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -10,9 +10,7 @@ from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)

-from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.utils import FlexibleArgumentParser


 def sample_requests(
@ -80,10 +78,8 @@ def run_vllm(
    enable_prefix_caching: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
-    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    download_dir: Optional[str] = None,
-    load_format: str = EngineArgs.load_format,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(
@ -104,13 +100,11 @@ def run_vllm(
        download_dir=download_dir,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
-        load_format=load_format,
    )

    # Add the requests to the engine.
-    prompts: List[str] = []
-    sampling_params: List[SamplingParams] = []
+    prompts = []
+    sampling_params = []
    for prompt, _, output_len in requests:
        prompts.append(prompt)
        sampling_params.append(
@ -231,8 +225,8 @@ def main(args: argparse.Namespace):
            args.enforce_eager, args.kv_cache_dtype,
            args.quantization_param_path, args.device,
            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.distributed_executor_backend,
-            args.gpu_memory_utilization, args.download_dir, args.load_format)
+            args.max_num_batched_tokens, args.gpu_memory_utilization,
+            args.download_dir)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -262,7 +256,7 @@ def main(args: argparse.Namespace):


 if __name__ == "__main__":
-    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
+    parser = argparse.ArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf", "mii"],
@ -349,10 +343,9 @@ if __name__ == "__main__":
    parser.add_argument(
        "--device",
        type=str,
-        default="auto",
-        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
-        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
-        'CPU.')
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help='device type for vLLM execution, supporting CUDA and CPU.')
    parser.add_argument(
        "--enable-prefix-caching",
        action='store_true',
@ -375,36 +368,6 @@ if __name__ == "__main__":
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
-    parser.add_argument(
-        '--distributed-executor-backend',
-        choices=['ray', 'mp'],
-        default=None,
-        help='Backend to use for distributed serving. When more than 1 GPU '
-        'is used, will be automatically set to "ray" if installed '
-        'or "mp" (multiprocessing) otherwise.')
-    parser.add_argument(
-        '--load-format',
-        type=str,
-        default=EngineArgs.load_format,
-        choices=[
-            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
-            'bitsandbytes'
-        ],
-        help='The format of the model weights to load.\n\n'
-        '* "auto" will try to load the weights in the safetensors format '
-        'and fall back to the pytorch bin format if safetensors format '
-        'is not available.\n'
-        '* "pt" will load the weights in the pytorch bin format.\n'
-        '* "safetensors" will load the weights in the safetensors format.\n'
-        '* "npcache" will load the weights in pytorch format and store '
-        'a numpy cache to speed up the loading.\n'
-        '* "dummy" will initialize the weights with random values, '
-        'which is mainly for profiling.\n'
-        '* "tensorizer" will load the weights using tensorizer from '
-        'CoreWeave. See the Tensorize vLLM Model script in the Examples'
-        'section for more information.\n'
-        '* "bitsandbytes" will load the weights using bitsandbytes '
-        'quantization.\n')
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
--- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@ -11,7 +11,6 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@ -47,7 +46,7 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
 # impl


-def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                    scale_b: torch.tensor,
                    out_dtype: torch.dtype) -> torch.tensor:
    return torch.mm(a, b)
@ -77,7 +76,11 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
 def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
                 scale_b: torch.tensor,
                 out_dtype: torch.dtype) -> torch.tensor:
-    return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
+    return ops.cutlass_scaled_mm_dq(a,
+                                    b,
+                                    scale_a,
+                                    scale_b,
+                                    out_dtype=out_dtype)


 # bench
@ -116,13 +119,14 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
    timers.append(
        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
-                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
+                 torch.bfloat16, label, sub_label, pytorch_i8_impl,
                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))

    # cutlass impl
    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
-                 cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
+        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
+                 torch.bfloat16, label, sub_label, cutlass_impl,
+                 "cutlass_i8_i8_bf16_scaled_mm"))

    return timers

@ -136,13 +140,6 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,

    timers = []

-    # pytorch impl w. bf16
-    timers.append(
-        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
-                 torch.bfloat16, label, sub_label, pytorch_mm_impl,
-                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))
-
    # pytorch impl: bf16 output, without fp8 fast accum
    timers.append(
        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
@ -167,12 +164,14 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,

    # cutlass impl: bf16 output
    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
-                 cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
+        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
+                 torch.bfloat16, label, sub_label, cutlass_impl,
+                 "cutlass_fp8_fp8_bf16_scaled_mm"))
    # cutlass impl: fp16 output
    timers.append(
-        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
-                 cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
+        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
+                 torch.float16, label, sub_label, cutlass_impl,
+                 "cutlass_fp8_fp8_fp16_scaled_mm"))
    return timers


@ -294,7 +293,7 @@ if __name__ == '__main__':
            return torch.float8_e4m3fn
        raise ValueError("unsupported dtype")

-    parser = FlexibleArgumentParser(
+    parser = argparse.ArgumentParser(
        description="""
 Benchmark Cutlass GEMM.

--- a/benchmarks/cutlass_benchmarks/weight_shapes.py
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@ -22,12 +22,6 @@ WEIGHT_SHAPES = {
        ([4096, 22016], 1),
        ([11008, 4096], 0),
    ],
-    "meta-llama/Llama-3-8b": [
-        ([4096, 6144], 1),
-        ([4096, 4096], 0),
-        ([4096, 28672], 1),
-        ([14336, 4096], 0),
-    ],
    "meta-llama/Llama-2-13b-hf": [
        ([5120, 15360], 1),
        ([5120, 5120], 0),
--- a/benchmarks/kernels/benchmark_aqlm.py
+++ b/benchmarks/kernels/benchmark_aqlm.py
@ -1,3 +1,4 @@
+import argparse
 import os
 import sys
 from typing import Optional
@ -9,7 +10,6 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.aqlm import (
    dequantize_weight, generic_dequantize_gemm, get_int_dtype,
    optimized_dequantize_gemm)
-from vllm.utils import FlexibleArgumentParser

 os.environ['CUDA_VISIBLE_DEVICES'] = '0'

@ -86,9 +86,9 @@ def dequant_no_scale(
 # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
 # the generic pytorch version.
 # Just visual comparison.
-def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
+def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:

-    n = int(parts.sum().item())
+    n = parts.sum().item()

    device = torch.device('cuda:0')

@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:

 def main():

-    parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
+    parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")

    # Add arguments
    parser.add_argument("--nbooks",
@ -204,7 +204,7 @@ def main():
        sys.stdout = sys.__stdout__


-def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
+def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
             methods):

    # I didn't see visible improvements from increasing these, but feel free :)
@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
    print('')


-def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
+def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
               nbooks: int, bits: int, method) -> float:

-    n = int(parts.sum().item())
+    n = parts.sum().item()

    device = torch.device('cuda:0')

--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@ -1,4 +1,4 @@
-from typing import List
+import argparse

 import torch
 import torch.utils.benchmark as benchmark
@ -15,7 +15,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
    MarlinWorkspace, marlin_24_quantize, marlin_quantize)
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
    gptq_pack, quantize_weights, sort_weights)
-from vllm.utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@ -24,9 +23,8 @@ ACT_ORDER_OPTS = [False, True]
 K_FULL_OPTS = [False, True]


-def bench_run(results: List[benchmark.Measurement], model: str,
-              act_order: bool, is_k_full: bool, num_bits: int, group_size: int,
-              size_m: int, size_k: int, size_n: int):
+def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
+              size_m, size_k, size_n):
    label = "Quant Matmul"

    sub_label = ("{}, act={} k_full={}, b={}, g={}, "
@ -158,7 +156,7 @@ def main(args):
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")

-    results: List[benchmark.Measurement] = []
+    results = []

    for model in args.models:
        for layer in WEIGHT_SHAPES[model]:
@ -211,7 +209,7 @@ def main(args):
 #   python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
 #
 if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
+    parser = argparse.ArgumentParser(
        description="Benchmark Marlin across specified models/shapes/batches")
    parser.add_argument(
        "--models",
--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ b/benchmarks/kernels/benchmark_mixtral_moe.py
@ -0,0 +1,239 @@
+import argparse
+import json
+import os
+import sys
+
+import torch
+import torch.nn.functional as F
+import triton
+from tqdm import tqdm
+
+from vllm.model_executor.layers.fused_moe import (fused_moe,
+                                                  get_config_file_name)
+
+
+def main(model, tp_size, gpu, dtype: str):
+    """Run the fused-MoE config search once per batch size in a fixed sweep.
+
+    Args:
+        model: Mixtral variant name, forwarded to ``run_grid``.
+        tp_size: Tensor-parallel degree, forwarded to ``run_grid``.
+        gpu: GPU index; exported through ``CUDA_VISIBLE_DEVICES``.
+        dtype: Kernel dtype string, forwarded to ``run_grid``.
+    """
+    # Export the requested GPU index before the sweep starts.
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
+
+    batch_sizes = (1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024,
+                   1536, 2048, 3072, 4096)
+    for batch_size in batch_sizes:
+        run_grid(batch_size,
+                 model=model,
+                 method=fused_moe,
+                 gpu=gpu,
+                 tp_size=tp_size,
+                 dtype=dtype)
+
+
+def run_grid(bs, model, method, gpu, tp_size, dtype: str):
+    """Benchmark every candidate launch config for one batch size.
+
+    The fastest config is stored under ``str(bs)`` in the JSON file named by
+    ``get_config_file_name``; entries already in that file are kept.
+
+    Args:
+        bs: Batch size handed to ``run_timing`` and used as the JSON key.
+        model: ``'8x7B'`` or ``'8x22B'``; selects the layer dimensions.
+        method: Callable under test, forwarded to ``run_timing``.
+        gpu: Accepted for interface compatibility; not read in this function.
+        tp_size: Divisor applied to the intermediate size for the file name;
+            also forwarded to ``run_timing``.
+        dtype: ``"float8"`` selects the float8 config file; forwarded to
+            ``run_timing`` as well.
+
+    Raises:
+        ValueError: If ``model`` is not a supported Mixtral variant.
+    """
+    # model name -> (d_model, model_intermediate_size, num_layers)
+    model_shapes = {
+        '8x7B': (4096, 14336, 32),
+        '8x22B': (6144, 16384, 56),
+    }
+    if model not in model_shapes:
+        raise ValueError(f'Unsupported Mixtral model {model}')
+    d_model, model_intermediate_size, num_layers = model_shapes[model]
+
+    num_total_experts = 8
+    top_k = 2
+    num_calls = 100
+
+    num_warmup_trials = 1
+    num_trials = 1
+
+    # Full cartesian grid; the `for` order matches the original nested loops
+    # so configs are visited in the same sequence.
+    configs = [{
+        "BLOCK_SIZE_M": block_size_m,
+        "BLOCK_SIZE_N": block_size_n,
+        "BLOCK_SIZE_K": block_size_k,
+        "GROUP_SIZE_M": group_size_m,
+        "num_warps": num_warps,
+        "num_stages": num_stages,
+    } for block_size_n in [32, 64, 128, 256]
+               for block_size_m in [16, 32, 64, 128, 256]
+               for block_size_k in [64, 128, 256]
+               for group_size_m in [1, 16, 32, 64]
+               for num_warps in [4, 8]
+               for num_stages in [2, 3, 4, 5]]
+
+    # Arguments shared by the warmup and the measured runs; only `config`
+    # changes from one iteration to the next.
+    timing_kwargs = dict(
+        num_calls=num_calls,
+        bs=bs,
+        d_model=d_model,
+        num_total_experts=num_total_experts,
+        top_k=top_k,
+        tp_size=tp_size,
+        model_intermediate_size=model_intermediate_size,
+        method=method,
+        dtype=dtype,
+    )
+
+    best_config = None
+    best_time_us = 1e20
+
+    print(f'{tp_size=} {bs=}')
+
+    for config in tqdm(configs):
+        # warmup: configs that exceed the device's resources are skipped
+        try:
+            for _ in range(num_warmup_trials):
+                run_timing(config=config, **timing_kwargs)
+        except triton.runtime.autotuner.OutOfResources:
+            continue
+
+        # trial
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(config=config, **timing_kwargs)
+
+            kernel_dur_us = 1000 * kernel_dur_ms
+            model_dur_ms = kernel_dur_ms * num_layers
+
+            if kernel_dur_us < best_time_us:
+                best_config = config
+                best_time_us = kernel_dur_us
+
+                tqdm.write(
+                    f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
+                    f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
+                    f'{d_model=} {model_intermediate_size=} {num_layers=}')
+
+    print("best_time_us", best_time_us)
+    print("best_config", best_config)
+
+    if best_config is None:
+        # Every candidate was skipped during warmup; do not overwrite the
+        # stored entry for this batch size with a null config.
+        print(f"no config could be benchmarked for {bs=}; "
+              "leaving the config file untouched")
+        return
+
+    # holds Dict[str, Dict[str, int]]
+    filename = get_config_file_name(num_total_experts,
+                                    model_intermediate_size // tp_size,
+                                    "float8" if dtype == "float8" else None)
+    print(f"writing config to file {filename}")
+    existing_content = {}
+    if os.path.exists(filename):
+        with open(filename, "r") as f:
+            existing_content = json.load(f)
+    existing_content[str(bs)] = best_config
+    with open(filename, "w") as f:
+        json.dump(existing_content, f, indent=4)
+        f.write("\n")
+
+
+def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
+               top_k: int, tp_size: int, model_intermediate_size: int, method,
+               config, dtype: str) -> float:
+    """Time ``num_calls`` back-to-back invocations of ``method`` on random data.
+
+    Args:
+        num_calls: Number of timed invocations; also the number of gating
+            tensors generated (one per call).
+        bs: Number of rows in the hidden-state input.
+        d_model: Hidden size of the input.
+        num_total_experts: Leading dimension of both weight tensors.
+        top_k: Forwarded to ``method`` as ``topk``.
+        tp_size: Divides ``model_intermediate_size`` to get the shard size.
+        model_intermediate_size: Unsharded intermediate size.
+        method: Callable under test; invoked with keyword arguments only.
+        config: Forwarded to ``method`` as ``override_config``.
+        dtype: ``"float8"`` casts the weights to ``float8_e4m3fn`` and
+            supplies all-ones scale tensors; any other value leaves the
+            float16 weights and ``None`` scales in place.
+
+    Returns:
+        Elapsed time between the two CUDA events divided by ``num_calls``.
+    """
+    shard_intermediate_size = model_intermediate_size // tp_size
+
+    hidden_states = torch.rand(
+        (bs, d_model),
+        device="cuda:0",
+        dtype=torch.float16,
+    )
+
+    w1 = torch.rand(
+        (num_total_experts, 2 * shard_intermediate_size, d_model),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w2 = torch.rand(
+        (num_total_experts, d_model, shard_intermediate_size),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+
+    if dtype == "float8":
+        w1 = w1.to(torch.float8_e4m3fn)
+        w2 = w2.to(torch.float8_e4m3fn)
+        w1_scale = torch.ones(num_total_experts,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        w2_scale = torch.ones(num_total_experts,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        a1_scale = torch.ones(1,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+        a2_scale = torch.ones(1,
+                              device=hidden_states.device,
+                              dtype=torch.float32)
+
+    # One independent gating distribution per timed call.
+    gating_output = F.softmax(torch.rand(
+        (num_calls, bs, num_total_experts),
+        device=hidden_states.device,
+        dtype=torch.float32,
+    ),
+                              dim=-1)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+    for i in range(num_calls):
+        # hidden_states is re-bound to the returned tensor, so each call
+        # consumes the previous call's output.
+        hidden_states = method(
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            gating_output=gating_output[i],
+            # Use the caller's top_k instead of a hard-coded 2.
+            topk=top_k,
+            renormalize=True,
+            inplace=True,
+            override_config=config,
+            use_fp8=dtype == "float8",
+        )
+    end_event.record()
+    # Wait for the end event so elapsed_time() sees a completed interval.
+    end_event.synchronize()
+
+    dur_ms = start_event.elapsed_time(end_event) / num_calls
+    return dur_ms
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the tuning options and run the sweep.
+    parser = argparse.ArgumentParser(
+        prog='benchmark_mixtral_moe',
+        description='Benchmark and tune the fused_moe kernel',
+    )
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        # The default must be one of `choices`. Only "float8" is treated
+        # specially downstream, so 'float16' keeps the previous default
+        # behaviour while being a value the user could also pass explicitly.
+        default='float16',
+        choices=['float8', 'float16'],
+        help='Data type used for fused_moe kernel computations',
+    )
+    parser.add_argument('--model',
+                        type=str,
+                        default='8x7B',
+                        choices=['8x7B', '8x22B'],
+                        help='The Mixtral model to benchmark')
+    parser.add_argument('--tp-size',
+                        type=int,
+                        default=2,
+                        help='Tensor parallel size')
+    parser.add_argument('--gpu',
+                        type=int,
+                        default=0,
+                        help="GPU ID for benchmarking")
+    args = parser.parse_args()
+    # main() has no return statement, so sys.exit receives None.
+    sys.exit(main(args.model, args.tp_size, args.gpu, args.dtype))
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -1,333 +0,0 @@
-import argparse
-import time
-from datetime import datetime
-from typing import Any, Dict, List, Tuple, TypedDict
-
-import ray
-import torch
-import triton
-from ray.experimental.tqdm_ray import tqdm
-from transformers import AutoConfig
-
-from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser
-
-
-class BenchmarkConfig(TypedDict):
-    BLOCK_SIZE_M: int
-    BLOCK_SIZE_N: int
-    BLOCK_SIZE_K: int
-    GROUP_SIZE_M: int
-    num_warps: int
-    num_stages: int
-
-
-def benchmark_config(
-    config: BenchmarkConfig,
-    num_tokens: int,
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8: bool,
-    num_iters: int = 100,
-) -> float:
-    init_dtype = torch.float16 if use_fp8 else dtype
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
-    w1 = torch.randn(num_experts,
-                     shard_intermediate_size,
-                     hidden_size,
-                     dtype=init_dtype)
-    w2 = torch.randn(num_experts,
-                     hidden_size,
-                     shard_intermediate_size // 2,
-                     dtype=init_dtype)
-    gating_output = torch.randn(num_iters,
-                                num_tokens,
-                                num_experts,
-                                dtype=torch.float32)
-
-    w1_scale = None
-    w2_scale = None
-    a1_scale = None
-    a2_scale = None
-    if use_fp8:
-        w1_scale = torch.randn(num_experts, dtype=torch.float32)
-        w2_scale = torch.randn(num_experts, dtype=torch.float32)
-        a1_scale = torch.randn(1, dtype=torch.float32)
-        a2_scale = torch.randn(1, dtype=torch.float32)
-
-        w1 = w1.to(torch.float8_e4m3fn)
-        w2 = w2.to(torch.float8_e4m3fn)
-
-    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
-
-    def prepare(i: int):
-        input_gating.copy_(gating_output[i])
-
-    def run():
-        fused_moe(
-            x,
-            w1,
-            w2,
-            input_gating,
-            topk,
-            renormalize=True,
-            inplace=True,
-            override_config=config,
-            use_fp8=use_fp8,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-        )
-
-    # JIT compilation & warmup
-    run()
-    torch.cuda.synchronize()
-
-    # Capture 10 invocations with CUDA graph
-    graph = torch.cuda.CUDAGraph()
-    with torch.cuda.graph(graph):
-        for _ in range(10):
-            run()
-    torch.cuda.synchronize()
-
-    # Warmup
-    for _ in range(5):
-        graph.replay()
-    torch.cuda.synchronize()
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    latencies: List[float] = []
-    for i in range(num_iters):
-        prepare(i)
-        torch.cuda.synchronize()
-
-        start_event.record()
-        graph.replay()
-        end_event.record()
-        end_event.synchronize()
-        latencies.append(start_event.elapsed_time(end_event))
-    avg = sum(latencies) / (num_iters * 10) * 1000  # us
-    graph.reset()
-    return avg
-
-
-def get_configs_compute_bound() -> List[Dict[str, int]]:
-    # Reduced search space for faster tuning.
-    # TODO(woosuk): Increase the search space and use a performance model to
-    # prune the search space.
-    configs: List[BenchmarkConfig] = []
-    for num_stages in [2, 3, 4, 5]:
-        for block_m in [16, 32, 64, 128, 256]:
-            for block_k in [64, 128, 256]:
-                for block_n in [32, 64, 128, 256]:
-                    for num_warps in [4, 8]:
-                        for group_size in [1, 16, 32, 64]:
-                            configs.append({
-                                "BLOCK_SIZE_M": block_m,
-                                "BLOCK_SIZE_N": block_n,
-                                "BLOCK_SIZE_K": block_k,
-                                "GROUP_SIZE_M": group_size,
-                                "num_warps": num_warps,
-                                "num_stages": num_stages,
-                            })
-    return configs
-
-
-@ray.remote(num_gpus=1)
-class BenchmarkWorker:
-
-    def __init__(self, seed: int) -> None:
-        torch.set_default_device("cuda")
-        torch.cuda.manual_seed_all(seed)
-        self.seed = seed
-
-    def benchmark(
-        self,
-        num_tokens: int,
-        num_experts: int,
-        shard_intermediate_size: int,
-        hidden_size: int,
-        topk: int,
-        dtype: torch.dtype,
-        use_fp8: bool,
-    ) -> Tuple[Dict[str, int], float]:
-        torch.cuda.manual_seed_all(self.seed)
-
-        dtype_str = "float8" if use_fp8 else None
-        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
-        # is the intermediate size after silu_and_mul.
-        op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
-                                    dtype_str)
-        if op_config is None:
-            config = get_default_config(num_tokens, num_experts,
-                                        shard_intermediate_size, hidden_size,
-                                        topk, dtype_str)
-        else:
-            config = op_config[min(op_config.keys(),
-                                   key=lambda x: abs(x - num_tokens))]
-        kernel_time = benchmark_config(config, num_tokens, num_experts,
-                                       shard_intermediate_size, hidden_size,
-                                       topk, dtype, use_fp8)
-        return config, kernel_time
-
-    def tune(
-        self,
-        num_tokens: int,
-        num_experts: int,
-        shard_intermediate_size: int,
-        hidden_size: int,
-        topk: int,
-        dtype: torch.dtype,
-        use_fp8: bool,
-        search_space: List[BenchmarkConfig],
-    ) -> BenchmarkConfig:
-        best_config = None
-        best_time = float("inf")
-        for config in tqdm(search_space):
-            try:
-                kernel_time = benchmark_config(config,
-                                               num_tokens,
-                                               num_experts,
-                                               shard_intermediate_size,
-                                               hidden_size,
-                                               topk,
-                                               dtype,
-                                               use_fp8,
-                                               num_iters=10)
-            except triton.runtime.autotuner.OutOfResources:
-                # Some configurations may be invalid and fail to compile.
-                continue
-
-            if kernel_time < best_time:
-                best_time = kernel_time
-                best_config = config
-        now = datetime.now()
-        print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
-        assert best_config is not None
-        return best_config
-
-
-def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
-    return {
-        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
-        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
-        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
-        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
-        "num_warps": config["num_warps"],
-        "num_stages": config["num_stages"],
-    }
-
-
-def save_configs(
-    configs: Dict[int, BenchmarkConfig],
-    num_experts: int,
-    shard_intermediate_size: int,
-    hidden_size: int,
-    topk: int,
-    dtype: torch.dtype,
-    use_fp8: bool,
-) -> None:
-    dtype_str = "float8" if use_fp8 else None
-    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
-    # is the intermediate size after silu_and_mul.
-    filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
-                                    dtype_str)
-    print(f"Writing best config to {filename}...")
-    with open(filename, "w") as f:
-        json.dump(configs, f, indent=4)
-        f.write("\n")
-
-
-def main(args: argparse.Namespace):
-    print(args)
-
-    config = AutoConfig.from_pretrained(args.model)
-    if config.architectures[0] == "DbrxForCausalLM":
-        E = config.ffn_config.moe_num_experts
-        topk = config.ffn_config.moe_top_k
-        intermediate_size = config.ffn_config.ffn_hidden_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    else:
-        # Default: Mixtral.
-        E = config.num_local_experts
-        topk = config.num_experts_per_tok
-        intermediate_size = config.intermediate_size
-        shard_intermediate_size = 2 * intermediate_size // args.tp_size
-
-    hidden_size = config.hidden_size
-    dtype = config.torch_dtype
-    use_fp8 = args.dtype == "fp8"
-
-    if args.batch_size is None:
-        batch_sizes = [
-            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
-            2048, 3072, 4096
-        ]
-    else:
-        batch_sizes = [args.batch_size]
-
-    ray.init()
-    num_gpus = int(ray.available_resources()["GPU"])
-    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
-
-    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
-        outputs = []
-        worker_idx = 0
-        for input_args in inputs:
-            worker = workers[worker_idx]
-            worker_method = getattr(worker, method)
-            output = worker_method.remote(*input_args)
-            outputs.append(output)
-            worker_idx = (worker_idx + 1) % num_gpus
-        return ray.get(outputs)
-
-    if args.tune:
-        search_space = get_configs_compute_bound()
-        print(f"Start tuning over {len(search_space)} configurations...")
-
-        start = time.time()
-        configs = _distribute(
-            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
-                      topk, dtype, use_fp8, search_space)
-                     for batch_size in batch_sizes])
-        best_configs = {
-            M: sort_config(config)
-            for M, config in zip(batch_sizes, configs)
-        }
-        save_configs(best_configs, E, shard_intermediate_size, hidden_size,
-                     topk, dtype, use_fp8)
-        end = time.time()
-        print(f"Tuning took {end - start:.2f} seconds")
-    else:
-        outputs = _distribute("benchmark",
-                              [(batch_size, E, shard_intermediate_size,
-                                hidden_size, topk, dtype, use_fp8)
-                               for batch_size in batch_sizes])
-
-        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
-            print(f"Batch size: {batch_size}, config: {config}")
-            print(f"Kernel time: {kernel_time:.2f} us")
-
-
-if __name__ == "__main__":
-    parser = FlexibleArgumentParser()
-    parser.add_argument("--model",
-                        type=str,
-                        default="mistralai/Mixtral-8x7B-Instruct-v0.1")
-    parser.add_argument("--tp-size", "-tp", type=int, default=2)
-    parser.add_argument("--dtype",
-                        type=str,
-                        choices=["auto", "fp8"],
-                        default="auto")
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--batch-size", type=int, required=False)
-    parser.add_argument("--tune", action="store_true")
-    args = parser.parse_args()
-
-    main(args)
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -1,12 +1,12 @@
+import argparse
 import random
 import time
-from typing import List, Optional
+from typing import Optional

 import torch

 from vllm import _custom_ops as ops
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
-                        create_kv_caches_with_random)
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random

 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@ -54,17 +54,14 @@ def main(

    # Create the block tables.
    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
-    block_tables_lst: List[List[int]] = []
+    block_tables = []
    for _ in range(num_seqs):
        block_table = [
            random.randint(0, NUM_BLOCKS - 1)
            for _ in range(max_num_blocks_per_seq)
        ]
-        block_tables_lst.append(block_table)
-
-    block_tables = torch.tensor(block_tables_lst,
-                                dtype=torch.int,
-                                device=device)
+        block_tables.append(block_table)
+    block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)

    # Create the KV cache.
    key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
@ -161,14 +158,14 @@ def main(


 if __name__ == '__main__':
-    parser = FlexibleArgumentParser(
+    parser = argparse.ArgumentParser(
        description="Benchmark the paged attention kernel.")
    parser.add_argument("--version",
                        type=str,
                        choices=["v1", "v2"],
                        default="v2")
    parser.add_argument("--batch-size", type=int, default=8)
-    parser.add_argument("--seq-len", type=int, default=4096)
+    parser.add_argument("--seq_len", type=int, default=4096)
    parser.add_argument("--num-query-heads", type=int, default=64)
    parser.add_argument("--num-kv-heads", type=int, default=8)
    parser.add_argument("--head-size",
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@ -1,12 +1,11 @@
+import argparse
 from itertools import accumulate
-from typing import List, Optional
+from typing import Optional

 import nvtx
 import torch

-from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
-                                                         get_rope)
-from vllm.utils import FlexibleArgumentParser
+from vllm.model_executor.layers.rotary_embedding import get_rope


 def benchmark_rope_kernels_multi_lora(
@ -38,7 +37,7 @@ def benchmark_rope_kernels_multi_lora(
                            })
    # non-batched RoPE takes only one scaling factor, we create multiple
    # instances to simulate the same behavior
-    non_batched_ropes: List[RotaryEmbedding] = []
+    non_batched_ropes = []
    for scaling_factor in scaling_factors:
        non_batched_ropes.append(
            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
@ -86,7 +85,7 @@ def benchmark_rope_kernels_multi_lora(


 if __name__ == '__main__':
-    parser = FlexibleArgumentParser(
+    parser = argparse.ArgumentParser(
        description="Benchmark the rotary embedding kernels.")
    parser.add_argument("--is-neox-style", type=bool, default=True)
    parser.add_argument("--batch-size", type=int, default=16)
--- a/benchmarks/overheads/benchmark_hashing.py
+++ b/benchmarks/overheads/benchmark_hashing.py
@ -1,8 +1,8 @@
+import argparse
 import cProfile
 import pstats

 from vllm import LLM, SamplingParams
-from vllm.utils import FlexibleArgumentParser

 # A very long prompt, total number of tokens is about 15k.
 LONG_PROMPT = ["You are an expert in large language models, aren't you?"
@ -47,7 +47,7 @@ def main(args):


 if __name__ == "__main__":
-    parser = FlexibleArgumentParser(
+    parser = argparse.ArgumentParser(
        description='Benchmark the performance of hashing function in'
        'automatic prefix caching.')
    parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -12,7 +12,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 # Check the compile flags
 #
-list(APPEND CXX_COMPILE_FLAGS
+list(APPEND CXX_COMPILE_FLAGS 
    "-fopenmp"
    "-DVLLM_CPU_EXTENSION")

@ -33,23 +33,9 @@ function (find_isa CPUINFO TARGET OUT)
    endif()
 endfunction()

-function (is_avx512_disabled OUT)
-    set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512})
-    if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true")
-        set(${OUT} ON PARENT_SCOPE)
-    else()
-        set(${OUT} OFF PARENT_SCOPE)
-    endif()
-endfunction()
-
-is_avx512_disabled(AVX512_DISABLED)
-
-find_isa(${CPUINFO} "avx2" AVX2_FOUND)
 find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
-find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
-find_isa(${CPUINFO} "POWER9" POWER9_FOUND)

-if (AVX512_FOUND AND NOT AVX512_DISABLED)
+if (AVX512_FOUND)
    list(APPEND CXX_COMPILE_FLAGS
        "-mavx512f"
        "-mavx512vl"
@ -58,8 +44,8 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)

    find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
    if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
-            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND 
+            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) 
            list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
        else()
            message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
@ -67,18 +53,8 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
    else()
        message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
    endif()
-elseif (AVX2_FOUND)
-    list(APPEND CXX_COMPILE_FLAGS "-mavx2")
-    message(WARNING "vLLM CPU backend using AVX2 ISA")
-elseif (POWER9_FOUND OR POWER10_FOUND)
-    message(STATUS "PowerPC detected")
-    # Check for PowerPC VSX support
-    list(APPEND CXX_COMPILE_FLAGS
-        "-mvsx"
-        "-mcpu=native"
-        "-mtune=native")
 else()
-    message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
+    message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
 endif()

 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@ -97,7 +73,7 @@ set(VLLM_EXT_SRC
    "csrc/cpu/cache.cpp"
    "csrc/cpu/layernorm.cpp"
    "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/torch_bindings.cpp")
+    "csrc/cpu/pybind.cpp")

 define_gpu_extension_target(
    _C
@ -105,10 +81,10 @@ define_gpu_extension_target(
    LANGUAGE CXX
    SOURCES ${VLLM_EXT_SRC}
    COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
-    USE_SABI 3
-    WITH_SOABI
+    WITH_SOABI 
 )

 add_custom_target(default)
 message(STATUS "Enabling C extension.")
 add_dependencies(default _C)
+
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -5,7 +5,7 @@
 macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
  set(Python_EXECUTABLE ${EXECUTABLE})
-  find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
+  find_package(Python COMPONENTS Interpreter Development.Module)
  if (NOT Python_FOUND)
    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
  endif()
@ -147,23 +147,16 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
  if (${GPU_LANG} STREQUAL "HIP")
    #
    # `GPU_ARCHES` controls the `--offload-arch` flags.
+    # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
+    # via the `PYTORCH_ROCM_ARCH` env variable.
    #
-    # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
-    # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
-    # "rocm_agent_enumerator" in "enable_language(HIP)"
-    # (in file Modules/CMakeDetermineHIPCompiler.cmake)
-    #
-    if(DEFINED ENV{PYTORCH_ROCM_ARCH})
-      set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
-    else()
-      set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
-    endif()
+
    #
    # Find the intersection of the supported + detected architectures to
    # set the module architecture flags.
    #
    set(${GPU_ARCHES})
-    foreach (_ARCH ${HIP_ARCHITECTURES})
+    foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
      if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        list(APPEND ${GPU_ARCHES} ${_ARCH})
      endif()
@ -171,7 +164,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)

    if(NOT ${GPU_ARCHES})
      message(FATAL_ERROR
-        "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
+        "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()

@ -301,7 +294,6 @@ endmacro()
 # INCLUDE_DIRECTORIES <dirs> - Extra include directories.
 # LIBRARIES <libraries>      - Extra link libraries.
 # WITH_SOABI                 - Generate library with python SOABI suffix name.
-# USE_SABI <version>         - Use python stable api <version>
 #
 # Note: optimization level/debug info is set via cmake build type.
 #
@ -309,7 +301,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
  cmake_parse_arguments(PARSE_ARGV 1
    GPU
    "WITH_SOABI"
-    "DESTINATION;LANGUAGE;USE_SABI"
+    "DESTINATION;LANGUAGE"
    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")

  # Add hipify preprocessing step when building with HIP/ROCm.
@ -323,11 +315,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
    set(GPU_WITH_SOABI)
  endif()

-  if (GPU_USE_SABI)
-    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
-  else()
-    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
-  endif()
+  Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})

  if (GPU_LANGUAGE STREQUAL "HIP")
    # Make this target dependent on the hipify preprocessor step.
--- a/collect_env.py
+++ b/collect_env.py
@ -64,7 +64,6 @@ DEFAULT_CONDA_PATTERNS = {
    "triton",
    "optree",
    "nccl",
-    "transformers",
 }

 DEFAULT_PIP_PATTERNS = {
@ -76,7 +75,6 @@ DEFAULT_PIP_PATTERNS = {
    "optree",
    "onnx",
    "nccl",
-    "transformers",
 }


@ -603,11 +601,6 @@ Versions of relevant libraries:
 {conda_packages}
 """.strip()

-# both the above code and the following code use `strip()` to
-# remove leading/trailing whitespaces, so we need to add a newline
-# in between to separate the two sections
-env_info_fmt += "\n"
-
 env_info_fmt += """
 ROCM Version: {rocm_version}
 Neuron SDK Version: {neuron_sdk_version}
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@ -1,5 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>

 #include <cmath>
@ -135,12 +135,6 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
  return ((T)0.5) * x * (((T)1.0) + t);
 }

-template <typename T>
-__device__ __forceinline__ T gelu_quick_kernel(const T& x) {
-  // x * sigmoid(1.702 * x)
-  return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x)));
-}
-
 }  // namespace vllm

 void gelu_new(torch::Tensor& out,    // [..., d]
@ -154,9 +148,3 @@ void gelu_fast(torch::Tensor& out,    // [..., d]
 {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
 }
-
-void gelu_quick(torch::Tensor& out,    // [..., d]
-                torch::Tensor& input)  // [..., d]
-{
-  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel);
-}
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@ -17,7 +17,7 @@
 * limitations under the License.
 */

-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <algorithm>
@ -808,17 +808,16 @@ void paged_attention_v1(
    torch::Tensor&
        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
    torch::Tensor&
-        value_cache,       // [num_blocks, num_heads, head_size, block_size]
-    int64_t num_kv_heads,  // [num_heads]
-    double scale,
+        value_cache,   // [num_blocks, num_heads, head_size, block_size]
+    int num_kv_heads,  // [num_heads]
+    float scale,
    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor& seq_lens,      // [num_seqs]
-    int64_t block_size, int64_t max_seq_len,
+    int block_size, int max_seq_len,
    const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  const bool is_block_sparse = (blocksparse_vert_stride > 1);

  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
@ -973,17 +972,16 @@ void paged_attention_v2(
    torch::Tensor&
        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
    torch::Tensor&
-        value_cache,       // [num_blocks, num_heads, head_size, block_size]
-    int64_t num_kv_heads,  // [num_heads]
-    double scale,
+        value_cache,   // [num_blocks, num_heads, head_size, block_size]
+    int num_kv_heads,  // [num_heads]
+    float scale,
    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
    torch::Tensor& seq_lens,      // [num_seqs]
-    int64_t block_size, int64_t max_seq_len,
+    int block_size, int max_seq_len,
    const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  const bool is_block_sparse = (blocksparse_vert_stride > 1);
  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
                             CALL_V2_LAUNCHER_BLOCK_SIZE)
@ -992,4 +990,4 @@ void paged_attention_v2(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
+#undef DIVIDE_ROUND_UP
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -1,6 +1,6 @@
 #pragma once

-#include <torch/all.h>
+#include <torch/extension.h>

 #include <map>
 #include <vector>
@ -8,18 +8,14 @@
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                 const torch::Tensor& block_mapping);

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
+void copy_blocks(std::vector<torch::Tensor>& key_caches,
+                 std::vector<torch::Tensor>& value_caches,
                 const torch::Tensor& block_mapping);

 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype,
-                       const double kv_scale);
+                       const std::string& kv_cache_dtype, const float kv_scale);

 void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
                             torch::Tensor& key_cache,
@ -29,4 +25,4 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,

 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
-                 const double scale, const std::string& kv_cache_dtype);
+                 const float scale, const std::string& kv_cache_dtype);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

@ -95,11 +95,8 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,

 }  // namespace vllm

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
+void copy_blocks(std::vector<torch::Tensor>& key_caches,
+                 std::vector<torch::Tensor>& value_caches,
                 const torch::Tensor& block_mapping) {
  int num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
@ -258,7 +255,7 @@ void reshape_and_cache(
    torch::Tensor&
        value_cache,  // [num_blocks, num_heads, head_size, block_size]
    torch::Tensor& slot_mapping,  // [num_tokens]
-    const std::string& kv_cache_dtype, const double kv_scale) {
+    const std::string& kv_cache_dtype, const float kv_scale) {
  int num_tokens = key.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
@ -337,7 +334,7 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,

 // Only for testing.
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
-                 const double kv_scale, const std::string& kv_cache_dtype) {
+                 const float kv_scale, const std::string& kv_cache_dtype) {
  torch::Device src_device = src_cache.device();
  torch::Device dst_device = dst_cache.device();
  TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")
--- a/csrc/cpu/activation.cpp
+++ b/csrc/cpu/activation.cpp
@ -59,13 +59,6 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) {
  return w3 * x * (ones + t);
 }

-FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) {
-  const vec_op::FP32Vec8 zeros(0.0);
-  const vec_op::FP32Vec8 ones(1.0);
-  const vec_op::FP32Vec8 w1(1.702f);
-  return x / (ones + (zeros - w1 * x).exp());
-}
-
 FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(M_SQRT1_2);
@ -149,15 +142,3 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) {
    CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
  });
 }
-
-void gelu_quick(torch::Tensor& out, torch::Tensor& input) {
-  int num_tokens = input.numel() / input.size(-1);
-  int d = input.size(-1);
-
-  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] {
-    CPU_KERNEL_GUARD_IN(gelu_quick_impl)
-    activation_kernel<scalar_t, gelu_quick_act, false>(
-        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
-    CPU_KERNEL_GUARD_OUT(gelu_quick_impl)
-  });
-}
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@ -420,13 +420,12 @@ void paged_attention_v1_impl_launcher(

 void paged_attention_v1(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  TORCH_CHECK(kv_scale == 1.0f);
  TORCH_CHECK(blocksparse_vert_stride <= 1,
              "CPU backend does not support blocksparse attention yet.");
@ -739,13 +738,12 @@ void paged_attention_v2_impl_launcher(
 void paged_attention_v2(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step) {
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
  TORCH_CHECK(kv_scale == 1.0f);
  TORCH_CHECK(blocksparse_vert_stride <= 1,
              "CPU backend does not support blocksparse attention yet.");
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@ -5,8 +5,8 @@

 namespace {
 template <typename scalar_t>
-void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
-                          std::vector<torch::Tensor> const& value_caches,
+void copy_blocks_cpu_impl(std::vector<torch::Tensor>& key_caches,
+                          std::vector<torch::Tensor>& value_caches,
                          const torch::Tensor& mapping_pairs,
                          const int element_num_per_block,
                          const int layer_num) {
@ -82,11 +82,8 @@ void reshape_and_cache_cpu_impl(
 }
 };  // namespace

-// Note: the key_caches and value_caches vectors are constant but
-// not the Tensors they contain. The vectors need to be const refs
-// in order to satisfy pytorch's C++ operator registration code.
-void copy_blocks(std::vector<torch::Tensor> const& key_caches,
-                 std::vector<torch::Tensor> const& value_caches,
+void copy_blocks(std::vector<torch::Tensor>& key_caches,
+                 std::vector<torch::Tensor>& value_caches,
                 const torch::Tensor& block_mapping) {
  unsigned num_layers = key_caches.size();
  TORCH_CHECK(num_layers == value_caches.size());
@ -107,7 +104,7 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                       torch::Tensor& key_cache, torch::Tensor& value_cache,
                       torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype, double kv_scale) {
+                       const std::string& kv_cache_dtype, float kv_scale) {
  TORCH_CHECK(kv_scale == 1.0f);

  int num_tokens = key.size(0);
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@ -2,14 +2,351 @@
 #ifndef CPU_TYPES_HPP
 #define CPU_TYPES_HPP

-#if defined(__x86_64__)
-  //x86 implementation
-  #include "cpu_types_x86.hpp"
-#elif defined(__POWER9_VECTOR__)
-  //ppc implementation
-  #include "cpu_types_vsx.hpp"
+#include <immintrin.h>
+#include <torch/extension.h>
+
+namespace vec_op {
+
+// FIXME: FP16 is not fully supported in Torch-CPU
+#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
+  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
+  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+
+#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
+  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
+
+#ifndef CPU_OP_GUARD
+#define CPU_KERNEL_GUARD_IN(NAME)
+#define CPU_KERNEL_GUARD_OUT(NAME)
 #else
-  #warning "unsupported vLLM cpu implementation"
+#define CPU_KERNEL_GUARD_IN(NAME)                                              \
+  std::cout << #NAME << " invoked." << std::endl;
+#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
 #endif

+#define FORCE_INLINE __attribute__((always_inline)) inline
+
+namespace {
+template <typename T, T... indexes, typename F>
+constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
+  (f(std::integral_constant<T, indexes>{}), ...);
+}
+}; // namespace
+
+template <typename T, T count, typename F,
+          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
+constexpr void unroll_loop(F &&f) {
+  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
+}
+
+template <typename T> struct Vec {
+  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
+};
+
+struct FP32Vec8;
+struct FP32Vec16;
+
+#ifdef __AVX512FP16__
+struct FP16Vec8 : public Vec<FP16Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+
+  __m128h reg;
+
+  explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
+
+  explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
+
+  explicit FP16Vec8(__m128h data) : reg(data) {}
+
+  FP16Vec8 operator*(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_mul_ph(reg, b.reg));
+  }
+
+  FP16Vec8 operator+(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_add_ph(reg, b.reg));
+  }
+
+  FP16Vec8 operator-(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_sub_ph(reg, b.reg));
+  }
+
+  FP16Vec8 operator/(const FP16Vec8 &b) const {
+    return FP16Vec8(_mm_div_ph(reg, b.reg));
+  }
+
+  void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
+};
+#endif
+
+struct BF16Vec8 : public Vec<BF16Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+
+  __m128i reg;
+
+  explicit BF16Vec8(const void *ptr)
+      : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
+
+  explicit BF16Vec8(const FP32Vec8 &);
+
+  void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
+};
+
+struct BF16Vec16 : public Vec<BF16Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+
+  __m256i reg;
+
+  explicit BF16Vec16(const void *ptr)
+      : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
+
+  explicit BF16Vec16(const FP32Vec16 &);
+
+  void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
+};
+
+struct BF16Vec32 : public Vec<BF16Vec32> {
+  constexpr static int VEC_ELEM_NUM = 32;
+
+  __m512i reg;
+
+  explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
+
+  explicit BF16Vec32(__m512i data) : reg(data) {}
+
+  explicit BF16Vec32(BF16Vec8 &vec8_data)
+      : reg((__m512i)_mm512_inserti32x4(
+            _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
+                                                      (__m128i)vec8_data.reg),
+                                                  (__m128i)vec8_data.reg, 1),
+                               (__m128i)vec8_data.reg, 2),
+            (__m128i)vec8_data.reg, 3)) {}
+
+  void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
+};
+
+struct FP32Vec4 : public Vec<FP32Vec4> {
+  constexpr static int VEC_ELEM_NUM = 4;
+  union AliasReg {
+    __m128 reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  __m128 reg;
+
+  explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
+
+  explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
+
+  explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
+
+  explicit FP32Vec4(__m128 data) : reg(data) {}
+
+  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
+};
+
+struct FP32Vec8 : public Vec<FP32Vec8> {
+  constexpr static int VEC_ELEM_NUM = 8;
+  union AliasReg {
+    __m256 reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  __m256 reg;
+
+  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}
+
+  explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
+
+  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
+
+  explicit FP32Vec8(__m256 data) : reg(data) {}
+
+  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
+
+#ifdef __AVX512FP16__
+  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
+#endif
+
+  explicit FP32Vec8(const BF16Vec8 &v)
+      : reg(_mm256_castsi256_ps(
+            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
+
+  float reduce_sum() const {
+    AliasReg ar;
+    ar.reg = reg;
+    float result = 0;
+    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
+
+    return result;
+  }
+
+  FP32Vec8 exp() const {
+    AliasReg ar;
+    ar.reg = reg;
+    return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
+                                  expf(ar.values[5]), expf(ar.values[4]),
+                                  expf(ar.values[3]), expf(ar.values[2]),
+                                  expf(ar.values[1]), expf(ar.values[0])));
+  }
+
+  FP32Vec8 tanh() const {
+    AliasReg ar;
+    ar.reg = reg;
+    return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
+                                  tanhf(ar.values[5]), tanhf(ar.values[4]),
+                                  tanhf(ar.values[3]), tanhf(ar.values[2]),
+                                  tanhf(ar.values[1]), tanhf(ar.values[0])));
+  }
+
+  FP32Vec8 er() const {
+    AliasReg ar;
+    ar.reg = reg;
+    return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
+                                  erf(ar.values[5]), erf(ar.values[4]),
+                                  erf(ar.values[3]), erf(ar.values[2]),
+                                  erf(ar.values[1]), erf(ar.values[0])));
+  }
+
+  FP32Vec8 operator*(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
+  }
+
+  FP32Vec8 operator+(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_add_ps(reg, b.reg));
+  }
+
+  FP32Vec8 operator-(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
+  }
+
+  FP32Vec8 operator/(const FP32Vec8 &b) const {
+    return FP32Vec8(_mm256_div_ps(reg, b.reg));
+  }
+
+  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
+};
+
+struct FP32Vec16 : public Vec<FP32Vec16> {
+  constexpr static int VEC_ELEM_NUM = 16;
+  union AliasReg {
+    __m512 reg;
+    float values[VEC_ELEM_NUM];
+  };
+
+  __m512 reg;
+
+  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}
+
+  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
+
+  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}
+
+  explicit FP32Vec16(__m512 data) : reg(data) {}
+
+  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
+
+  explicit FP32Vec16(const FP32Vec4 &data)
+      : reg((__m512)_mm512_inserti32x4(
+            _mm512_inserti32x4(
+                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
+                                   (__m128i)data.reg, 1),
+                (__m128i)data.reg, 2),
+            (__m128i)data.reg, 3)) {}
+
+  explicit FP32Vec16(const FP32Vec8 &data)
+      : reg((__m512)_mm512_inserti32x8(
+            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}
+
+  explicit FP32Vec16(const BF16Vec16 &v)
+      : reg(_mm512_castsi512_ps(
+            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
+
+  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+
+  FP32Vec16 operator*(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
+  }
+
+  FP32Vec16 operator+(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_add_ps(reg, b.reg));
+  }
+
+  FP32Vec16 operator-(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
+  }
+
+  FP32Vec16 operator/(const FP32Vec16 &b) const {
+    return FP32Vec16(_mm512_div_ps(reg, b.reg));
+  }
+
+  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
+
+  template <int group_size> float reduce_sub_sum(int idx) {
+    static_assert(VEC_ELEM_NUM % group_size == 0);
+    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
+    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
+    return _mm512_mask_reduce_add_ps(mask, reg);
+  }
+
+  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
+};
+
+template <typename T> struct VecType { using vec_type = void; };
+
+template <typename T> using vec_t = typename VecType<T>::vec_type;
+
+template <> struct VecType<float> { using vec_type = FP32Vec8; };
+
+#ifdef __AVX512FP16__
+template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
+#endif
+
+template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
+
+template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
+
+#ifdef __AVX512FP16__
+template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
+  *reinterpret_cast<_Float16 *>(ptr) = v;
+}
+#endif
+
+inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+  acc = acc + a * b;
+}
+
+#ifdef __AVX512BF16__
+template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
+  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
+}
+
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
+    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
+
+inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
+  acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
+}
+#else
+template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
+  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
+      reinterpret_cast<c10::BFloat16 *>(&v);
+  *ptr = *(v_ptr + 1);
+}
+
+inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
+    : reg(_mm256_cvtepi32_epi16(
+          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
+
+inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
+    : reg(_mm512_cvtepi32_epi16(
+          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
+#endif
+
+inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
+
+}; // namespace vec_op
+
 #endif
--- a/csrc/cpu/cpu_types_vsx.hpp
+++ b/csrc/cpu/cpu_types_vsx.hpp
@ -1,491 +0,0 @@
-
-#ifndef CPU_TYPES_VSX_HPP
-#define CPU_TYPES_VSX_HPP
-
-#include <altivec.h>
-#include <cmath>
-#include <torch/all.h>
-
-namespace vec_op {
-
-// FIXME: FP16 is not fully supported in Torch-CPU
-#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
-  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
-  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
-
-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
-  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
-
-#ifndef CPU_OP_GUARD
-#define CPU_KERNEL_GUARD_IN(NAME)
-#define CPU_KERNEL_GUARD_OUT(NAME)
-#else
-#define CPU_KERNEL_GUARD_IN(NAME)                                              \
-  std::cout << #NAME << " invoked." << std::endl;
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
-#endif
-
-#define FORCE_INLINE __attribute__((always_inline)) inline
-
-namespace {
-template <typename T, T... indexes, typename F>
-constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
-  (f(std::integral_constant<T, indexes>{}), ...);
-}
-}; // namespace
-
-template <typename T, T count, typename F,
-          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
-constexpr void unroll_loop(F &&f) {
-  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
-}
-
-template <typename T> struct Vec {
-  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
-};
-
-typedef struct ss16x8x2_t {
-  __vector signed short val[2];
-} ss16x8x2_t;
-
-typedef struct ss16x8x4_t {
-  __vector signed short val[4];
-} ss16x8x4_t;
-
-typedef struct f32x4x2_t {
-  __vector float val[2];
-} f32x4x2_t;
-
-typedef struct f32x4x4_t {
-  __vector float val[4];
-} f32x4x4_t;
-
-struct FP32Vec8;
-struct FP32Vec16;
-
-struct BF16Vec8 : public Vec<BF16Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-
-  __vector signed short reg;
-
-  explicit BF16Vec8(const void *ptr)
-      : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {}
-
-  explicit BF16Vec8(const FP32Vec8 &);
-
-  void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; }
-};
-
-struct BF16Vec16 : public Vec<BF16Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-
-  ss16x8x2_t reg;
-
-  explicit BF16Vec16(const void *ptr) {
-    // Load 256 bits in two parts
-    reg.val[0] = (__vector signed short)vec_xl(0,  (signed short *)ptr);
-    reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr);
-  }
-
-  explicit BF16Vec16(const FP32Vec16 &);
-
-  void save(void *ptr) const {
-    // Save 256 bits in two parts
-    vec_xst(reg.val[0], 0, (signed short *)ptr);
-    vec_xst(reg.val[1], 16, (signed short *)ptr);
-  }
-};
-
-const static __vector signed short zero = vec_splats((signed short)0);
-
-struct BF16Vec32 : public Vec<BF16Vec32> {
-  constexpr static int VEC_ELEM_NUM = 32;
-
-  ss16x8x4_t reg;
-  explicit BF16Vec32(const void *ptr)
-      : reg(*reinterpret_cast<const ss16x8x4_t *>(ptr)) {}
-
-  explicit BF16Vec32(ss16x8x4_t data) : reg(data) {}
-
-  explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({
-    vec8_data.reg,
-    vec8_data.reg,
-    vec8_data.reg,
-    vec8_data.reg
-  }) {}
-
-  void save(void *ptr) const { *reinterpret_cast<ss16x8x4_t *>(ptr) = reg; }
-};
-
-struct FP32Vec4 : public Vec<FP32Vec4> {
-  constexpr static int VEC_ELEM_NUM = 4;
-  union AliasReg {
-    __vector float reg;
-    float values[VEC_ELEM_NUM];
-  };
-
-  __vector float reg;
-
-  explicit FP32Vec4(float v) : reg(vec_splats(v)) {}
-
-  explicit FP32Vec4() : reg(vec_splats(0.0f)) {}
-
-  explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {}
-
-  explicit FP32Vec4(__vector float data) : reg(data) {}
-
-  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
-};
-
-struct FP32Vec8 : public Vec<FP32Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-  union AliasReg {
-    f32x4x2_t reg;
-    float values[VEC_ELEM_NUM];
-  };
-
-  f32x4x2_t reg;
-
-  explicit FP32Vec8(float v) {
-    reg.val[0] = vec_splats(v);
-    reg.val[1] = vec_splats(v);
-  }
-
-  explicit FP32Vec8() {
-    reg.val[0] = vec_splats(0.0f);
-    reg.val[1] = vec_splats(0.0f);
-  }
-
-  explicit FP32Vec8(const float *ptr) {
-    reg.val[0] = vec_xl(0, ptr);
-    reg.val[1] = vec_xl(16, ptr);
-  }
-
-  explicit FP32Vec8(f32x4x2_t data) : reg(data) {}
-
-  explicit FP32Vec8(const FP32Vec8 &data) {
-    reg.val[0] = data.reg.val[0];
-    reg.val[1] = data.reg.val[1];
-  }
-
-  explicit FP32Vec8(const BF16Vec8 &v) {
-    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
-    reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
-  }
-
-  float reduce_sum() const {
-    AliasReg ar;
-    ar.reg = reg;
-    float result = 0;
-    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
-
-    return result;
-  }
-
-  FP32Vec8 exp() const {
-    // TODO: Vectorize this
-    AliasReg ar;
-    ar.reg = reg;
-    f32x4x4_t ret;
-    ret.val[0][0] = std::exp(ar.values[0]);
-    ret.val[0][1] = std::exp(ar.values[1]);
-    ret.val[0][2] = std::exp(ar.values[2]);
-    ret.val[0][3] = std::exp(ar.values[3]);
-    ret.val[1][0] = std::exp(ar.values[4]);
-    ret.val[1][1] = std::exp(ar.values[5]);
-    ret.val[1][2] = std::exp(ar.values[6]);
-    ret.val[1][3] = std::exp(ar.values[7]);
-    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
-  }
-
-  FP32Vec8 tanh() const {
-    // TODO: Vectorize this
-    AliasReg ar;
-    ar.reg = reg;
-    f32x4x4_t ret;
-    ret.val[0][0] = std::tanh(ar.values[0]);
-    ret.val[0][1] = std::tanh(ar.values[1]);
-    ret.val[0][2] = std::tanh(ar.values[2]);
-    ret.val[0][3] = std::tanh(ar.values[3]);
-    ret.val[1][0] = std::tanh(ar.values[4]);
-    ret.val[1][1] = std::tanh(ar.values[5]);
-    ret.val[1][2] = std::tanh(ar.values[6]);
-    ret.val[1][3] = std::tanh(ar.values[7]);
-    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
-  }
-
-  FP32Vec8 er() const {
-    // TODO: Vectorize this
-    AliasReg ar;
-    ar.reg = reg;
-    f32x4x4_t ret;
-    ret.val[0][0] = std::erf(ar.values[0]);
-    ret.val[0][1] = std::erf(ar.values[1]);
-    ret.val[0][2] = std::erf(ar.values[2]);
-    ret.val[0][3] = std::erf(ar.values[3]);
-    ret.val[1][0] = std::erf(ar.values[4]);
-    ret.val[1][1] = std::erf(ar.values[5]);
-    ret.val[1][2] = std::erf(ar.values[6]);
-    ret.val[1][3] = std::erf(ar.values[7]);
-    return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
-  }
-
-  FP32Vec8 operator*(const FP32Vec8 &b) const {
-    return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])});
-  }
-
-  FP32Vec8 operator+(const FP32Vec8 &b) const {
-    return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])});
-  }
-
-  FP32Vec8 operator-(const FP32Vec8 &b) const {
-    return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])});
-  }
-
-  FP32Vec8 operator/(const FP32Vec8 &b) const {
-    return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])});
-  }
-
-  void save(float *ptr) const {
-    vec_xst(reg.val[0], 0, ptr);
-    vec_xst(reg.val[1], 16, ptr);
-  }
-};
-
-struct FP32Vec16 : public Vec<FP32Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-  union AliasReg {
-    f32x4x4_t reg;
-    float values[VEC_ELEM_NUM];
-  };
-
-  f32x4x4_t reg;
-
-  explicit FP32Vec16(float v) {
-    reg.val[0] = vec_splats(v);
-    reg.val[1] = vec_splats(v);
-    reg.val[2] = vec_splats(v);
-    reg.val[3] = vec_splats(v);
-  }
-
-  explicit FP32Vec16() {
-    reg.val[0] = vec_splats(0.0f);
-    reg.val[1] = vec_splats(0.0f);
-    reg.val[2] = vec_splats(0.0f);
-    reg.val[3] = vec_splats(0.0f);
-  }
-
-  explicit FP32Vec16(const float *ptr) {
-    reg.val[0] = vec_xl(0, ptr);
-    reg.val[1] = vec_xl(16, ptr);
-    reg.val[2] = vec_xl(32, ptr);
-    reg.val[3] = vec_xl(48, ptr);
-  }
-
-  explicit FP32Vec16(f32x4x4_t data) : reg(data) {}
-
-  explicit FP32Vec16(const FP32Vec16 &data) {
-    reg.val[0] = data.reg.val[0];
-    reg.val[1] = data.reg.val[1];
-    reg.val[2] = data.reg.val[2];
-    reg.val[3] = data.reg.val[3];
-  }
-
-  explicit FP32Vec16(const FP32Vec4 &data) {
-    reg.val[0] = data.reg;
-    reg.val[1] = data.reg;
-    reg.val[2] = data.reg;
-    reg.val[3] = data.reg;
-  }
-
-  explicit FP32Vec16(const FP32Vec8 &data) {
-    reg.val[0] = data.reg.val[0];
-    reg.val[1] = data.reg.val[1];
-    reg.val[2] = data.reg.val[0];
-    reg.val[3] = data.reg.val[1];
-  }
-
-  explicit FP32Vec16(const BF16Vec16 &v) {
-    reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
-    reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
-    reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
-    reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
-  }
-
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
-
-  FP32Vec16 operator*(const FP32Vec16 &b) const {
-    return FP32Vec16(f32x4x4_t({
-        vec_mul(reg.val[0], b.reg.val[0]),
-        vec_mul(reg.val[1], b.reg.val[1]),
-        vec_mul(reg.val[2], b.reg.val[2]),
-        vec_mul(reg.val[3], b.reg.val[3])}));
-  }
-
-  FP32Vec16 operator+(const FP32Vec16 &b) const {
-    return FP32Vec16(f32x4x4_t({
-        vec_add(reg.val[0], b.reg.val[0]),
-        vec_add(reg.val[1], b.reg.val[1]),
-        vec_add(reg.val[2], b.reg.val[2]),
-        vec_add(reg.val[3], b.reg.val[3])}));
-  }
-
-  FP32Vec16 operator-(const FP32Vec16 &b) const {
-    return FP32Vec16(f32x4x4_t({
-        vec_sub(reg.val[0], b.reg.val[0]),
-        vec_sub(reg.val[1], b.reg.val[1]),
-        vec_sub(reg.val[2], b.reg.val[2]),
-        vec_sub(reg.val[3], b.reg.val[3])}));
-  }
-
-  FP32Vec16 operator/(const FP32Vec16 &b) const {
-    return FP32Vec16(f32x4x4_t({
-        vec_div(reg.val[0], b.reg.val[0]),
-        vec_div(reg.val[1], b.reg.val[1]),
-        vec_div(reg.val[2], b.reg.val[2]),
-        vec_div(reg.val[3], b.reg.val[3])}));
-  }
-
-  float reduce_sum() const {
-    AliasReg ar;
-    ar.reg = reg;
-    float result = 0;
-    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
-
-    return result;
-  }
-
-  template <int group_size> float reduce_sub_sum(int idx) {
-    static_assert(VEC_ELEM_NUM % group_size == 0);
-
-    AliasReg ar;
-    ar.reg = reg;
-    float result = 0;
-    const int start = idx * group_size;
-    unroll_loop<int, group_size>(
-        [&result, &start, ar](int i) { result += ar.values[start + i]; });
-
-    return result;
-  }
-
-  void save(float *ptr) const {
-    vec_xst(reg.val[0], 0, ptr);
-    vec_xst(reg.val[1], 16, ptr);
-    vec_xst(reg.val[2], 32, ptr);
-    vec_xst(reg.val[3], 48, ptr);
-  }
-};
-
-template <typename T> struct VecType { using vec_type = void; };
-
-template <typename T> using vec_t = typename VecType<T>::vec_type;
-
-template <> struct VecType<float> { using vec_type = FP32Vec8; };
-
-template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
-
-template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
-
-inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
-  acc = acc + a * b;
-}
-
-template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
-  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
-      reinterpret_cast<c10::BFloat16 *>(&v);
-  *ptr = *(v_ptr + 1);
-}
-
-#ifndef __VEC_CLASS_FP_NAN
-#define __VEC_CLASS_FP_NAN (1 << 6)
-#endif
-
-const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
-#ifndef _ARCH_PWR10
-const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff };
-const static __vector unsigned int nan  = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 };
-const static __vector unsigned int sh16 = { 16, 16, 16, 16 };
-const static __vector unsigned int one  = { 1, 1, 1, 1 };
-#endif
-
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) {
-#ifdef _ARCH_PWR10
-  __vector signed short ret[2];
-  ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
-  ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
-  reg = vec_perm(ret[0], ret[1], omask);
-#elif defined(_ARCH_PWR9)
-  __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
-  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
-  __vector unsigned int lsb0 = vec_sr(inp0, sh16);
-  __vector unsigned int lsb1 = vec_sr(inp1, sh16);
-  lsb0 = vec_and(lsb0, one);
-  lsb1 = vec_and(lsb1, one);
-  __vector unsigned int rnd0 = vec_add(lsb0, bias);
-  __vector unsigned int rnd1 = vec_add(lsb1, bias);
-  inp0 = vec_add(inp0, rnd0);
-  inp1 = vec_add(inp1, rnd1);
-  __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
-  __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
-  inp0 = vec_sel(inp0, nan, sel0);
-  inp1 = vec_sel(inp1, nan, sel1);
-  inp0 = vec_sr(inp0, sh16);
-  inp1 = vec_sr(inp1, sh16);
-  reg = (__vector signed short)vec_perm(inp0, inp1, omask);
-#endif
-}
-
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
-#ifdef _ARCH_PWR10
-  __vector signed short ret[4];
-  ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]);
-  ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]);
-  ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]);
-  ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]);
-  reg.val[0] = vec_perm(ret[0], ret[1], omask);
-  reg.val[1] = vec_perm(ret[2], ret[3], omask);
-#elif defined(_ARCH_PWR9)
-  __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
-  __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
-  __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
-  __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
-  __vector unsigned int lsb0 = vec_sr(inp0, sh16);
-  __vector unsigned int lsb1 = vec_sr(inp1, sh16);
-  __vector unsigned int lsb2 = vec_sr(inp2, sh16);
-  __vector unsigned int lsb3 = vec_sr(inp3, sh16);
-  lsb0 = vec_and(lsb0, one);
-  lsb1 = vec_and(lsb1, one);
-  lsb2 = vec_and(lsb2, one);
-  lsb3 = vec_and(lsb3, one);
-  __vector unsigned int rnd0 = vec_add(lsb0, bias);
-  __vector unsigned int rnd1 = vec_add(lsb1, bias);
-  __vector unsigned int rnd2 = vec_add(lsb2, bias);
-  __vector unsigned int rnd3 = vec_add(lsb3, bias);
-  inp0 = vec_add(inp0, rnd0);
-  inp1 = vec_add(inp1, rnd1);
-  inp2 = vec_add(inp2, rnd2);
-  inp3 = vec_add(inp3, rnd3);
-  __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN);
-  __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN);
-  __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN);
-  __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN);
-  inp0 = vec_sel(inp0, nan, sel0);
-  inp1 = vec_sel(inp1, nan, sel1);
-  inp2 = vec_sel(inp2, nan, sel2);
-  inp3 = vec_sel(inp3, nan, sel3);
-  inp0 = vec_sr(inp0, sh16);
-  inp1 = vec_sr(inp1, sh16);
-  inp2 = vec_sr(inp2, sh16);
-  inp3 = vec_sr(inp3, sh16);
-  reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
-  reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
-#endif
-}
-
-inline void prefetch(const void *addr) {
-  __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory");
-}
-
-}; // namespace vec_op
-
-#endif
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@ -1,515 +0,0 @@
-
-#ifndef CPU_TYPES_X86_HPP
-#define CPU_TYPES_X86_HPP
-
-#include <immintrin.h>
-#include <torch/all.h>
-
-#ifndef __AVX2__
-static_assert(false, "AVX2 must be supported for the current implementation.");
-#endif
-
-namespace vec_op {
-
-// FIXME: FP16 is not fully supported in Torch-CPU
-#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)                                 \
-  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)                         \
-  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
-
-#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
-  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
-
-#ifndef CPU_OP_GUARD
-#define CPU_KERNEL_GUARD_IN(NAME)
-#define CPU_KERNEL_GUARD_OUT(NAME)
-#else
-#define CPU_KERNEL_GUARD_IN(NAME)                                              \
-  std::cout << #NAME << " invoked." << std::endl;
-#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
-#endif
-
-#define FORCE_INLINE __attribute__((always_inline)) inline
-
-namespace {
-template <typename T, T... indexes, typename F>
-constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
-  (f(std::integral_constant<T, indexes>{}), ...);
-}
-}; // namespace
-
-template <typename T, T count, typename F,
-          typename = std::enable_if_t<std::is_invocable_v<F, T>>>
-constexpr void unroll_loop(F &&f) {
-  unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
-}
-
-template <typename T> struct Vec {
-  constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
-};
-
-struct FP32Vec8;
-struct FP32Vec16;
-
-#ifdef __AVX512FP16__
-struct FP16Vec8 : public Vec<FP16Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-
-  __m128h reg;
-
-  explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
-
-  explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
-
-  explicit FP16Vec8(__m128h data) : reg(data) {}
-
-  FP16Vec8 operator*(const FP16Vec8 &b) const {
-    return FP16Vec8(_mm_mul_ph(reg, b.reg));
-  }
-
-  FP16Vec8 operator+(const FP16Vec8 &b) const {
-    return FP16Vec8(_mm_add_ph(reg, b.reg));
-  }
-
-  FP16Vec8 operator-(const FP16Vec8 &b) const {
-    return FP16Vec8(_mm_sub_ph(reg, b.reg));
-  }
-
-  FP16Vec8 operator/(const FP16Vec8 &b) const {
-    return FP16Vec8(_mm_div_ph(reg, b.reg));
-  }
-
-  void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
-};
-#endif
-
-struct BF16Vec8 : public Vec<BF16Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-
-  __m128i reg;
-
-  explicit BF16Vec8(const void *ptr)
-      : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
-
-  explicit BF16Vec8(const FP32Vec8 &);
-
-  void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
-};
-
-struct BF16Vec16 : public Vec<BF16Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-
-  __m256i reg;
-
-  explicit BF16Vec16(const void *ptr)
-      : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
-
-  explicit BF16Vec16(const FP32Vec16 &);
-
-  void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
-};
-
-#ifdef __AVX512F__
-struct BF16Vec32 : public Vec<BF16Vec32> {
-  constexpr static int VEC_ELEM_NUM = 32;
-
-  __m512i reg;
-
-  explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
-
-  explicit BF16Vec32(__m512i data) : reg(data) {}
-
-  explicit BF16Vec32(BF16Vec8 &vec8_data)
-      : reg((__m512i)_mm512_inserti32x4(
-            _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
-                                                      (__m128i)vec8_data.reg),
-                                                  (__m128i)vec8_data.reg, 1),
-                               (__m128i)vec8_data.reg, 2),
-            (__m128i)vec8_data.reg, 3)) {}
-
-  void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
-};
-#else
-struct BF16Vec32 : public Vec<BF16Vec32> {
-  constexpr static int VEC_ELEM_NUM = 32;
-
-  __m256i reg_low;
-  __m256i reg_high;
-
-  explicit BF16Vec32(const void *ptr)
-      : reg_low(_mm256_loadu_si256((__m256i const *)ptr)),
-        reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {}
-
-  explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low),
-                                                  reg_high(high) {}
-
-  explicit BF16Vec32(BF16Vec8 &vec8_data)
-      : reg_low((__m256i)_mm256_inserti32x4(
-                _mm256_castsi128_si256((__m128i)vec8_data.reg),
-                                       (__m128i)vec8_data.reg, 1)),
-        reg_high((__m256i)_mm256_inserti32x4(
-                _mm256_castsi128_si256((__m128i)vec8_data.reg),
-                                       (__m128i)vec8_data.reg, 1)) {}
-
-  void save(void *ptr) const {
-    *reinterpret_cast<__m256i *>(ptr) = reg_low;
-    *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high;
-  }
-};
-#endif
-
-struct FP32Vec4 : public Vec<FP32Vec4> {
-  constexpr static int VEC_ELEM_NUM = 4;
-  union AliasReg {
-    __m128 reg;
-    float values[VEC_ELEM_NUM];
-  };
-
-  __m128 reg;
-
-  explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
-
-  explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
-
-  explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
-
-  explicit FP32Vec4(__m128 data) : reg(data) {}
-
-  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
-};
-
-struct FP32Vec8 : public Vec<FP32Vec8> {
-  constexpr static int VEC_ELEM_NUM = 8;
-  union AliasReg {
-    __m256 reg;
-    float values[VEC_ELEM_NUM];
-  };
-
-  __m256 reg;
-
-  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}
-
-  explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}
-
-  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}
-
-  explicit FP32Vec8(__m256 data) : reg(data) {}
-
-  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
-
-#ifdef __AVX512FP16__
-  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
-#endif
-
-  explicit FP32Vec8(const BF16Vec8 &v)
-      : reg(_mm256_castsi256_ps(
-            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}
-
-  float reduce_sum() const {
-    AliasReg ar;
-    ar.reg = reg;
-    float result = 0;
-    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });
-
-    return result;
-  }
-
-  FP32Vec8 exp() const {
-    AliasReg ar;
-    ar.reg = reg;
-    return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
-                                  expf(ar.values[5]), expf(ar.values[4]),
-                                  expf(ar.values[3]), expf(ar.values[2]),
-                                  expf(ar.values[1]), expf(ar.values[0])));
-  }
-
-  FP32Vec8 tanh() const {
-    AliasReg ar;
-    ar.reg = reg;
-    return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
-                                  tanhf(ar.values[5]), tanhf(ar.values[4]),
-                                  tanhf(ar.values[3]), tanhf(ar.values[2]),
-                                  tanhf(ar.values[1]), tanhf(ar.values[0])));
-  }
-
-  FP32Vec8 er() const {
-    AliasReg ar;
-    ar.reg = reg;
-    return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
-                                  erf(ar.values[5]), erf(ar.values[4]),
-                                  erf(ar.values[3]), erf(ar.values[2]),
-                                  erf(ar.values[1]), erf(ar.values[0])));
-  }
-
-  FP32Vec8 operator*(const FP32Vec8 &b) const {
-    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
-  }
-
-  FP32Vec8 operator+(const FP32Vec8 &b) const {
-    return FP32Vec8(_mm256_add_ps(reg, b.reg));
-  }
-
-  FP32Vec8 operator-(const FP32Vec8 &b) const {
-    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
-  }
-
-  FP32Vec8 operator/(const FP32Vec8 &b) const {
-    return FP32Vec8(_mm256_div_ps(reg, b.reg));
-  }
-
-  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
-};
-
-#ifdef __AVX512F__
-struct FP32Vec16 : public Vec<FP32Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-  union AliasReg {
-    __m512 reg;
-    float values[VEC_ELEM_NUM];
-  };
-
-  __m512 reg;
-
-  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}
-
-  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
-
-  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}
-
-  explicit FP32Vec16(__m512 data) : reg(data) {}
-
-  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
-
-  explicit FP32Vec16(const FP32Vec4 &data)
-      : reg((__m512)_mm512_inserti32x4(
-            _mm512_inserti32x4(
-                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
-                                   (__m128i)data.reg, 1),
-                (__m128i)data.reg, 2),
-            (__m128i)data.reg, 3)) {}
-
-  explicit FP32Vec16(const FP32Vec8 &data)
-      : reg((__m512)_mm512_inserti32x8(
-            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}
-
-  explicit FP32Vec16(const BF16Vec16 &v)
-      : reg(_mm512_castsi512_ps(
-            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
-
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
-
-  FP32Vec16 operator*(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
-  }
-
-  FP32Vec16 operator+(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm512_add_ps(reg, b.reg));
-  }
-
-  FP32Vec16 operator-(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
-  }
-
-  FP32Vec16 operator/(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm512_div_ps(reg, b.reg));
-  }
-
-  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
-
-  template <int group_size> float reduce_sub_sum(int idx) {
-    static_assert(VEC_ELEM_NUM % group_size == 0);
-    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
-    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
-    return _mm512_mask_reduce_add_ps(mask, reg);
-  }
-
-  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
-};
-#else
-struct FP32Vec16 : public Vec<FP32Vec16> {
-  constexpr static int VEC_ELEM_NUM = 16;
-
-  union AliasReg {
-    __m256 reg;
-    float values[8];
-  };
-
-  __m256 reg_low;
-  __m256 reg_high;
-
-  explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)),
-                                reg_high(_mm256_set1_ps(v)) {}
-
-  explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)),
-                         reg_high(_mm256_set1_ps(0.0)) {}
-
-  explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)),
-                                         reg_high(_mm256_loadu_ps(ptr + 8)) {}
-
-  explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {}
-
-  explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low),
-                                              reg_high(data.reg_high) {}
-
-  explicit FP32Vec16(const FP32Vec4 &data)
-      : reg_low((__m256)_mm256_inserti128_si256(
-                _mm256_castsi128_si256((__m128i)data.reg),
-                                       (__m128i)data.reg, 1)),
-        reg_high((__m256)_mm256_inserti128_si256(
-                 _mm256_castsi128_si256((__m128i)data.reg),
-                                       (__m128i)data.reg, 1)) {}
-
-  explicit FP32Vec16(const FP32Vec8 &data)
-      : reg_low(data.reg), reg_high(data.reg) {}
-
-  explicit FP32Vec16(const BF16Vec16 &v) {
-    __m128i low = _mm256_extractf128_si256(v.reg, 0);
-    __m128i high = _mm256_extractf128_si256(v.reg, 1);
-
-    __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low);
-    __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high);
-
-    __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2);
-    __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2);
-
-    reg_low = _mm256_castsi256_ps(v_low_shifted);
-    reg_high = _mm256_castsi256_ps(v_high_shifted);
-  }
-
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
-
-  FP32Vec16 operator*(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low),
-                     _mm256_mul_ps(reg_high, b.reg_high));
-  }
-
-  FP32Vec16 operator+(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low),
-                     _mm256_add_ps(reg_high, b.reg_high));
-  }
-
-  FP32Vec16 operator-(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low),
-                     _mm256_sub_ps(reg_high, b.reg_high));
-  }
-
-  FP32Vec16 operator/(const FP32Vec16 &b) const {
-    return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low),
-                     _mm256_div_ps(reg_high, b.reg_high));
-  }
-
-  float reduce_sum() const {
-    FP32Vec8 low = FP32Vec8(reg_low);
-    FP32Vec8 high = FP32Vec8(reg_high);
-    return low.reduce_sum() + high.reduce_sum();
-  }
-
-  template <int group_size> float reduce_sub_sum(int idx) {
-    float sum = 0.0;
-    static_assert(VEC_ELEM_NUM % group_size == 0);
-    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
-    uint32_t mask = base_mask << (idx * group_size);
-
-    AliasReg ar;
-
-    auto func = [&sum, &mask, &ar](int i) {
-      int flag = mask & 0x1;
-      mask = mask >> 1;
-      if (flag != 0) sum += ar.values[i];
-    };
-
-    ar.reg = reg_low;
-    unroll_loop<int, 8>(func);
-
-    ar.reg = reg_high;
-    unroll_loop<int, 8>(func);
-
-    return sum;
-  }
-
-  void save(float *ptr) const {
-    _mm256_storeu_ps(ptr, reg_low);
-    _mm256_storeu_ps(ptr + 8, reg_high);
-  }
-};
-#endif
-
-template <typename T> struct VecType { using vec_type = void; };
-
-template <typename T> using vec_t = typename VecType<T>::vec_type;
-
-template <> struct VecType<float> { using vec_type = FP32Vec8; };
-
-#ifdef __AVX512FP16__
-template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
-#endif
-
-template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
-
-template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
-
-#ifdef __AVX512FP16__
-template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
-  *reinterpret_cast<_Float16 *>(ptr) = v;
-}
-#endif
-
-inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
-  acc = acc + a * b;
-}
-
-#ifdef __AVX512BF16__
-template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
-  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
-}
-
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
-    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}
-
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
-    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}
-
-inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
-  acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
-}
-#else
-template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
-  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
-      reinterpret_cast<c10::BFloat16 *>(&v);
-  *ptr = *(v_ptr + 1);
-}
-
-#ifdef __AVX512F__
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
-    : reg(_mm256_cvtepi32_epi16(
-          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}
-
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
-    : reg(_mm512_cvtepi32_epi16(
-          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
-#else
-namespace{
-__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) {
-  __m256i ai = _mm256_castps_si256(a);
-  ai = _mm256_srli_epi32(ai, 16);
-  ai = _mm256_packus_epi32(ai, ai);
-  ai = _mm256_permute4x64_epi64(ai, 0b00111001);
-  return _mm256_extracti128_si256(ai, 0);
-}
-}
-
-inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
-    : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {}
-
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) {
-  BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low));
-  BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high));
-  reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1);
-}
-#endif // __AVX512F__
-#endif // __AVX512BF16__
-
-inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }
-
-}; // namespace vec_op
-
-#endif
--- a/csrc/cpu/layernorm.cpp
+++ b/csrc/cpu/layernorm.cpp
@ -88,7 +88,7 @@ void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
 }  // namespace

 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-              double epsilon) {
+              float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

@ -102,7 +102,7 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 }

 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
-                        torch::Tensor& weight, double epsilon) {
+                        torch::Tensor& weight, float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@ -168,7 +168,7 @@ void rotary_embedding_gptj_impl(
 };  // namespace

 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      torch::Tensor& key, int64_t head_size,
+                      torch::Tensor& key, int head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
  int num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);
--- a/csrc/cpu/pybind.cpp
+++ b/csrc/cpu/pybind.cpp
@ -0,0 +1,44 @@
+#include "cache.h"
+#include "cuda_utils.h"
+#include "ops.h"
+#include <torch/extension.h>
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // vLLM custom ops
+  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
+
+  // Attention ops
+  ops.def("paged_attention_v1", &paged_attention_v1,
+          "Compute the attention between an input query and the cached "
+          "keys/values using PagedAttention.");
+  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");
+
+  // Activation ops
+  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
+  ops.def("gelu_and_mul", &gelu_and_mul,
+          "Activation function used in GeGLU with `none` approximation.");
+  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
+          "Activation function used in GeGLU with `tanh` approximation.");
+  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
+  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");
+
+  // Layernorm
+  ops.def("rms_norm", &rms_norm,
+          "Apply Root Mean Square (RMS) Normalization to the input tensor.");
+
+  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
+          "In-place fused Add and RMS Normalization");
+
+  // Rotary embedding
+  ops.def("rotary_embedding", &rotary_embedding,
+          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
+
+  // Cache ops
+  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
+  cache_ops.def("swap_blocks", &swap_blocks,
+                "Swap in (out) the cache blocks from src to dst");
+  cache_ops.def("copy_blocks", &copy_blocks,
+                "Copy the cache blocks from src to dst");
+  cache_ops.def("reshape_and_cache", &reshape_and_cache,
+                "Reshape the key and value tensors and cache them");
+}
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -1,110 +0,0 @@
-#include "cache.h"
-#include "ops.h"
-#include "registration.h"
-
-#include <torch/library.h>
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-  // vLLM custom ops
-
-  // Attention ops
-  // Compute the attention between an input query and the cached keys/values
-  // using PagedAttention.
-  ops.def(
-      "paged_attention_v1("
-      "    Tensor! out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
-      "    int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
-
-  // PagedAttention V2.
-  ops.def(
-      "paged_attention_v2("
-      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
-      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
-      "    Tensor value_cache, int num_kv_heads, float scale,"
-      "    Tensor block_tables, Tensor seq_lens, int block_size,"
-      "    int max_seq_len, Tensor? alibi_slopes,"
-      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
-      "    int blocksparse_local_blocks,"
-      "    int blocksparse_vert_stride, int blocksparse_block_size,"
-      "    int blocksparse_head_sliding_step) -> ()");
-  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);
-
-  // Activation ops
-
-  // Activation function used in SwiGLU.
-  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
-  ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul);
-
-  // Activation function used in GeGLU with `none` approximation.
-  ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul);
-
-  // Activation function used in GeGLU with `tanh` approximation.
-  ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul);
-
-  // GELU implementation used in GPT-2.
-  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_new", torch::kCPU, &gelu_new);
-
-  // Approximate GELU implementation.
-  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_fast", torch::kCPU, &gelu_fast);
-
-  // Quick GELU implementation.
-  ops.def("gelu_quick(Tensor! out, Tensor input) -> ()");
-  ops.impl("gelu_quick", torch::kCPU, &gelu_quick);
-
-  // Layernorm
-  // Apply Root Mean Square (RMS) Normalization to the input tensor.
-  ops.def(
-      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
-      "()");
-  ops.impl("rms_norm", torch::kCPU, &rms_norm);
-
-  // In-place fused Add and RMS Normalization.
-  ops.def(
-      "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
-      "float epsilon) -> ()");
-  ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm);
-
-  // Rotary embedding
-  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
-  ops.def(
-      "rotary_embedding(Tensor positions, Tensor! query,"
-      "                 Tensor! key, int head_size,"
-      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
-  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
-}
-
-TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
-  // Cache ops
-  // Swap in (out) the cache blocks from src to dst.
-  cache_ops.def(
-      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
-  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);
-
-  // Copy the cache blocks from src to dst.
-  cache_ops.def(
-      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
-      "block_mapping) -> ()");
-  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
-
-  // Reshape the key and value tensors and cache them.
-  cache_ops.def(
-      "reshape_and_cache(Tensor key, Tensor value,"
-      "                  Tensor! key_cache, Tensor! value_cache,"
-      "                  Tensor slot_mapping,"
-      "                  str kv_cache_dtype,"
-      "                  float kv_scale) -> ()");
-  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
-}
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/cuda_utils.h
+++ b/csrc/cuda_utils.h
@ -1,5 +1,7 @@
 #pragma once

-int64_t get_device_attribute(int64_t attribute, int64_t device_id);
+#include <torch/extension.h>

-int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
+int get_device_attribute(int attribute, int device_id);
+
+int get_max_shared_memory_per_block_device_attribute(int device_id);
--- a/csrc/cuda_utils_kernels.cu
+++ b/csrc/cuda_utils_kernels.cu
@ -2,7 +2,7 @@
  #include <hip/hip_runtime.h>
  #include <hip/hip_runtime_api.h>
 #endif
-int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
+int get_device_attribute(int attribute, int device_id) {
  int device, value;
  if (device_id < 0) {
    cudaGetDevice(&device);
@ -14,8 +14,8 @@ int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
  return value;
 }

-int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) {
-  int64_t attribute;
+int get_max_shared_memory_per_block_device_attribute(int device_id) {
+  int attribute;
  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
  // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74

--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@ -1,17 +1,17 @@
 #include <ATen/cuda/Exceptions.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
-#include <torch/all.h>
+#include <torch/extension.h>

 #include "custom_all_reduce.cuh"

-// fake pointer type, must match fptr_t type in ops.h
-using fptr_t = int64_t;
+// fake pointer type
+using fptr_t = uint64_t;
 static_assert(sizeof(void*) == sizeof(fptr_t));

 fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
-                      const std::vector<int64_t>& offsets, int64_t rank,
+                      const std::vector<int64_t>& offsets, int rank,
                      bool full_nvlink) {
  int world_size = offsets.size();
  if (world_size > 8)
@ -55,7 +55,7 @@ bool _is_weak_contiguous(torch::Tensor& t) {
          t.numel() * t.element_size());
 }

-bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
+bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
                      bool full_nvlink) {
  auto inp_size = inp.numel() * inp.element_size();
  // custom allreduce requires input byte size to be multiples of 16
@ -125,7 +125,7 @@ void dispose(fptr_t _fa) {
  delete fa;
 }

-int64_t meta_size() { return sizeof(vllm::Signal); }
+int meta_size() { return sizeof(vllm::Signal); }

 void register_buffer(fptr_t _fa, torch::Tensor& t,
                     const std::vector<std::string>& handles,
@ -134,16 +134,10 @@ void register_buffer(fptr_t _fa, torch::Tensor& t,
  fa->register_buffer(handles, offsets, t.data_ptr());
 }

-std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
+std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
    fptr_t _fa) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
-  auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
-  auto options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
-  auto handles =
-      torch::empty({static_cast<int64_t>(handle_bytes.size())}, options);
-  std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size());
-  return {handles, std::move(offsets)};
+  return fa->get_graph_buffer_ipc_meta();
 }

 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@ -4,7 +4,7 @@
 */
 #pragma once

-#include <torch/all.h>
+#include <torch/extension.h>

 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

@ -291,7 +291,7 @@ fused_add_rms_norm_kernel(
 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
              torch::Tensor& input,   // [..., hidden_size]
              torch::Tensor& weight,  // [hidden_size]
-              double epsilon) {
+              float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

@ -319,7 +319,7 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
 void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                        torch::Tensor& residual,  // [..., hidden_size]
                        torch::Tensor& weight,    // [hidden_size]
-                        double epsilon) {
+                        float epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

--- a/csrc/moe/moe_ops.cpp
+++ b/csrc/moe/moe_ops.cpp
@ -0,0 +1,8 @@
+#include "moe_ops.h"
+
+#include <torch/extension.h>
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("topk_softmax", &topk_softmax,
+        "Apply topk softmax to the gating outputs.");
+}
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@ -1,6 +1,6 @@
 #pragma once

-#include <torch/all.h>
+#include <torch/extension.h>

 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                  torch::Tensor& token_expert_indices,
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -16,7 +16,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -1,12 +0,0 @@
-#include "registration.h"
-#include "moe_ops.h"
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
-  // Apply topk softmax to the gating outputs.
-  m.def(
-      "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
-      "token_expert_indices, Tensor gating_output) -> ()");
-  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
-}
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/moe_align_block_size_kernels.cu
+++ b/csrc/moe_align_block_size_kernels.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>

 #include <ATen/ATen.h>
@ -108,8 +108,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
 }
 }  // namespace vllm

-void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
-                          int64_t block_size, torch::Tensor sorted_token_ids,
+void moe_align_block_size(torch::Tensor topk_ids, int num_experts,
+                          int block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -1,43 +1,40 @@
 #pragma once

-#include <optional>
-#include <torch/library.h>
+#include <torch/extension.h>

 void paged_attention_v1(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step);
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step);

 void paged_attention_v2(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
-    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
-    const int64_t blocksparse_local_blocks,
-    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step);
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
+    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+    const int blocksparse_block_size, const int blocksparse_head_sliding_step);

 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-              double epsilon);
+              float epsilon);

 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
-                        torch::Tensor& weight, double epsilon);
+                        torch::Tensor& weight, float epsilon);

 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      torch::Tensor& key, int64_t head_size,
+                      torch::Tensor& key, int head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);

 void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                              torch::Tensor& key, int64_t head_size,
+                              torch::Tensor& key, int head_size,
                              torch::Tensor& cos_sin_cache, bool is_neox,
-                              int64_t rot_dim,
+                              int rot_dim,
                              torch::Tensor& cos_sin_cache_offsets);

 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
@ -50,8 +47,6 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input);

 void gelu_fast(torch::Tensor& out, torch::Tensor& input);

-void gelu_quick(torch::Tensor& out, torch::Tensor& input);
-
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                        const torch::Tensor& codebooks,
@ -65,12 +60,12 @@ torch::Tensor aqlm_dequant(const torch::Tensor& codes,

 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                       torch::Tensor _scaling_factors, torch::Tensor _zeros,
-                       int64_t split_k_iters);
+                       int split_k_iters);

 torch::Tensor awq_dequantize(torch::Tensor _kernel,
                             torch::Tensor _scaling_factors,
-                             torch::Tensor _zeros, int64_t split_k_iters,
-                             int64_t thx, int64_t thy);
+                             torch::Tensor _zeros, int split_k_iters, int thx,
+                             int thy);

 torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                          torch::Tensor& b_scales, torch::Tensor& workspace,
@ -93,35 +88,24 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);

-torch::Tensor fp8_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
-                              torch::Tensor& b_scales, torch::Tensor& workspace,
-                              int64_t num_bits, int64_t size_m, int64_t size_n,
-                              int64_t size_k);
-
-bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability);
-
-void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a,
-                       torch::Tensor const& b, torch::Tensor const& a_scales,
-                       torch::Tensor const& b_scales,
-                       c10::optional<torch::Tensor> const& bias);
+int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
+                         torch::Tensor const& b, torch::Tensor const& a_scales,
+                         torch::Tensor const& b_scales);

 #endif

 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                              torch::Tensor const& scale);

-void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
-                               torch::Tensor& scales);
-
 void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
                     torch::Tensor lookup_table);

 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        torch::Tensor b_gptq_qzeros,
                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
-                        bool use_exllama, int64_t bit);
+                        bool use_exllama, int bit);

-void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
+void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit);

 void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                             torch::Tensor& scale);
@ -129,28 +113,28 @@ void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
 void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                              torch::Tensor& scale);

-void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
-                          int64_t block_size, torch::Tensor sorted_token_ids,
+void moe_align_block_size(torch::Tensor topk_ids, int num_experts,
+                          int block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);

 #ifndef USE_ROCM
-using fptr_t = int64_t;
+using fptr_t = uint64_t;
 fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
-                      const std::vector<int64_t>& offsets, int64_t rank,
+                      const std::vector<int64_t>& offsets, int rank,
                      bool full_nvlink);
-bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
+bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
                      bool full_nvlink);
 void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
 void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
                      torch::Tensor& out);
 void dispose(fptr_t _fa);
-int64_t meta_size();
+int meta_size();
 void register_buffer(fptr_t _fa, torch::Tensor& t,
                     const std::vector<std::string>& handles,
                     const std::vector<int64_t>& offsets);
-std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
+std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
    fptr_t _fa);
 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                            const std::vector<std::vector<int64_t>>& offsets);
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>

@ -127,7 +127,7 @@ void rotary_embedding(
                           // [num_tokens, num_heads * head_size]
    torch::Tensor& key,    // [batch_size, seq_len, num_kv_heads * head_size] or
                           // [num_tokens, num_kv_heads * head_size]
-    int64_t head_size,
+    int head_size,
    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
    bool is_neox) {
  int64_t num_tokens = query.numel() / query.size(-1);
@ -138,7 +138,7 @@ void rotary_embedding(
  int64_t key_stride = key.stride(-2);

  dim3 grid(num_tokens);
-  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
+  dim3 block(std::min(num_heads * rot_dim / 2, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
@ -168,9 +168,9 @@ void batched_rotary_embedding(
                           // [num_tokens, num_heads * head_size]
    torch::Tensor& key,    // [batch_size, seq_len, num_kv_heads * head_size] or
                           // [num_tokens, num_kv_heads * head_size]
-    int64_t head_size,
+    int head_size,
    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
-    bool is_neox, int64_t rot_dim,
+    bool is_neox, int rot_dim,
    torch::Tensor& cos_sin_cache_offsets  // [num_tokens]
 ) {
  int64_t num_tokens = cos_sin_cache_offsets.size(0);
@ -180,7 +180,7 @@ void batched_rotary_embedding(
  int64_t key_stride = key.stride(-2);

  dim3 grid(num_tokens);
-  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
+  dim3 block(std::min(num_heads * rot_dim / 2, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@ -16,20 +16,14 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 512) \
    f(in_T, out_T, W_T, narrow, 640) \
    f(in_T, out_T, W_T, narrow, 768) \
-    f(in_T, out_T, W_T, narrow, 896) \
    f(in_T, out_T, W_T, narrow, 1024) \
    f(in_T, out_T, W_T, narrow, 1152) \
-    f(in_T, out_T, W_T, narrow, 1216) \
    f(in_T, out_T, W_T, narrow, 1280) \
    f(in_T, out_T, W_T, narrow, 1536) \
-    f(in_T, out_T, W_T, narrow, 1664) \
    f(in_T, out_T, W_T, narrow, 1728) \
    f(in_T, out_T, W_T, narrow, 1792) \
    f(in_T, out_T, W_T, narrow, 2048) \
-    f(in_T, out_T, W_T, narrow, 2240) \
    f(in_T, out_T, W_T, narrow, 2304) \
-    f(in_T, out_T, W_T, narrow, 2368) \
-    f(in_T, out_T, W_T, narrow, 2432) \
    f(in_T, out_T, W_T, narrow, 2560) \
    f(in_T, out_T, W_T, narrow, 2752) \
    f(in_T, out_T, W_T, narrow, 2816) \
@ -37,47 +31,32 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 3328) \
    f(in_T, out_T, W_T, narrow, 3456) \
    f(in_T, out_T, W_T, narrow, 3584) \
-    f(in_T, out_T, W_T, narrow, 3712) \
    f(in_T, out_T, W_T, narrow, 4096) \
-    f(in_T, out_T, W_T, narrow, 4480) \
    f(in_T, out_T, W_T, narrow, 4608) \
-    f(in_T, out_T, W_T, narrow, 4736) \
-    f(in_T, out_T, W_T, narrow, 4864) \
    f(in_T, out_T, W_T, narrow, 5120) \
    f(in_T, out_T, W_T, narrow, 5504) \
    f(in_T, out_T, W_T, narrow, 5632) \
-    f(in_T, out_T, W_T, narrow, 5888) \
    f(in_T, out_T, W_T, narrow, 6144) \
    f(in_T, out_T, W_T, narrow, 6400) \
    f(in_T, out_T, W_T, narrow, 6848) \
    f(in_T, out_T, W_T, narrow, 6912) \
    f(in_T, out_T, W_T, narrow, 7168) \
-    f(in_T, out_T, W_T, narrow, 7424) \
    f(in_T, out_T, W_T, narrow, 8192) \
-    f(in_T, out_T, W_T, narrow, 8960) \
    f(in_T, out_T, W_T, narrow, 9216) \
-    f(in_T, out_T, W_T, narrow, 9472) \
    f(in_T, out_T, W_T, narrow, 10240) \
    f(in_T, out_T, W_T, narrow, 11008) \
-    f(in_T, out_T, W_T, narrow, 11264) \
    f(in_T, out_T, W_T, narrow, 12288) \
    f(in_T, out_T, W_T, narrow, 13696) \
    f(in_T, out_T, W_T, narrow, 13824) \
    f(in_T, out_T, W_T, narrow, 14336) \
-    f(in_T, out_T, W_T, narrow, 14784) \
-    f(in_T, out_T, W_T, narrow, 14848) \
    f(in_T, out_T, W_T, narrow, 15360) \
    f(in_T, out_T, W_T, narrow, 16384) \
-    f(in_T, out_T, W_T, narrow, 18944) \
    f(in_T, out_T, W_T, narrow, 20480) \
    f(in_T, out_T, W_T, narrow, 22016) \
-    f(in_T, out_T, W_T, narrow, 22528) \
    f(in_T, out_T, W_T, narrow, 24576) \
    f(in_T, out_T, W_T, narrow, 27392) \
    f(in_T, out_T, W_T, narrow, 27648) \
    f(in_T, out_T, W_T, narrow, 28672) \
-    f(in_T, out_T, W_T, narrow, 29568) \
-    f(in_T, out_T, W_T, narrow, 29696) \
    f(in_T, out_T, W_T, narrow, 32000) \
    f(in_T, out_T, W_T, narrow, 32256) \
    f(in_T, out_T, W_T, narrow, 32512) \
@ -86,9 +65,6 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 36864) \
    f(in_T, out_T, W_T, narrow, 43264) \
    f(in_T, out_T, W_T, narrow, 49152) \
-    f(in_T, out_T, W_T, narrow, 49408) \
-    f(in_T, out_T, W_T, narrow, 60544) \
-    f(in_T, out_T, W_T, narrow, 60672) \
    f(in_T, out_T, W_T, narrow, 64000) \
    f(in_T, out_T, W_T, narrow, 64256) \
    f(in_T, out_T, W_T, narrow, 64512) \
@ -98,14 +74,12 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, narrow, 128000) \
    f(in_T, out_T, W_T, narrow, 128256) \
    f(in_T, out_T, W_T, narrow, 128512) \
-    
-    
 // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA
 // and vllm/tests/lora/test_punica.py

-// Used for defining kernels going from the variety of
+// Used for defining kernels going from the variety of 
 // dim in to the narrow dim out
-    // Using it for the fully sharded column
+    // Using it for the fully sharded column 
    // parallel LoRA A which splits the rank dim
 #define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \
    f(in_T, out_T, W_T, 128, narrow) \
@ -113,20 +87,14 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, 512, narrow) \
    f(in_T, out_T, W_T, 640, narrow) \
    f(in_T, out_T, W_T, 768, narrow) \
-    f(in_T, out_T, W_T, 896, narrow) \
    f(in_T, out_T, W_T, 1024, narrow) \
    f(in_T, out_T, W_T, 1152, narrow) \
-    f(in_T, out_T, W_T, 1216, narrow) \
    f(in_T, out_T, W_T, 1280, narrow) \
    f(in_T, out_T, W_T, 1536, narrow) \
-    f(in_T, out_T, W_T, 1664, narrow) \
    f(in_T, out_T, W_T, 1728, narrow) \
    f(in_T, out_T, W_T, 1792, narrow) \
    f(in_T, out_T, W_T, 2048, narrow) \
-    f(in_T, out_T, W_T, 2240, narrow) \
    f(in_T, out_T, W_T, 2304, narrow) \
-    f(in_T, out_T, W_T, 2368, narrow) \
-    f(in_T, out_T, W_T, 2432, narrow) \
    f(in_T, out_T, W_T, 2560, narrow) \
    f(in_T, out_T, W_T, 2752, narrow) \
    f(in_T, out_T, W_T, 2816, narrow) \
@ -134,47 +102,32 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, 3328, narrow) \
    f(in_T, out_T, W_T, 3456, narrow) \
    f(in_T, out_T, W_T, 3584, narrow) \
-    f(in_T, out_T, W_T, 3712, narrow) \
    f(in_T, out_T, W_T, 4096, narrow) \
-    f(in_T, out_T, W_T, 4480, narrow) \
    f(in_T, out_T, W_T, 4608, narrow) \
-    f(in_T, out_T, W_T, 4736, narrow) \
-    f(in_T, out_T, W_T, 4864, narrow) \
    f(in_T, out_T, W_T, 5120, narrow) \
    f(in_T, out_T, W_T, 5504, narrow) \
    f(in_T, out_T, W_T, 5632, narrow) \
-    f(in_T, out_T, W_T, 5888, narrow) \
    f(in_T, out_T, W_T, 6144, narrow) \
    f(in_T, out_T, W_T, 6400, narrow) \
    f(in_T, out_T, W_T, 6848, narrow) \
    f(in_T, out_T, W_T, 6912, narrow) \
    f(in_T, out_T, W_T, 7168, narrow) \
-    f(in_T, out_T, W_T, 7424, narrow) \
    f(in_T, out_T, W_T, 8192, narrow) \
-    f(in_T, out_T, W_T, 8960, narrow) \
    f(in_T, out_T, W_T, 9216, narrow) \
-    f(in_T, out_T, W_T, 9472, narrow) \
    f(in_T, out_T, W_T, 10240, narrow) \
    f(in_T, out_T, W_T, 11008, narrow) \
-    f(in_T, out_T, W_T, 11264, narrow) \
    f(in_T, out_T, W_T, 12288, narrow) \
    f(in_T, out_T, W_T, 13696, narrow) \
    f(in_T, out_T, W_T, 13824, narrow) \
    f(in_T, out_T, W_T, 14336, narrow) \
-    f(in_T, out_T, W_T, 14784, narrow) \
-    f(in_T, out_T, W_T, 14848, narrow) \
    f(in_T, out_T, W_T, 15360, narrow) \
    f(in_T, out_T, W_T, 16384, narrow) \
-    f(in_T, out_T, W_T, 18944, narrow) \
    f(in_T, out_T, W_T, 20480, narrow) \
    f(in_T, out_T, W_T, 22016, narrow) \
-    f(in_T, out_T, W_T, 22528, narrow) \
    f(in_T, out_T, W_T, 24576, narrow) \
    f(in_T, out_T, W_T, 27392, narrow) \
    f(in_T, out_T, W_T, 27648, narrow) \
    f(in_T, out_T, W_T, 28672, narrow) \
-    f(in_T, out_T, W_T, 29568, narrow) \
-    f(in_T, out_T, W_T, 29696, narrow) \
    f(in_T, out_T, W_T, 32000, narrow) \
    f(in_T, out_T, W_T, 32256, narrow) \
    f(in_T, out_T, W_T, 32512, narrow) \
@ -183,9 +136,6 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
    f(in_T, out_T, W_T, 36864, narrow) \
    f(in_T, out_T, W_T, 43264, narrow) \
    f(in_T, out_T, W_T, 49152, narrow) \
-    f(in_T, out_T, W_T, 49408, narrow) \
-    f(in_T, out_T, W_T, 60544, narrow) \
-    f(in_T, out_T, W_T, 60672, narrow) \
    f(in_T, out_T, W_T, 64000, narrow) \
    f(in_T, out_T, W_T, 64256, narrow) \
    f(in_T, out_T, W_T, 64512, narrow) \
--- a/csrc/punica/punica_ops.cu
+++ b/csrc/punica/punica_ops.cu
@ -1,4 +1,4 @@
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <cstdint>

@ -88,7 +88,7 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
 }

 void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
-                   torch::Tensor indicies, int64_t layer_idx, double scale) {
+                   torch::Tensor indicies, int64_t layer_idx, float scale) {
  CHECK_INPUT(y);
  CHECK_INPUT(x);
  CHECK_INPUT(w);
@ -320,7 +320,7 @@ void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,

 void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
                             torch::Tensor indicies, int64_t layer_idx,
-                             double scale, int64_t h_in, int64_t h_out,
+                             float scale, int64_t h_in, int64_t h_out,
                             int64_t y_offset) {
  CHECK_INPUT(y);
  CHECK_INPUT(x);
--- a/csrc/punica/punica_ops.h
+++ b/csrc/punica/punica_ops.h
@ -1,11 +1,11 @@
 #pragma once

-#include <torch/all.h>
+#include <torch/extension.h>

 void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
-                   torch::Tensor indicies, int64_t layer_idx, double scale);
+                   torch::Tensor indicies, int64_t layer_idx, float scale);

 void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
                             torch::Tensor indicies, int64_t layer_idx,
-                             double scale, int64_t h_in, int64_t h_out,
+                             float scale, int64_t h_in, int64_t h_out,
                             int64_t y_offset);
--- a/csrc/punica/punica_pybind.cpp
+++ b/csrc/punica/punica_pybind.cpp
@ -0,0 +1,13 @@
+#include <torch/extension.h>
+
+#include "punica_ops.h"
+
+//====== pybind ======
+
+#define DEFINE_pybind(name) m.def(#name, &name, #name);
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("dispatch_bgmv", &dispatch_bgmv, "dispatch_bgmv");
+  m.def("dispatch_bgmv_low_level", &dispatch_bgmv_low_level,
+        "dispatch_bgmv_low_level");
+}
--- a/csrc/punica/torch_bindings.cpp
+++ b/csrc/punica/torch_bindings.cpp
@ -1,18 +0,0 @@
-#include "registration.h"
-#include "punica_ops.h"
-
-TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
-  m.def(
-      "dispatch_bgmv(Tensor! y, Tensor x, Tensor w, Tensor indicies, int "
-      "layer_idx, float scale) -> ()");
-  m.impl("dispatch_bgmv", torch::kCUDA, &dispatch_bgmv);
-
-  m.def(
-      "dispatch_bgmv_low_level(Tensor! y, Tensor x, Tensor w,"
-      "Tensor indicies, int layer_idx,"
-      "float scale, int h_in, int h_out,"
-      "int y_offset) -> ()");
-  m.impl("dispatch_bgmv_low_level", torch::kCUDA, &dispatch_bgmv_low_level);
-}
-
-REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@ -0,0 +1,111 @@
+#include "cache.h"
+#include "cuda_utils.h"
+#include "ops.h"
+#include <torch/extension.h>
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // vLLM custom ops
+  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
+
+  // Attention ops
+  ops.def("paged_attention_v1", &paged_attention_v1,
+          "Compute the attention between an input query and the cached "
+          "keys/values using PagedAttention.");
+  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");
+
+  // Activation ops
+  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
+  ops.def("gelu_and_mul", &gelu_and_mul,
+          "Activation function used in GeGLU with `none` approximation.");
+  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
+          "Activation function used in GeGLU with `tanh` approximation.");
+  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
+  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");
+
+  // Layernorm
+  ops.def("rms_norm", &rms_norm,
+          "Apply Root Mean Square (RMS) Normalization to the input tensor.");
+
+  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
+          "In-place fused Add and RMS Normalization");
+
+  // Rotary embedding
+  ops.def("rotary_embedding", &rotary_embedding,
+          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
+
+  ops.def("batched_rotary_embedding", &batched_rotary_embedding,
+          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key "
+          "(supports multiple loras)");
+
+// Quantization ops
+#ifndef USE_ROCM
+  ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
+  ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
+  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
+  ops.def("marlin_gemm", &marlin_gemm,
+          "Marlin (Dense) Optimized Quantized GEMM for GPTQ");
+  ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm,
+          "Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ");
+  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm,
+          "gptq_marlin Optimized Quantized GEMM for GPTQ");
+  ops.def("gptq_marlin_repack", &gptq_marlin_repack,
+          "gptq_marlin repack from GPTQ");
+  ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
+  ops.def("cutlass_scaled_mm_dq", &cutlass_scaled_mm_dq,
+          "CUTLASS w8a8 GEMM, supporting symmetric per-tensor or "
+          "per-row/column quantization.");
+#endif
+
+  ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
+  ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
+  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
+  ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant,
+          "Compute FP8 quantized tensor for given scaling factor");
+  ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant,
+          "Compute FP8 quantized tensor and scaling factor");
+  ops.def("moe_align_block_size", &moe_align_block_size,
+          "Aligning the number of tokens to be processed by each expert such "
+          "that it is divisible by the block size.");
+
+  ops.def("static_scaled_int8_quant", &static_scaled_int8_quant,
+          "Compute int8 quantized tensor for given scaling factor");
+
+  // Cache ops
+  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
+  cache_ops.def("swap_blocks", &swap_blocks,
+                "Swap in (out) the cache blocks from src to dst");
+  cache_ops.def("copy_blocks", &copy_blocks,
+                "Copy the cache blocks from src to dst");
+  cache_ops.def("reshape_and_cache", &reshape_and_cache,
+                "Reshape the key and value tensors and cache them");
+  cache_ops.def("reshape_and_cache_flash", &reshape_and_cache_flash,
+                "Reshape the key and value tensors and cache them");
+  cache_ops.def("convert_fp8", &convert_fp8,
+                "Convert the key and value cache to fp8 data type");
+
+  // Cuda utils
+  pybind11::module cuda_utils =
+      m.def_submodule("cuda_utils", "vLLM cuda utils");
+  cuda_utils.def("get_device_attribute", &get_device_attribute,
+                 "Gets the specified device attribute.");
+
+  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
+                 &get_max_shared_memory_per_block_device_attribute,
+                 "Gets the maximum shared memory per block device attribute.");
+
+#ifndef USE_ROCM
+  // Custom all-reduce kernels
+  pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce");
+  custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar");
+  custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar");
+  custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg");
+  custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg");
+  custom_ar.def("dispose", &dispose, "dispose");
+  custom_ar.def("meta_size", &meta_size, "meta_size");
+  custom_ar.def("register_buffer", &register_buffer, "register_buffer");
+  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta,
+                "get_graph_buffer_ipc_meta");
+  custom_ar.def("register_graph_buffers", &register_graph_buffers,
+                "register_graph_buffers");
+#endif
+}
--- a/csrc/quantization/aqlm/gemm_kernels.cu
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
@ -18,7 +18,7 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/cuda/CUDAGuard.h>

--- a/csrc/quantization/awq/gemm_kernels.cu
+++ b/csrc/quantization/awq/gemm_kernels.cu
@ -7,7 +7,7 @@ Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023}
 }
 */

-#include <torch/all.h>
+#include <torch/extension.h>
 #include <c10/cuda/CUDAGuard.h>

 #include "dequantize.cuh"
@ -435,8 +435,8 @@ __global__ void __launch_bounds__(64)

 torch::Tensor awq_dequantize(torch::Tensor _kernel,
                             torch::Tensor _scaling_factors,
-                             torch::Tensor _zeros, int64_t split_k_iters,
-                             int64_t thx, int64_t thy) {
+                             torch::Tensor _zeros, int split_k_iters, int thx,
+                             int thy) {
  int in_c = _kernel.size(0);
  int qout_c = _kernel.size(1);
  int out_c = qout_c * 8;
@ -491,7 +491,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,

 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                       torch::Tensor _scaling_factors, torch::Tensor _zeros,
-                       int64_t split_k_iters) {
+                       int split_k_iters) {
  int num_in_feats = _in_feats.size(0);
  int num_in_channels = _in_feats.size(1);
  const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats));
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@ -1,9 +1,8 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/all.h>
+#include <torch/extension.h>
 #include <cmath>

 #include "../../dispatch_utils.h"
-#include "../../reduction_utils.cuh"

 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
@ -28,48 +27,17 @@ namespace vllm {

 template <typename scalar_t, typename scale_type>
 __global__ void static_scaled_int8_quant_kernel(
-    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
-    scale_type const* scale_ptr, const int hidden_size) {
-  int const tid = threadIdx.x;
-  int const token_idx = blockIdx.x;
-  scale_type const scale = *scale_ptr;
+    const scalar_t* __restrict__ input, int8_t* __restrict__ out,
+    const scale_type* scale_ptr, const int hidden_size) {
+  const int tid = threadIdx.x;
+  const int token_idx = blockIdx.x;
+  scale_type scale = *scale_ptr;

  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    out[token_idx * hidden_size + i] = float_to_int8_rn(
-        static_cast<float>(input[token_idx * hidden_size + i]) / scale);
+    out[token_idx * hidden_size + i] =
+        float_to_int8_rn(((float)input[token_idx * hidden_size + i]) / scale);
  }
 }
-
-template <typename scalar_t, typename scale_type>
-__global__ void dynamic_scaled_int8_quant_kernel(
-    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
-    scale_type* scale, const int hidden_size) {
-  int const tid = threadIdx.x;
-  int const token_idx = blockIdx.x;
-  float absmax_val = 0.0f;
-  float const zero = 0.0f;
-
-  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    float val = static_cast<float>(input[token_idx * hidden_size + i]);
-    val = val > zero ? val : -val;
-    absmax_val = val > absmax_val ? val : absmax_val;
-  }
-
-  float const block_absmax_val_maybe = blockReduceMax(absmax_val);
-  __shared__ float block_absmax_val;
-  if (tid == 0) {
-    block_absmax_val = block_absmax_val_maybe;
-    scale[token_idx] = block_absmax_val / 127.0f;
-  }
-  __syncthreads();
-
-  float const tmp_scale = 127.0f / block_absmax_val;
-  for (int i = tid; i < hidden_size; i += blockDim.x) {
-    out[token_idx * hidden_size + i] = float_to_int8_rn(
-        static_cast<float>(input[token_idx * hidden_size + i]) * tmp_scale);
-  }
-}
-
 }  // namespace vllm

 void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
@ -79,10 +47,10 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(scale.numel() == 1);

-  int const hidden_size = input.size(-1);
-  int const num_tokens = input.numel() / hidden_size;
-  dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, 1024));
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;
+  dim3 grid(num_tokens);
+  dim3 block(std::min(hidden_size, 1024));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
@ -92,24 +60,3 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                                         scale.data_ptr<float>(), hidden_size);
      });
 }
-
-void dynamic_scaled_int8_quant(
-    torch::Tensor& out,          // [..., hidden_size]
-    torch::Tensor const& input,  // [..., hidden_size]
-    torch::Tensor& scales) {
-  TORCH_CHECK(input.is_contiguous());
-  TORCH_CHECK(out.is_contiguous());
-
-  int const hidden_size = input.size(-1);
-  int const num_tokens = input.numel() / hidden_size;
-  dim3 const grid(num_tokens);
-  dim3 const block(std::min(hidden_size, 1024));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(
-      input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
-        vllm::dynamic_scaled_int8_quant_kernel<scalar_t, float>
-            <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
-                                         out.data_ptr<int8_t>(),
-                                         scales.data_ptr<float>(), hidden_size);
-      });
-}
--- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
+++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast {

    CUTLASS_DEVICE void
    begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
-      if (!params.row_broadcast) {
+      if (params.ptr_row == nullptr) {
        return;
      }

--- a/csrc/quantization/cutlass_w8a8/common.hpp
+++ b/csrc/quantization/cutlass_w8a8/common.hpp
@ -1,7 +1,6 @@
 #pragma once

 #include "cutlass/cutlass.h"
-#include <climits>

 /**
 * Helper function for checking CUTLASS errors
@ -11,17 +10,3 @@
    TORCH_CHECK(status == cutlass::Status::kSuccess, \
                cutlassGetStatusString(status))      \
  }
-
-inline uint32_t next_pow_2(uint32_t const num) {
-  if (num <= 1) return num;
-  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
-}
-
-inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
-  int max_shared_mem_per_block_opt_in = 0;
-  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
-                        cudaDevAttrMaxSharedMemoryPerBlockOptin,
-                        device);
-  return max_shared_mem_per_block_opt_in;
-}
-
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu
@ -1,609 +0,0 @@
-#include <stddef.h>
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-
-// clang-format will break include orders
-// clang-format off
-#include "cute/tensor.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/util/device_memory.h"
-
-#include "cutlass/cutlass.h"
-#include "cutlass/gemm_coord.h"
-#include "cutlass/arch/mma_sm75.h"
-#include "cutlass/arch/arch.h"
-#include "cutlass/arch/mma.h"
-#include "cutlass/gemm/device/gemm.h"
-#include "cutlass/gemm/device/gemm_universal_adapter.h"
-
-#include "cutlass/epilogue/threadblock/fusion/visitors.hpp"
-#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h"
-
-#include "broadcast_load_epilogue_c2x.hpp"
-#include "common.hpp"
-// clang-format on
-
-using namespace cute;
-
-/*
-   This file defines quantized GEMM operations using the CUTLASS 2.x API, for
-   NVIDIA GPUs with SM versions prior to sm90 (Hopper).
-
-   Epilogue functions can be defined to post-process the output before it is
-   written to GPU memory.
-   Epilogues must contain a public type named EVTCompute of type Sm80EVT,
-   as well as a static prepare_args function that constructs an
-   EVTCompute::Arguments struct.
-*/
-
-namespace {
-
-// Wrappers for the GEMM kernel that is used to guard against compilation on
-// architectures that will never use the kernel. The purpose of this is to
-// reduce the size of the compiled binary.
-// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
-// into code that will be executed on the device where it is defined.
-template <typename Kernel>
-struct enable_sm75_to_sm80 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE static void invoke(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
-    Kernel::invoke(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm80_to_sm89 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE static void invoke(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
-    Kernel::invoke(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm89_to_sm90 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE static void invoke(Args&&... args) {
-#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
-    Kernel::invoke(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-/*
- * This class provides the common ScaleA and ScaleB descriptors for the
- * ScaledEpilogue and ScaledEpilogueBias classes.
- */
-template <typename ElementD, typename OutputTileThreadMap>
-struct ScaledEpilogueBase {
- protected:
-  using Accum = cutlass::epilogue::threadblock::VisitorAccFetch;
-
-  using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast<
-      OutputTileThreadMap, float, Stride<Int<1>, Int<0>, Int<0>>>;
-
-  using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast<
-      OutputTileThreadMap, float, Stride<Int<0>, Int<1>, Int<0>>>;
-};
-
-/*
- This epilogue function defines a quantized GEMM operation similar to
- torch._scaled_mm.
-
- A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or
- per-row. B can be quantized per-tensor or per-column.
- Any combination of per-tensor and per-row or column is supported.
- A and B must have symmetric quantization (zero point == 0).
-
- So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
- scales are applied elementwise with numpy-style broadcasting.
-
- ScaleA and ScaleB define the epilogue functions that apply the scales for
- the A and B operands respectively. These scales may be either per-tensor or
- per row or column.
-*/
-template <typename ElementD, typename OutputTileThreadMap>
-struct ScaledEpilogue
-    : private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
- private:
-  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
-  using Accum = typename SUPER::Accum;
-  using ScaleA = typename SUPER::ScaleA;
-  using ScaleB = typename SUPER::ScaleB;
-
-  using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiplies, float, float,
-      cutlass::FloatRoundStyle::round_to_nearest>;
-
-  using EVTCompute0 =
-      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
-
-  using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiplies, ElementD, float,
-      cutlass::FloatRoundStyle::round_to_nearest>;
-
- public:
-  using EVTCompute =
-      cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA, EVTCompute0>;
-  using ArgumentType = typename EVTCompute::Arguments;
-
-  static ArgumentType prepare_args(torch::Tensor const& a_scales,
-                                   torch::Tensor const& b_scales) {
-    using ScaleAArgs = typename ScaleA::Arguments;
-    using ScaleBArgs = typename ScaleB::Arguments;
-
-    ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
-    ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
-
-    typename EVTCompute0::Arguments evt0_compute_args{b_args};
-
-    typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args};
-    return evt_compute_args;
-  }
-};
-
-template <typename ElementD, typename OutputTileThreadMap>
-struct ScaledEpilogueBias
-    : private ScaledEpilogueBase<ElementD, OutputTileThreadMap> {
- private:
-  using SUPER = ScaledEpilogueBase<ElementD, OutputTileThreadMap>;
-  using Accum = typename SUPER::Accum;
-  using ScaleA = typename SUPER::ScaleA;
-  using ScaleB = typename SUPER::ScaleB;
-
-  using Compute0 = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiplies, float, float,
-      cutlass::FloatRoundStyle::round_to_nearest>;
-
-  using EVTCompute0 =
-      cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;
-
-  using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
-      cutlass::FloatRoundStyle::round_to_nearest>;
-
-  using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast<
-      OutputTileThreadMap, ElementD, Stride<Int<0>, Int<1>, Int<0>>>;
-
- public:
-  using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT<Compute1, ScaleA,
-                                                             EVTCompute0, Bias>;
-  using ArgumentType = typename EVTCompute::Arguments;
-
-  static ArgumentType prepare_args(torch::Tensor const& a_scales,
-                                   torch::Tensor const& b_scales,
-                                   torch::Tensor const& bias) {
-    using ScaleAArgs = typename ScaleA::Arguments;
-    using ScaleBArgs = typename ScaleB::Arguments;
-    using BiasArgs = typename Bias::Arguments;
-
-    ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
-    ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
-    BiasArgs bias_args{static_cast<ElementD*>(bias.data_ptr()), {}};
-
-    typename EVTCompute0::Arguments evt0_compute_args{b_args};
-
-    typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args,
-                                                    bias_args};
-    return evt_compute_args;
-  }
-};
-
-template <typename Arch, template <typename> typename ArchGuard,
-          typename ElementAB_, typename ElementD_,
-          template <typename, typename> typename Epilogue_, typename TileShape,
-          typename WarpShape, typename InstructionShape, int32_t MainLoopStages>
-struct cutlass_2x_gemm {
-  using ElementAB = ElementAB_;
-  using ElementD = ElementD_;
-
-  using ElementAcc =
-      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
-                                float>::type;
-
-  using Operator =
-      typename std::conditional<std::is_same_v<ElementAB, int8_t>,
-                                cutlass::arch::OpMultiplyAddSaturate,
-                                cutlass::arch::OpMultiplyAdd>::type;
-
-  using OutputTileThreadMap =
-      cutlass::epilogue::threadblock::OutputTileThreadLayout<
-          TileShape, WarpShape, float, 4, 1 /* epilogue stages */
-          >;
-
-  using Epilogue = Epilogue_<ElementD, OutputTileThreadMap>;
-  using EVTCompute = typename Epilogue::EVTCompute;
-
-  using D = cutlass::epilogue::threadblock::VisitorAuxStore<
-      OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest,
-      Stride<int64_t, Int<1>, Int<0>>>;
-
-  using EVTD = cutlass::epilogue::threadblock::Sm80EVT<D, EVTCompute>;
-
-  // clang-format off
-  using RowMajor = typename cutlass::layout::RowMajor;
-  using ColumnMajor = typename cutlass::layout::ColumnMajor;
-  using KernelType =
-    ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
-      ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16,
-      ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16,
-      float, cutlass::layout::RowMajor, 4,
-      ElementAcc, float, cutlass::arch::OpClassTensorOp,
-      Arch,
-      TileShape, WarpShape, InstructionShape,
-      EVTD,
-      cutlass::gemm::threadblock::ThreadblockSwizzleStreamK,
-      MainLoopStages, Operator,
-      1 /* epilogue stages */
-      >::GemmKernel>;
-  // clang-format on
-
-  using Op = cutlass::gemm::device::GemmUniversalAdapter<KernelType>;
-};
-
-template <typename Gemm, typename... EpilogueArgs>
-void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
-                         torch::Tensor const& b,
-                         EpilogueArgs&&... epilogue_params) {
-  using ElementAB = typename Gemm::ElementAB;
-  using ElementD = typename Gemm::ElementD;
-
-  int32_t m = a.size(0);
-  int32_t n = b.size(1);
-  int32_t k = a.size(1);
-  cutlass::gemm::GemmCoord problem_size{m, n, k};
-
-  int64_t lda = a.stride(0);
-  int64_t ldb = b.stride(1);
-  int64_t ldc = out.stride(0);
-
-  using StrideC = Stride<int64_t, Int<1>, Int<0>>;
-  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
-
-  auto a_ptr = static_cast<ElementAB const*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB const*>(b.data_ptr());
-  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
-
-  typename Gemm::D::Arguments d_args{c_ptr, c_stride};
-
-  using Epilogue = typename Gemm::Epilogue;
-  auto evt_args =
-      Epilogue::prepare_args(std::forward<EpilogueArgs>(epilogue_params)...);
-
-  typename Gemm::EVTD::Arguments epilogue_args{
-      evt_args,
-      d_args,
-  };
-
-  typename Gemm::Op::Arguments args{
-      cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel,  // universal mode
-      problem_size,                                           // problem size
-      1,                                                      // batch count
-      epilogue_args,
-      a_ptr,
-      b_ptr,
-      nullptr,
-      nullptr,
-      0,
-      0,
-      0,
-      0,
-      lda,
-      ldb,
-      ldc,
-      ldc};
-
-  // Launch the CUTLASS GEMM kernel.
-  typename Gemm::Op gemm_op;
-  size_t workspace_size = gemm_op.get_workspace_size(args);
-  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
-
-  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
-
-  CUTLASS_CHECK(gemm_op.can_implement(args));
-  cutlass::Status status = gemm_op(args, workspace.get(), stream);
-  CUTLASS_CHECK(status);
-}
-
-template <typename Gemm, typename FallbackGemm, typename... EpilogueArgs>
-void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
-                                  torch::Tensor const& b,
-                                  EpilogueArgs&&... args) {
-  // In some cases, the GPU isn't able to accommodate the
-  // shared memory requirements of the Gemm. In such cases, use
-  // the FallbackGemm instead.
-  static const int max_shared_mem_per_block_opt_in =
-      get_cuda_max_shared_memory_per_block_opt_in(0);
-
-  size_t const gemm_shared_mem_size =
-      sizeof(typename Gemm::KernelType::SharedStorage);
-  size_t const fallback_gemm_shared_mem_size =
-      sizeof(typename FallbackGemm::KernelType::SharedStorage);
-
-  if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) {
-    return cutlass_gemm_caller<Gemm>(out, a, b,
-                                     std::forward<EpilogueArgs>(args)...);
-  } else {
-    TORCH_CHECK(fallback_gemm_shared_mem_size <=
-                max_shared_mem_per_block_opt_in);
-    return cutlass_gemm_caller<FallbackGemm>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
-}
-
-template <typename InType, typename OutType,
-          template <typename, typename> typename Epilogue>
-struct sm80_config_default {
-  // This config is used in 2 cases,
-  //  - M in (128, inf)
-  //  - M in (64, 128] and N >= 8192
-  // Shared Memory required by this Gemm - 81920 bytes
-  static_assert(std::is_same<InType, int8_t>());
-  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
-  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
-  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
-  using Cutlass2xGemm =
-      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
-                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename> typename Epilogue>
-struct sm80_config_M64 {
-  // This config is used in 2 cases,
-  // - M in (32, 64]
-  // - M in (64, 128] and N < 8192
-  // Shared Memory required by this Gemm - 122880 bytes
-  static_assert(std::is_same<InType, int8_t>());
-  using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>;
-  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
-  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
-  using Cutlass2xGemm =
-      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
-                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename> typename Epilogue>
-struct sm80_config_M32 {
-  // M in (16, 32]
-  // Shared Memory required by this Gemm - 61440 bytes
-  static_assert(std::is_same<InType, int8_t>());
-  using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>;
-  using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>;
-  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
-  using Cutlass2xGemm =
-      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
-                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename> typename Epilogue>
-struct sm80_config_M16 {
-  // M in [1, 16]
-  // Shared Memory required by this Gemm - 51200 bytes
-  static_assert(std::is_same<InType, int8_t>());
-  using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>;
-  using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>;
-  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
-  using Cutlass2xGemm =
-      cutlass_2x_gemm<cutlass::arch::Sm80, enable_sm80_to_sm89, InType, OutType,
-                      Epilogue, TileShape, WarpShape, InstructionShape, 5>;
-};
-
-}  // namespace
-
-template <typename InType, typename OutType,
-          template <typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a,
-                                torch::Tensor const& b,
-                                EpilogueArgs&&... args) {
-  static_assert(std::is_same<InType, int8_t>());
-  TORCH_CHECK(a.dtype() == torch::kInt8);
-  TORCH_CHECK(b.dtype() == torch::kInt8);
-
-  using Cutlass2xGemmDefault =
-      typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
-  using Cutlass2xGemmM128BigN =
-      typename sm80_config_default<InType, OutType, Epilogue>::Cutlass2xGemm;
-  using Cutlass2xGemmM128SmallN =
-      typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
-  using Cutlass2xGemmM64 =
-      typename sm80_config_M64<InType, OutType, Epilogue>::Cutlass2xGemm;
-  using Cutlass2xGemmM32 =
-      typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
-  using Cutlass2xGemmM16 =
-      typename sm80_config_M16<InType, OutType, Epilogue>::Cutlass2xGemm;
-
-  // Due to shared memory requirements, some Gemms may fail to run on some
-  // GPUs. As the name indicates, the Fallback Gemm is used as an alternative
-  // in such cases.
-  // sm80_config_M16 has the least shared-memory requirement. However,
-  // based on some profiling, we select sm80_config_M32 as a better alternative
-  // performance wise.
-  using FallbackGemm =
-      typename sm80_config_M32<InType, OutType, Epilogue>::Cutlass2xGemm;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(16), next_pow_2(m));  // next power of 2
-  if (mp2 <= 16) {
-    // M in [1, 16]
-    return fallback_cutlass_gemm_caller<Cutlass2xGemmM16, FallbackGemm>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 32) {
-    // M in (16, 32]
-    return fallback_cutlass_gemm_caller<Cutlass2xGemmM32, FallbackGemm>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 64) {
-    // M in (32, 64]
-    return fallback_cutlass_gemm_caller<Cutlass2xGemmM64, FallbackGemm>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // M in (64, 128]
-    uint32_t const n = out.size(1);
-    bool const small_n = n < 8192;
-    if (small_n) {
-      return fallback_cutlass_gemm_caller<Cutlass2xGemmM128SmallN,
-                                          FallbackGemm>(
-          out, a, b, std::forward<EpilogueArgs>(args)...);
-    } else {
-      return fallback_cutlass_gemm_caller<Cutlass2xGemmM128BigN, FallbackGemm>(
-          out, a, b, std::forward<EpilogueArgs>(args)...);
-    }
-  } else {
-    // M in (128, inf)
-    return fallback_cutlass_gemm_caller<Cutlass2xGemmDefault, FallbackGemm>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
-}
-
-template <template <typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     EpilogueArgs&&... epilogue_args) {
-  TORCH_CHECK(a.dtype() == torch::kInt8);
-  TORCH_CHECK(b.dtype() == torch::kInt8);
-
-  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
-  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
-  using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
-
-  if (out.dtype() == torch::kBFloat16) {
-    return cutlass_gemm_caller<cutlass_2x_gemm<
-        cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
-        Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-  } else {
-    TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_gemm_caller<cutlass_2x_gemm<
-        cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
-        Epilogue, TileShape, WarpShape, InstructionShape, 2>>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-  }
-}
-
-void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a,
-                            torch::Tensor const& b,
-                            torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias) {
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-  if (bias) {
-    TORCH_CHECK(bias->dtype() == out.dtype(),
-                "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogueBias>(
-        out, a, b, a_scales, b_scales, *bias);
-  } else {
-    return cutlass_scaled_mm_sm75_epilogue<ScaledEpilogue>(out, a, b, a_scales,
-                                                           b_scales);
-  }
-}
-
-template <template <typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_scaled_mm_sm80_epilogue(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     EpilogueArgs&&... epilogue_args) {
-  TORCH_CHECK(a.dtype() == torch::kInt8);
-  TORCH_CHECK(b.dtype() == torch::kInt8);
-
-  if (out.dtype() == torch::kBFloat16) {
-    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::bfloat16_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-  } else {
-    TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_gemm_sm80_dispatch<int8_t, cutlass::half_t, Epilogue>(
-        out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-  }
-}
-
-void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a,
-                            torch::Tensor const& b,
-                            torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias) {
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-  if (bias) {
-    TORCH_CHECK(bias->dtype() == out.dtype(),
-                "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogueBias>(
-        out, a, b, a_scales, b_scales, *bias);
-  } else {
-    return cutlass_scaled_mm_sm80_epilogue<ScaledEpilogue>(out, a, b, a_scales,
-                                                           b_scales);
-  }
-}
-
-template <template <typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     EpilogueArgs&&... epilogue_args) {
-  using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>;
-  using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>;
-  using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
-
-  if (a.dtype() == torch::kInt8) {
-    TORCH_CHECK(b.dtype() == torch::kInt8);
-
-    if (out.dtype() == torch::kBFloat16) {
-      return cutlass_gemm_caller<cutlass_2x_gemm<
-          cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
-          Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    } else {
-      assert(out.dtype() == torch::kFloat16);
-      return cutlass_gemm_caller<cutlass_2x_gemm<
-          cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
-          Epilogue, TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    }
-  } else {
-    TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
-    TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
-
-    if (out.dtype() == torch::kBFloat16) {
-      return cutlass_gemm_caller<
-          cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
-                          cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue,
-                          TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    } else {
-      TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_gemm_caller<
-          cutlass_2x_gemm<cutlass::arch::Sm89, enable_sm89_to_sm90,
-                          cutlass::float_e4m3_t, cutlass::half_t, Epilogue,
-                          TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    }
-  }
-}
-
-void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a,
-                            torch::Tensor const& b,
-                            torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias) {
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-  if (bias) {
-    TORCH_CHECK(bias->dtype() == out.dtype(),
-                "currently bias dtype must match output dtype ", out.dtype());
-    return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogueBias>(
-        out, a, b, a_scales, b_scales, *bias);
-  } else {
-    return cutlass_scaled_mm_sm89_epilogue<ScaledEpilogue>(out, a, b, a_scales,
-                                                           b_scales);
-  }
-}
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu
@ -1,557 +0,0 @@
-// clang-format will break include orders
-// clang-format off
-#include <cudaTypedefs.h>
-
-#if defined CUDA_VERSION && CUDA_VERSION >= 12000
-
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-
-#include <iostream>
-#include <sstream>
-#include <vector>
-
-#include "cutlass/cutlass.h"
-
-#include "cute/tensor.hpp"
-#include "cute/atom/mma_atom.hpp"
-#include "cutlass/numeric_types.h"
-
-#include "cutlass/util/device_memory.h"
-
-#include "cutlass/gemm/device/gemm_universal_adapter.h"
-#include "cutlass/gemm/kernel/gemm_universal.hpp"
-#include "cutlass/epilogue/collective/collective_builder.hpp"
-#include "cutlass/gemm/collective/collective_builder.hpp"
-
-#include "broadcast_load_epilogue_c3x.hpp"
-#include "common.hpp"
-// clang-format on
-
-using namespace cute;
-
-/*
-   This file defines quantized GEMM operations using the CUTLASS 3.x API, for
-   NVIDIA GPUs with sm90a (Hopper) or later.
-
-   Epilogue functions can be defined to post-process the output before it is
-   written to GPU memory.
-   Epilogues must contain a public type named EVTCompute of type Sm90EVT,
-   as well as a static prepare_args function that constructs an
-   EVTCompute::Arguments struct.
-*/
-
-namespace {
-
-// A wrapper for the GEMM kernel that is used to guard against compilation on
-// architectures that will never use the kernel. The purpose of this is to
-// reduce the size of the compiled binary.
-// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
-// into code that will be executed on the device where it is defined.
-template <typename Kernel>
-struct enable_sm90_or_later : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-  #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
-    Kernel::operator()(std::forward<Args>(args)...);
-  #endif
-  }
-};
-
-/*
- * This class provides the common ScaleA and ScaleB descriptors for the
- * ScaledEpilogue and ScaledEpilogueBias classes.
- */
-template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
-struct ScaledEpilogueBase {
- protected:
-  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
-
-  using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
-      0 /*Stages*/, typename EpilogueDescriptor::TileShape, float,
-      Stride<Int<1>, Int<0>, Int<0>>>;
-
-  using ScaleBDescriptor =
-      cutlass::epilogue::collective::detail::RowBroadcastDescriptor<
-          EpilogueDescriptor, float>;
-
-  using ScaleB = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
-      ScaleBDescriptor::Stages, typename EpilogueDescriptor::TileShape,
-      typename ScaleBDescriptor::Element, Stride<Int<0>, Int<1>, Int<0>>>;
-};
-
-/*
-   This epilogue function defines a quantized GEMM operation similar to
-   torch.scaled_mm_.
-
-   A and B may be both either int8 or fp8_e4m3. A can be
-   quantized per-tensor or per-row. B can be quantized per-tensor or per-column.
-   Any combination of per-tensor and per-row or column is supported.
-   A and B must have symmetric quantization (zero point == 0).
-
-   So the GEMM operation is D = (a_scales * A) (b_scales * B), where the
-   scales are applied elementwise with numpy-style broadcasting.
-
-   ScaleA and ScaleB define the epilogue functions that apply the scales for
-   the A and B operands respectively. These scales may be either per-tensor or
-   per row or column.
-*/
-template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
-struct ScaledEpilogue
-    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
- private:
-  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
-  using Accum = typename SUPER::Accum;
-  using ScaleA = typename SUPER::ScaleA;
-  using ScaleB = typename SUPER::ScaleB;
-
-  using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiplies, float, float,
-      cutlass::FloatRoundStyle::round_to_nearest>;
-
-  using EVTCompute0 =
-      cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
-
-  using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiplies, ElementD, float,
-      cutlass::FloatRoundStyle::round_to_nearest>;
-
- public:
-  using EVTCompute =
-      cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0>;
-  using ArgumentType = typename EVTCompute::Arguments;
-
-  static ArgumentType prepare_args(torch::Tensor const& a_scales,
-                                   torch::Tensor const& b_scales) {
-    using ScaleA_Args = typename ScaleA::Arguments;
-    using ScaleB_Args = typename ScaleB::Arguments;
-
-    ScaleA_Args a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
-    ScaleB_Args b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
-
-    return ArgumentType{a_args, {b_args}};
-  }
-};
-
-template <typename ElementAcc, typename ElementD, typename EpilogueDescriptor>
-struct ScaledEpilogueBias
-    : private ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor> {
- private:
-  using SUPER = ScaledEpilogueBase<ElementAcc, ElementD, EpilogueDescriptor>;
-  using Accum = typename SUPER::Accum;
-  using ScaleA = typename SUPER::ScaleA;
-  using ScaleB = typename SUPER::ScaleB;
-
-  using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiplies, float, float,
-      cutlass::FloatRoundStyle::round_to_nearest>;
-
-  using EVTCompute0 =
-      cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;
-
-  using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
-      cutlass::FloatRoundStyle::round_to_nearest>;
-
-  using BiasDescriptor =
-      cutlass::epilogue::collective::detail::RowBroadcastDescriptor<
-          EpilogueDescriptor, ElementD>;
-
-  using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<
-      BiasDescriptor::Stages, typename EpilogueDescriptor::TileShape, ElementD,
-      Stride<Int<0>, Int<1>, Int<0>>, 128 / sizeof_bits_v<ElementD>, false>;
-
- public:
-  using EVTCompute =
-      cutlass::epilogue::fusion::Sm90EVT<Compute1, ScaleA, EVTCompute0, Bias>;
-  using ArgumentType = typename EVTCompute::Arguments;
-
-  static ArgumentType prepare_args(torch::Tensor const& a_scales,
-                                   torch::Tensor const& b_scales,
-                                   torch::Tensor const& bias) {
-    using ScaleA_Args = typename ScaleA::Arguments;
-    using ScaleB_Args = typename ScaleB::Arguments;
-    using Bias_Args = typename Bias::Arguments;
-
-    ScaleA_Args a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
-    ScaleB_Args b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
-    Bias_Args bias_args{static_cast<ElementD*>(bias.data_ptr())};
-
-    return ArgumentType{a_args, {b_args}, bias_args};
-  }
-};
-
-template <typename ElementAB_, typename ElementD_,
-          template <typename, typename, typename> typename Epilogue_,
-          typename TileShape, typename ClusterShape, typename KernelSchedule,
-          typename EpilogueSchedule>
-struct cutlass_3x_gemm {
-  using ElementAB = ElementAB_;
-  using ElementD = ElementD_;
-  using ElementAcc =
-      typename std::conditional<std::is_same_v<ElementAB, int8_t>, int32_t,
-                                float>::type;
-
-  using EpilogueDescriptor =
-      cutlass::epilogue::collective::detail::EpilogueDescriptor<
-          TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD,
-          ElementD, EpilogueSchedule>;
-
-  using Epilogue = Epilogue_<ElementAcc, ElementD, EpilogueDescriptor>;
-
-  using StrideD = Stride<int64_t, Int<1>, Int<0>>;
-  using ElementC = void;
-  using StrideC = StrideD;
-
-  using EVTCompute = typename Epilogue::EVTCompute;
-
-  using CollectiveEpilogue =
-      typename cutlass::epilogue::collective::CollectiveBuilder<
-          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, TileShape,
-          ClusterShape, cutlass::epilogue::collective::EpilogueTileAuto,
-          ElementAcc, float, ElementC, StrideC, 4, ElementD, StrideD, 4,
-          EpilogueSchedule, EVTCompute>::CollectiveOp;
-
-  static constexpr size_t CEStorageSize =
-      sizeof(typename CollectiveEpilogue::SharedStorage);
-  using Stages = typename cutlass::gemm::collective::StageCountAutoCarveout<
-      static_cast<int>(CEStorageSize)>;
-
-  // clang-format off
-  using CollectiveMainloop =
-      typename cutlass::gemm::collective::CollectiveBuilder<
-          cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, 
-          ElementAB, cutlass::layout::RowMajor, 16, 
-          ElementAB, cutlass::layout::ColumnMajor, 16, 
-          ElementAcc, TileShape, ClusterShape,
-          Stages,
-          KernelSchedule>::CollectiveOp;
-  // clang-format on
-
-  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
-      cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
-      cutlass::gemm::PersistentScheduler>>;
-
-  struct GemmKernel : public KernelType {};
-};
-
-template <typename Gemm, typename... EpilogueArgs>
-void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a,
-                         torch::Tensor const& b,
-                         EpilogueArgs&&... epilogue_params) {
-  using ElementAB = typename Gemm::ElementAB;
-  using ElementD = typename Gemm::ElementD;
-
-  int32_t m = a.size(0);
-  int32_t n = b.size(1);
-  int32_t k = a.size(1);
-
-  int64_t lda = a.stride(0);
-  int64_t ldb = b.stride(1);
-  int64_t ldc = out.stride(0);
-
-  using StrideA = Stride<int64_t, Int<1>, Int<0>>;
-  using StrideB = Stride<int64_t, Int<1>, Int<0>>;
-  using StrideC = typename Gemm::StrideC;
-
-  StrideA a_stride{lda, Int<1>{}, Int<0>{}};
-  StrideB b_stride{ldb, Int<1>{}, Int<0>{}};
-  StrideC c_stride{ldc, Int<1>{}, Int<0>{}};
-
-  using GemmKernel = typename Gemm::GemmKernel;
-  typename GemmKernel::ProblemShape prob_shape{m, n, k, 1};
-
-  auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
-  auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
-  typename GemmKernel::MainloopArguments mainloop_args{a_ptr, a_stride, b_ptr,
-                                                       b_stride};
-
-  auto c_ptr = static_cast<ElementD*>(out.data_ptr());
-  typename GemmKernel::EpilogueArguments epilogue_args{
-      Gemm::Epilogue::prepare_args(
-          std::forward<EpilogueArgs>(epilogue_params)...),
-      c_ptr, c_stride, c_ptr, c_stride};
-
-  typename GemmKernel::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm,
-                                      prob_shape, mainloop_args, epilogue_args};
-
-  // Launch the CUTLASS GEMM kernel.
-  using GemmOp = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
-  GemmOp gemm_op;
-  CUTLASS_CHECK(gemm_op.can_implement(args));
-
-  size_t workspace_size = gemm_op.get_workspace_size(args);
-  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
-
-  auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
-
-  cutlass::Status status = gemm_op.run(args, workspace.get(), stream);
-  CUTLASS_CHECK(status);
-}
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_default {
-  // M in (128, inf)
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M128 {
-  // M in (64, 128]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M64 {
-  // M in [1, 64]
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  using KernelSchedule =
-      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _64, _128>;
-  using ClusterShape = Shape<_1, _8, _1>;
-
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_default {
-  // For M > 128 and any N
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule =
-      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_128, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_M128 {
-  // For M in (64, 128] and any N
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule =
-      typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _128, _128>;
-  using ClusterShape = Shape<_2, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_M64 {
-  // For M in (32, 64] and any N
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _64, _256>;
-  using ClusterShape = Shape<_1, _1, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_M32_NBig {
-  // For M in [1, 32] and N >= 8192
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _128, _256>;
-  using ClusterShape = Shape<_1, _4, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue>
-struct sm90_int8_config_M32_NSmall {
-  // For M in [1, 32] and N < 8192
-  static_assert(std::is_same<InType, int8_t>());
-  using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
-  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
-  using TileShape = Shape<_64, _64, _256>;
-  using ClusterShape = Shape<_1, _8, _1>;
-  using Cutlass3xGemm =
-      cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                      KernelSchedule, EpilogueSchedule>;
-};
-
-}  // namespace
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_gemm_sm90_fp8_dispatch(torch::Tensor& out, torch::Tensor const& a,
-                                    torch::Tensor const& b,
-                                    EpilogueArgs&&... args) {
-  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
-  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
-  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
-
-  using Cutlass3xGemmDefault =
-      typename sm90_fp8_config_default<InType, OutType,
-                                       Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm90_fp8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm90_fp8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
-
-  if (mp2 <= 64) {
-    // m in [1, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else {
-    // m in (128, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
-}
-
-template <typename InType, typename OutType,
-          template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     EpilogueArgs&&... args) {
-  static_assert(std::is_same<InType, int8_t>());
-  TORCH_CHECK(a.dtype() == torch::kInt8);
-  TORCH_CHECK(b.dtype() == torch::kInt8);
-
-  using Cutlass3xGemmDefault =
-      typename sm90_int8_config_default<InType, OutType,
-                                        Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM128 =
-      typename sm90_int8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM64 =
-      typename sm90_int8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM32NBig =
-      typename sm90_int8_config_M32_NBig<InType, OutType,
-                                         Epilogue>::Cutlass3xGemm;
-  using Cutlass3xGemmM32NSmall =
-      typename sm90_int8_config_M32_NSmall<InType, OutType,
-                                           Epilogue>::Cutlass3xGemm;
-
-  uint32_t const n = out.size(1);
-  bool const is_small_n = n < 8192;
-
-  uint32_t const m = a.size(0);
-  uint32_t const mp2 =
-      std::max(static_cast<uint32_t>(32), next_pow_2(m));  // next power of 2
-
-  if (mp2 <= 32) {
-    // m in [1, 32]
-    if (is_small_n) {
-      return cutlass_gemm_caller<Cutlass3xGemmM32NSmall>(
-          out, a, b, std::forward<EpilogueArgs>(args)...);
-    } else {
-      return cutlass_gemm_caller<Cutlass3xGemmM32NBig>(
-          out, a, b, std::forward<EpilogueArgs>(args)...);
-    }
-  } else if (mp2 <= 64) {
-    // m in (32, 64]
-    return cutlass_gemm_caller<Cutlass3xGemmM64>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else if (mp2 <= 128) {
-    // m in (64, 128]
-    return cutlass_gemm_caller<Cutlass3xGemmM128>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  } else {
-    // m in (128, inf)
-    return cutlass_gemm_caller<Cutlass3xGemmDefault>(
-        out, a, b, std::forward<EpilogueArgs>(args)...);
-  }
-}
-
-template <template <typename, typename, typename> typename Epilogue,
-          typename... EpilogueArgs>
-void cutlass_scaled_mm_sm90_epilogue(torch::Tensor& out, torch::Tensor const& a,
-                                     torch::Tensor const& b,
-                                     EpilogueArgs&&... epilogue_args) {
-  if (a.dtype() == torch::kInt8) {
-    TORCH_CHECK(b.dtype() == torch::kInt8);
-
-    if (out.dtype() == torch::kBFloat16) {
-      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::bfloat16_t,
-                                             Epilogue>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    } else {
-      TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_gemm_sm90_int8_dispatch<int8_t, cutlass::half_t, Epilogue>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    }
-  } else {
-    TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
-    TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
-
-    if (out.dtype() == torch::kBFloat16) {
-      return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                            cutlass::bfloat16_t, Epilogue>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    } else {
-      TORCH_CHECK(out.dtype() == torch::kFloat16);
-      return cutlass_gemm_sm90_fp8_dispatch<cutlass::float_e4m3_t,
-                                            cutlass::half_t, Epilogue>(
-          out, a, b, std::forward<EpilogueArgs>(epilogue_args)...);
-    }
-  }
-}
-
-void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a,
-                            torch::Tensor const& b,
-                            torch::Tensor const& a_scales,
-                            torch::Tensor const& b_scales,
-                            c10::optional<torch::Tensor> const& bias) {
-  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
-  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
-  if (bias) {
-    TORCH_CHECK(bias->dtype() == c.dtype(),
-                "currently bias dtype must match output dtype ", c.dtype());
-    return cutlass_scaled_mm_sm90_epilogue<ScaledEpilogueBias>(
-        c, a, b, a_scales, b_scales, *bias);
-  } else {
-    return cutlass_scaled_mm_sm90_epilogue<ScaledEpilogue>(c, a, b, a_scales,
-                                                           b_scales);
-  }
-}
-
-#endif
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Zhuohan Li	d5bf492f16	Merge branch 'main' into optimize-prefix-caching-scheduling	2024-06-04 00:20:15 +00:00
Zhuohan Li	8c7bab79f5	simplify code	2024-06-03 03:36:38 +00:00
Zhuohan Li	1936d7bab0	format	2024-06-02 00:02:54 +00:00
Zhuohan Li	996cf2de5c	Fix hashing logic for non-full blocks	2024-06-02 00:01:30 +00:00