updated

Signed-off-by: Robert Shaw <robshaw@redhat.com>
2025-10-21 07:13:52 +08:00 · 2025-08-19 17:05:49 +00:00
1231 changed files with 36198 additions and 79263 deletions
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 800 MiB quota, please use it wisely.
+# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/6326 .
+# See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
 def print_top_10_largest_files(zip_file):
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@ -8,8 +8,7 @@ template = """<!DOCTYPE html>
 <html>
    <body>
    <h1>Links for vLLM</h1/>
-        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
    </body>
 </html>
 """
@ -22,25 +21,7 @@ filename = os.path.basename(args.wheel)
 with open("index.html", "w") as f:
    print(f"Generated index.html for {args.wheel}")
    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
    if "x86_64" in filename:
        x86_wheel = filename
        arm_wheel = filename.replace("x86_64", "aarch64").replace(
            "manylinux1", "manylinux2014"
        )
    elif "aarch64" in filename:
        x86_wheel = filename.replace("aarch64", "x86_64").replace(
            "manylinux2014", "manylinux1"
        )
        arm_wheel = filename
    else:
        raise ValueError(f"Unsupported wheel: {filename}")
    # cloudfront requires escaping the '+' character
    f.write(
-        template.format(
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
            x86_wheel=x86_wheel,
            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
            arm_wheel=arm_wheel,
            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
        )
    )
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
@ -0,0 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
 - name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.419
  - name: "exact_match,flexible-extract"
    value: 0.416
 limit: 1000
 num_fewshot: 5
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
 Meta-Llama-3-8B-QQQ.yaml
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install lm-eval==0.4.4
 usage() {
    echo``
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+#   pip install lm-eval==0.4.4
 usage() {
    echo``
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a
 `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.  
 If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
+Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
 `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
 |   | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps  | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio        |
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@ -17,7 +17,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
    - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
    - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
    - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
+        - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
    - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
 - Hardware
    - 8x Nvidia A100 GPUs
--- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py
@ -3,129 +3,44 @@
 import argparse
 import json
 import os
 from importlib import util
 import pandas as pd
 plotly_found = util.find_spec("plotly.express") is not None
 def compare_data_columns(
    files, name_column, data_column, info_cols, drop_column, debug=False
 ):
-    """
+    print("\ncompare_data_column: " + data_column)
    Align concatenation by keys derived from info_cols instead of row order.
    - Pick one canonical key list: subset of info_cols present in ALL files.
    - For each file: set index to those keys, aggregate duplicates
    - (mean for metric, first for names).
    - Concat along axis=1 (indexes align), then reset_index so callers can
    - group by columns.
    - If --debug, add a <file_label>_name column per file.
    """
    print("\ncompare_data_column:", data_column)
    frames = []
    raw_data_cols = []
    compare_frames = []
    # 1) choose a canonical key list from info_cols that exists in ALL files
    cols_per_file = []
    for f in files:
        try:
            df_tmp = pd.read_json(f, orient="records")
        except Exception as err:
            raise ValueError(f"Failed to read {f}") from err
        cols_per_file.append(set(df_tmp.columns))
    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
    if not key_cols:
        # soft fallback: use any info_cols present in the first file
        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
    if not key_cols:
        raise ValueError(
            "No common key columns found from info_cols across the input files."
        )
    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
    meta_added = False
    for file in files:
-        df = pd.read_json(file, orient="records")
+        data_df = pd.read_json(file)
        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
        # Show all info columns in the first couple columns
        if not frames:
            for col in info_cols:
                if col not in serving_df.columns:
                    print(f"Skipping missing column: {col}")
                    continue
                frames.append(serving_df[col])
        # only show test name under debug mode
        if debug is True:
            serving_df = serving_df.rename(columns={name_column: file + "_name"})
            frames.append(serving_df[file + "_name"])
-        # Keep rows that actually have the compared metric (same as original behavior)
+        file = "/".join(file.split("/")[:-1])
-        if drop_column in df.columns:
+        serving_df = serving_df.rename(columns={data_column: file})
-            df = df.dropna(subset=[drop_column], ignore_index=True)
+        frames.append(serving_df[file])
-
+        raw_data_cols.append(file)
-        # Stabilize numeric key columns (harmless if missing)
+        compare_frames.append(serving_df[file])
        for c in (
            "Input Len",
            "Output Len",
            "TP Size",
            "PP Size",
            "# of max concurrency.",
            "qps",
        ):
            if c in df.columns:
                df[c] = pd.to_numeric(df[c], errors="coerce")
        # Ensure all key columns exist
        for c in key_cols:
            if c not in df.columns:
                df[c] = pd.NA
        # Set index = key_cols and aggregate duplicates → unique MultiIndex
        df_idx = df.set_index(key_cols, drop=False)
        # meta (key columns), unique per key
        meta = df_idx[key_cols]
        if not meta.index.is_unique:
            meta = meta.groupby(level=key_cols, dropna=False).first()
        # metric series for this file, aggregated to one row per key
        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
        s = df_idx[data_column]
        if not s.index.is_unique:
            s = s.groupby(level=key_cols, dropna=False).mean()
        s.name = file_label  # column label like original
        # add meta once (from first file) so keys are the leftmost columns
        if not meta_added:
            frames.append(meta)
            meta_added = True
        # (NEW) debug: aligned test-name column per file
        if debug and name_column in df_idx.columns:
            name_s = df_idx[name_column]
            if not name_s.index.is_unique:
                name_s = name_s.groupby(level=key_cols, dropna=False).first()
            name_s.name = f"{file_label}_name"
            frames.append(name_s)
        frames.append(s)
        raw_data_cols.append(file_label)
        compare_frames.append(s)
        # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
        if len(compare_frames) >= 2:
-            base = compare_frames[0]
+            # Compare numbers among two files
-            current = compare_frames[-1]
+            ratio_df = compare_frames[1] / compare_frames[0]
-            ratio = current / base
+            frames.append(ratio_df)
-            ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
+            compare_frames.pop(1)
            ratio.name = f"Ratio 1 vs {len(compare_frames)}"
            frames.append(ratio)
    # 4) concat on columns with aligned MultiIndex;
    # then reset_index to return keys as columns
    concat_df = pd.concat(frames, axis=1)
    concat_df = concat_df.reset_index(drop=True).reset_index()
    if "index" in concat_df.columns:
        concat_df = concat_df.drop(columns=["index"])
    # Ensure key/info columns appear first (in your info_cols order)
    front = [c for c in info_cols if c in concat_df.columns]
    rest = [c for c in concat_df.columns if c not in front]
    concat_df = concat_df[front + rest]
    print(raw_data_cols)
    return concat_df, raw_data_cols
@ -152,15 +67,6 @@ def split_json_by_tp_pp(
    df = pd.DataFrame(data)
    # Keep only "serving" tests
    name_col = next(
        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
    )
    if name_col:
        df = df[
            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
        ].copy()
    # Handle alias column names
    rename_map = {
        "tp_size": "TP Size",
@ -218,7 +124,7 @@ if __name__ == "__main__":
        "--xaxis",
        type=str,
        default="# of max concurrency.",
-        help="column name to use as X Axis in comparison graph",
+        help="column name to use as X Axis in comparision graph",
    )
    args = parser.parse_args()
@ -275,6 +181,7 @@ if __name__ == "__main__":
                    f"Expected subset: {filtered_info_cols}, "
                    f"but DataFrame has: {list(output_df.columns)}"
                )
            output_df_sorted = output_df.sort_values(by=existing_group_cols)
            output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
            for name, group in output_groups:
@ -282,7 +189,8 @@ if __name__ == "__main__":
                text_file.write(html_msgs_for_data_cols[i])
                text_file.write(html)
-                if plot and plotly_found:
+                if plot is True:
                    import pandas as pd
                    import plotly.express as px
                    df = group[raw_data_cols]
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@ -382,7 +382,7 @@ run_genai_perf_tests() {
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
-        --backend "$backend" \
+        --backend vllm \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
@ -1,6 +1,6 @@
 [
    {
-        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
+        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -32,7 +32,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -64,7 +64,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
+        "test_name": "serving_llama8B_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -96,7 +96,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
+        "test_name": "serving_llama8B_tp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -131,7 +131,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+        "test_name": "serving_llama8B_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -166,7 +166,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
+        "test_name": "serving_llama8B_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -198,413 +198,5 @@
 	    "random-output-len": 128,
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp4_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp4_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 4,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    }
 ]
--- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
@ -1,6 +1,6 @@
 [
    {
-        "test_name": "serving_llama8B_bf16_pp1_sharegpt",
+        "test_name": "serving_llama8B_pp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -32,39 +32,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+        "test_name": "serving_llama8B_pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_bf16_pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -96,7 +64,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
+        "test_name": "serving_llama8B_tp2pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
@ -129,7 +97,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_pp1_random_128_128",
+        "test_name": "serving_llama8B_pp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -164,42 +132,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+        "test_name": "serving_llama8B_pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_bf16_pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -234,7 +167,7 @@
        }
    },
    {
-        "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
+        "test_name": "serving_llama8B_tp2pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
@ -268,553 +201,5 @@
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_pp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int8_pp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_pp1_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_int4_pp1_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "pipeline_parallel_size": 1,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    },
    {
        "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
        "qps_list": ["inf"],
        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
 	    "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
 	    "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
 	    "VLLM_CPU_SGL_KERNEL": 1,
 	    "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
 	    "quantization": "awq",
            "tensor_parallel_size": 2,
            "pipeline_parallel_size": 3,
 	    "dtype": "bfloat16",
 	    "distributed_executor_backend": "mp",
 	    "block_size": 128,
 	    "trust_remote_code": "",
 	    "enable_chunked_prefill": "",
            "disable_log_stats": "",
 	    "enforce_eager": "",
 	    "max_num_batched_tokens": 2048,
 	    "max_num_seqs": 256,
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
            "backend": "vllm",
            "dataset_name": "random",
 	    "random-input-len": 128,
 	    "random-output-len": 128,
 	    "ignore-eos": "",
            "num_prompts": 1000
        }
    }
 ]
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@ -1,22 +1,21 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
-  - label: "Build arm64 wheel - CUDA 12.9"
+  - label: "Build arm64 wheel - CUDA 12.8"
-    depends_on: ~
+    id: build-wheel-arm64-cuda-12-8
    id: build-wheel-arm64-cuda-12-9
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      # NOTE: torch_cuda_arch_list is derived from the upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
  # x86 + CUDA builds
  - label: "Build wheel - CUDA 12.8"
    depends_on: ~
    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
@ -29,7 +28,6 @@ steps:
      DOCKER_BUILDKIT: "1"
  - label: "Build wheel - CUDA 12.6"
    depends_on: ~
    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
@ -41,61 +39,44 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"
-  # x86 + CUDA builds
+  # Note(simon): We can always build the CUDA 11.8 wheel to ensure the build is working.
-  - label: "Build wheel - CUDA 12.9"
+  # However, this block can be uncommented to save some compute hours.
-    depends_on: ~
+  # - block: "Build CUDA 11.8 wheel"
-    id: build-wheel-cuda-12-9
+  #   key: block-build-cu118-wheel
  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
    id: build-wheel-cuda-11-8
    agents:
      queue: cpu_queue_postmerge
    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"
-  - label: "Build release image (x86)"
+  - block: "Build release image"
    depends_on: ~
-    id: build-release-image-x86
+    key: block-release-image-build
  - label: "Build release image"
    depends_on: block-release-image-build
    id: build-release-image
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
      # re-tag to default image tag and push, just in case arm64 build fails
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
  - label: "Build release image (arm64)"
    depends_on: ~
    id: build-release-image-arm64
    agents:
      queue: arm64_cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
  # Add job to create multi-arch manifest
  - label: "Create multi-arch manifest"
    depends_on:
      - build-release-image-x86
      - build-release-image-arm64
    id: create-multi-arch-manifest
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
  - label: "Annotate release workflow"
    depends_on:
-      - create-multi-arch-manifest
+      - build-release-image
      - build-wheel-cuda-12-8
      - build-wheel-cuda-12-6
      - build-wheel-cuda-11-8
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
@ -142,24 +123,18 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"
-  - label: "Build and publish nightly multi-arch image to DockerHub"
+  - block: "Build Neuron release image"
-    depends_on:
+    key: block-neuron-release-image-build
-      - create-multi-arch-manifest
+    depends_on: ~
-    if: build.env("NIGHTLY") == "1"
+
  - label: "Build and publish Neuron release image"
    depends_on: block-neuron-release-image-build
    agents:
-      queue: cpu_queue_postmerge
+      queue: neuron-postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
      - "docker push vllm/vllm-openai:nightly"
      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
      # Clean up old nightly builds (keep only last 14)
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
    plugins:
      - docker-login#v3.0.0:
          username: vllmbot
          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@ -14,33 +14,18 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl . 
 \`\`\`
 To download and upload the image:
 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-
+docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:latest
-docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}
 docker push vllm/vllm-openai:latest-x86_64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
 docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
 docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker push vllm/vllm-openai:latest-aarch64
 docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
 docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
 docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
 docker manifest push vllm/vllm-openai:latest
 docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF 
--- a/.buildkite/scripts/cleanup-nightly-builds.sh
+++ b/.buildkite/scripts/cleanup-nightly-builds.sh
@ -1,97 +0,0 @@
 #!/bin/bash
 set -ex
 # Clean up old nightly builds from DockerHub, keeping only the last 14 builds
 # This script uses DockerHub API to list and delete old tags with "nightly-" prefix
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
 # Get DockerHub token from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
    exit 1
 fi
 # Function to get all tags from DockerHub
 #
 # Pages through $REPO_API_URL (100 entries per page) using $DOCKERHUB_TOKEN as
 # a bearer token, keeps only tags whose name starts with "nightly-", and prints
 # the tag names one per line, ordered by a reverse sort on "last_updated|name".
 #
 # NOTE(review): the loop ends at the first page that yields no "nightly-" tag,
 # not at the last page of results, so nightly tags located after a page made up
 # entirely of other tags would be missed - confirm this is acceptable.
 # NOTE(review): "newest first" relies on last_updated sorting correctly as plain
 # text (presumably ISO-8601 timestamps) - confirm against the API response.
 get_all_tags() {
    local page=1
    local all_tags=""
    while true; do
        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
            "$REPO_API_URL?page=$page&page_size=100")
        # Get both last_updated timestamp and tag name, separated by |
        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
        # An empty result for this page is the only exit condition of the loop.
        if [ -z "$tags" ]; then
            break
        fi
        # Append this page's lines, newline-terminated, to the running list.
        all_tags="$all_tags$tags"$'\n'
        page=$((page + 1))
    done
    # Sort by timestamp (newest first) and extract just the tag names
    echo "$all_tags" | sort -r | cut -d'|' -f2
 }
 # Delete a single tag from the vllm/vllm-openai DockerHub repository.
 #
 # Arguments:
 #   $1 - name of the tag to delete
 #
 # Sends an authenticated DELETE request and prints the outcome. A failure is
 # only reported as a warning; the function does not return an error status for
 # it, so the caller keeps going with the remaining tags.
 delete_tag() {
    local tag_name="$1"
    echo "Deleting tag: $tag_name"
    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
    # A response body carrying a non-null ".detail" field is treated as an error
    # message. Any other body (including an empty one, or one jq cannot parse)
    # falls through to the success branch.
    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
    else
        echo "Successfully deleted tag: $tag_name"
    fi
 }
 # Main flow: list the nightly tags, keep the newest $tags_to_keep of them and
 # delete the rest one at a time.
 # Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
 echo "Fetching all tags from DockerHub..."
 all_tags=$(get_all_tags)
 if [ -z "$all_tags" ]; then
    echo "No tags found to clean up"
    exit 0
 fi
 # Count total tags
 # (the list holds one tag name per line, so the line count is the tag count)
 total_tags=$(echo "$all_tags" | wc -l)
 echo "Found $total_tags tags"
 # Keep only the last 14 builds (including the current one)
 tags_to_keep=14
 tags_to_delete=$((total_tags - tags_to_keep))
 if [ $tags_to_delete -le 0 ]; then
    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
    exit 0
 fi
 echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
 # Get tags to delete (skip the first $tags_to_keep tags)
 # The list is newest-first, so everything from line tags_to_keep + 1 onwards
 # is the older remainder.
 tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
 if [ -z "$tags_to_delete_list" ]; then
    echo "No tags to delete"
    exit 0
 fi
 # Delete old tags
 echo "Deleting old tags..."
 # The list is fed through a here-string, one tag per line; blank lines are
 # skipped by the -n test.
 while IFS= read -r tag; do
    if [ -n "$tag" ]; then
        delete_tag "$tag"
        # Add a small delay to avoid rate limiting
        sleep 1
    fi
 done <<< "$tags_to_delete_list"
 echo "Cleanup completed successfully"
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@ -164,6 +164,7 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
  --ignore=entrypoints/llm/test_chat.py \
  --ignore=entrypoints/llm/test_accuracy.py \
  --ignore=entrypoints/llm/test_init.py \
  --ignore=entrypoints/llm/test_generate_multiple_loras.py \
  --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
 function cpu_tests() {
  set -e
@ -46,26 +46,21 @@ function cpu_tests() {
    set -e
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
  # Run kernel tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    pytest -x -v -s tests/kernels/test_onednn.py"
  # Run basic model test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
    # Note: disable until supports V1
-    # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
    # Note: disable Bart until supports V1
-    pytest -x -v -s tests/models/language/generation -m cpu_model \
+    pytest -v -s tests/models/language/generation -m cpu_model \
                --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
                --ignore=tests/models/language/generation/test_bart.py
-    pytest -x -v -s tests/models/language/pooling -m cpu_model
+    pytest -v -s tests/models/language/pooling -m cpu_model
-    pytest -x -v -s tests/models/multimodal/generation \
+    pytest -v -s tests/models/multimodal/generation \
                --ignore=tests/models/multimodal/generation/test_mllama.py \
                --ignore=tests/models/multimodal/generation/test_pixtral.py \
                -m cpu_model"
@ -73,51 +68,35 @@ function cpu_tests() {
  # Run compressed-tensor test
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -x -s -v \
+    pytest -s -v \
    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
  # Note: disable it until supports V1
  # Run AWQ test
  # docker exec cpu-test-"$NUMA_NODE" bash -c "
  #   set -e
-  #   VLLM_USE_V1=0 pytest -x -s -v \
+  #   VLLM_USE_V1=0 pytest -s -v \
  #   tests/quantization/test_ipex_quant.py"
  # Run multi-lora tests
  docker exec cpu-test-"$NUMA_NODE" bash -c "
    set -e
-    pytest -x -s -v \
+    pytest -s -v \
    tests/lora/test_qwen2vl.py"
-  # online serving: tp+pp
+  # online serving
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
-      --endpoint /v1/completions
+      --endpoint /v1/completions'
    kill -s SIGTERM $server_pid &'
  # online serving: tp+dp
  docker exec cpu-test-"$NUMA_NODE" bash -c '
    set -e
    VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
    server_pid=$!
    timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
    vllm bench serve \
      --backend vllm \
      --dataset-name random \
      --model meta-llama/Llama-3.2-3B-Instruct \
      --num-prompts 20 \
      --endpoint /v1/completions
    kill -s SIGTERM $server_pid &'
 }
 # All of the CPU tests are expected to finish in less than 40 mins.
 export -f cpu_tests
-timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@ -0,0 +1,64 @@
 #!/bin/bash
 # This script builds the Neuron docker image and runs the API server inside the container.
 # It serves as a sanity check for compilation and basic model usage.
 set -e
 set -v
 image_name="neuron/vllm-ci"
 container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 # Try building the docker image
 aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.
 if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
        # Remove dangling images (those that are not tagged and not used by any container)
        docker image prune -f
        # Remove unused volumes / force the system prune for old images as well.
        docker volume prune -f && docker system prune -f
        echo "$current_time" > /tmp/neuron-docker-build-timestamp
    fi
 else
    date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 docker build -t "${image_name}" -f docker/Dockerfile.neuron .
 # Setup cleanup
 # Registered on EXIT, so it runs however the script ends. Despite its name,
 # the function force-removes the image "${image_name}", not a container.
 # "|| true" keeps a failed removal from changing the script's exit status.
 remove_docker_container() {
    docker image rm -f "${image_name}" || true;
 }
 trap remove_docker_container EXIT
 # Run the image
 docker run --rm -it --device=/dev/neuron0 --network bridge \
       -v "${HF_CACHE}:${HF_MOUNT}" \
       -e "HF_HOME=${HF_MOUNT}" \
       -e "HF_TOKEN=${HF_TOKEN}" \
       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
       --name "${container_name}" \
       ${image_name} \
       /bin/bash -c "
            set -e; # Exit on first error
            python3 /workspace/vllm/examples/offline_inference/neuron.py;
            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
                echo \"Running test file: \$f\";
                python3 -m pytest \$f -v --capture=tee-sys;
            done
       "
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
@ -61,7 +61,7 @@ echo "Results will be stored in: $RESULTS_DIR"
 echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
    && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-    && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
+    && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
    && python3 -m pip install --progress-bar off hf-transfer
 echo "--- Python dependencies installed ---"
 export VLLM_USE_V1=1
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@ -30,12 +30,9 @@ docker run \
    bash -c '
    set -e
    echo $ZE_AFFINITY_MASK
-    pip install tblib==3.1.0
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
    cd tests
    pytest -v -s v1/core
    pytest -v -s v1/engine
--- a/.buildkite/scripts/tpu/cleanup_docker.sh
+++ b/.buildkite/scripts/tpu/cleanup_docker.sh
@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
-  docker volume prune -f && docker system prune --force --filter "until=24h" --all
+  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
 else
  echo "Disk usage is below $threshold%. No cleanup needed."
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@ -14,19 +14,8 @@ fi
 # Get the single wheel file
 wheel="${wheel_files[0]}"
-# Detect architecture and rename 'linux' to appropriate manylinux version
+# Rename 'linux' to 'manylinux1' in the wheel filename
-arch=$(uname -m)
+new_wheel="${wheel/linux/manylinux1}"
 if [[ $arch == "x86_64" ]]; then
    manylinux_version="manylinux1"
 elif [[ $arch == "aarch64" ]]; then
    manylinux_version="manylinux2014"
 else
    echo "Warning: Unknown architecture $arch, using manylinux1 as default"
    manylinux_version="manylinux1"
 fi
 # Rename 'linux' to the appropriate manylinux version in the wheel filename
 new_wheel="${wheel/linux/$manylinux_version}"
 mv -- "$wheel" "$new_wheel"
 wheel="$new_wheel"
@ -58,15 +47,14 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-if [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
 elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
 elif [[ $normal_wheel == *"cu128"* ]]; then
    # if $normal_wheel matches cu128, do not upload the index.html
    echo "Skipping index files for cu128 wheels"
 else
-    # only upload index.html for cu129 wheels (default wheels) as it
+    # only upload index.html for cu128 wheels (default wheels)
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@ -75,15 +63,14 @@ fi
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
-if [[ $normal_wheel == *"cu126"* ]]; then
+if [[ $normal_wheel == *"cu118"* ]]; then
    # if $normal_wheel matches cu118, do not upload the index.html
    echo "Skipping index files for cu118 wheels"
 elif [[ $normal_wheel == *"cu126"* ]]; then
    # if $normal_wheel matches cu126, do not upload the index.html
    echo "Skipping index files for cu126 wheels"
 elif [[ $normal_wheel == *"cu128"* ]]; then
    # if $normal_wheel matches cu128, do not upload the index.html
    echo "Skipping index files for cu128 wheels"
 else
-    # only upload index.html for cu129 wheels (default wheels) as it
+    # only upload index.html for cu128 wheels (default wheels)
    # is available on both x86 and arm64
    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@ -41,8 +41,7 @@ steps:
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh
- label: Async Engine, Inputs, Utils, Worker Test # 36min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -54,7 +53,6 @@ steps:
  - tests/utils_
  - tests/worker
  - tests/standalone_tests/lazy_imports.py
  - tests/transformers_utils
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s mq_llm_engine # MQLLMEngine
@ -64,10 +62,8 @@ steps:
  - pytest -v -s multimodal
  - pytest -v -s utils_ # Utils
  - pytest -v -s worker # Worker
  - pytest -v -s transformers_utils # transformers_utils
- label: Python-only Installation Test # 10min
+- label: Python-only Installation Test
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
@ -75,8 +71,7 @@ steps:
  commands:
  - bash standalone_tests/python_only_compile.sh
- label: Basic Correctness Test # 20min
+- label: Basic Correctness Test # 30min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  fast_check: true
  torch_nightly: true
@ -93,8 +88,7 @@ steps:
  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 22min
+- label: Core Test # 10min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  fast_check: true
  source_file_dependencies:
@ -104,19 +98,7 @@ steps:
  commands:
  - pytest -v -s core
- label: Entrypoints Unit Tests # 5min
+- label: Entrypoints Test (LLM) # 40min
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  source_file_dependencies:
  - vllm/entrypoints
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 - label: Entrypoints Integration Test (LLM) # 30min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@ -127,13 +109,13 @@ steps:
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Integration Test (API Server) # 100min
+- label: Entrypoints Test (API Server) # 40min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
@ -144,25 +126,10 @@ steps:
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py
- label: Entrypoints Integration Test (Pooling)
+- label: Distributed Tests (4 GPUs) # 10min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling
 - label: Distributed Tests (4 GPUs) # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@ -205,8 +172,7 @@ steps:
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
- label: EPLB Algorithm Test # 5min
+- label: EPLB Algorithm Test
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
@ -215,7 +181,6 @@ steps:
  - pytest -v -s distributed/test_eplb_algo.py
 - label: EPLB Execution Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
@ -224,14 +189,13 @@ steps:
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
- label: Metrics, Tracing Test # 12min
+- label: Metrics, Tracing Test # 10min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/metrics
-  - tests/v1/tracing
+  - tests/tracing
  commands:
  - pytest -v -s metrics
  - "pip install \
@ -244,8 +208,7 @@ steps:
 ##### fast check tests  #####
 #####  1 GPU test  #####
- label: Regression Test # 7min
+- label: Regression Test # 5min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -255,8 +218,7 @@ steps:
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 25min
+- label: Engine Test # 10min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -271,29 +233,7 @@ steps:
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min
+- label: V1 Test
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
    - pytest -v -s v1/engine
 - label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
    - tests/v1
  commands:
    - pytest -v -s v1/entrypoints
 - label: V1 Test others # 42min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
    - vllm/
@ -301,7 +241,8 @@ steps:
  commands:
    # split the test to avoid interference
    - pytest -v -s v1/core
-    - pytest -v -s v1/executor
+    - pytest -v -s v1/engine
    - pytest -v -s v1/entrypoints
    - pytest -v -s v1/sample
    - pytest -v -s v1/logits_processors
    - pytest -v -s v1/worker
@ -313,12 +254,14 @@ steps:
    - pytest -v -s v1/test_utils.py
    - pytest -v -s v1/test_oracle.py
    - pytest -v -s v1/test_metrics_reader.py
    # TODO: accuracy does not match, whether setting
    # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
    - pytest -v -s v1/e2e
    # Integration test for streaming correctness (requires special branch).
    - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
    - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Examples Test # 30min
+- label: Examples Test # 25min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
@ -336,14 +279,14 @@ steps:
    - python3 offline_inference/vision_language_pooling.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/encoder_decoder.py
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    - python3 offline_inference/basic/classify.py
    - python3 offline_inference/basic/embed.py
    - python3 offline_inference/basic/score.py
    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Platform Tests (CUDA) # 4min
+- label: Platform Tests (CUDA)
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -351,8 +294,7 @@ steps:
  commands:
    - pytest -v -s cuda/test_cuda_context.py
- label: Samplers Test # 56min
+- label: Samplers Test # 36min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers
@ -363,23 +305,15 @@ steps:
    - pytest -v -s samplers
    - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
- label: LoRA Test %N # 20min each
+- label: LoRA Test %N # 15min each
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
-  commands:
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
    - pytest -v -s lora \
      --shard-id=$$BUILDKITE_PARALLEL_JOB \
      --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
      --ignore=lora/test_chatglm3_tp.py \
      --ignore=lora/test_llama_tp.py \
      --ignore=lora/test_llm_with_multi_loras.py
  parallelism: 4
- label: PyTorch Compilation Unit Tests # 15min
+- label: PyTorch Compilation Unit Tests
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -393,10 +327,8 @@ steps:
    - pytest -v -s compile/test_sequence_parallelism.py
    - pytest -v -s compile/test_async_tp.py
    - pytest -v -s compile/test_fusion_all_reduce.py
    - pytest -v -s compile/test_decorator.py
- label: PyTorch Fullgraph Smoke Test # 15min
+- label: PyTorch Fullgraph Smoke Test # 9min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -404,10 +336,12 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
-  - pytest -v -s compile/piecewise/
+  # these tests need to be separated, cannot combine
  - pytest -v -s compile/piecewise/test_simple.py
  - pytest -v -s compile/piecewise/test_toy_llama.py
  - pytest -v -s compile/piecewise/test_full_cudagraph.py
- label: PyTorch Fullgraph Test # 20min
+- label: PyTorch Fullgraph Test # 18min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -416,8 +350,7 @@ steps:
  commands:
  - pytest -v -s compile/test_full_graph.py
- label: Kernels Core Operation Test # 48min
+- label: Kernels Core Operation Test
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -425,8 +358,7 @@ steps:
  commands:
    - pytest -v -s kernels/core
- label: Kernels Attention Test %N # 23min
+- label: Kernels Attention Test %N
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/attention/
@ -437,8 +369,7 @@ steps:
    - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
- label: Kernels Quantization Test %N # 64min
+- label: Kernels Quantization Test %N
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/quantization/
@ -448,21 +379,18 @@ steps:
    - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
- label: Kernels MoE Test %N # 40min
+- label: Kernels MoE Test %N
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  commands:
    - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
- label: Kernels Mamba Test # 31min
+- label: Kernels Mamba Test
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
@ -470,8 +398,7 @@ steps:
  commands:
    - pytest -v -s kernels/mamba
- label: Tensorizer Test # 14min
+- label: Tensorizer Test # 11min
  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/model_loader
@ -483,8 +410,7 @@ steps:
    - pytest -v -s tensorizer_loader
    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test # 7min
+- label: Model Executor Test
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor
@ -494,8 +420,7 @@ steps:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - pytest -v -s model_executor
- label: Benchmarks # 11min
+- label: Benchmarks # 9min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
@ -503,8 +428,7 @@ steps:
  commands:
  - bash scripts/run-benchmarks.sh
- label: Benchmarks CLI Test # 7min
+- label: Benchmarks CLI Test # 10min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -512,8 +436,7 @@ steps:
  commands:
  - pytest -v -s benchmarks/
- label: Quantization Test # 70min
+- label: Quantization Test
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -521,25 +444,21 @@ steps:
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
-  # after torchao 0.12 release, and pin a working version of torchao nightly here
+  # after torchao 0.12 release
-
+  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 - label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
- label: OpenAI API correctness # 22min
+- label: OpenAI API correctness
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
@ -548,8 +467,7 @@ steps:
  commands: # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 12min
+- label: Encoder Decoder tests # 5min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
@ -557,8 +475,7 @@ steps:
  commands:
    - pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 23 min
+- label: OpenAI-Compatible Tool Use # 20 min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  fast_check: false
  source_file_dependencies:
@ -571,8 +488,7 @@ steps:
 #####  models test  #####
- label: Basic Models Test # 57min
+- label: Basic Models Test # 24min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -585,8 +501,7 @@ steps:
    - pytest -v -s models/test_vision.py
    - pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) # 35min
+- label: Language Models Test (Standard)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -597,7 +512,6 @@ steps:
    - pytest -v -s models/language -m core_model
 - label: Language Models Test (Hybrid) # 35 min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -610,8 +524,7 @@ steps:
    - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
    - pytest -v -s models/language/generation -m hybrid_model
- label: Language Models Test (Extended Generation) # 80min
+- label: Language Models Test (Extended Generation) # 1hr20min
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
@ -622,18 +535,7 @@ steps:
    - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
    - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 - label: Language Models Test (PPL)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation_ppl_test
  commands:
    - pytest -v -s models/language/generation_ppl_test
 - label: Language Models Test (Extended Pooling)  # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
@ -642,27 +544,7 @@ steps:
  commands:
    - pytest -v -s models/language/pooling -m 'not core_model'
- label: Language Models Test (MTEB)
+- label: Multi-Modal Models Test (Standard)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling_mteb_test
  commands:
    - pytest -v -s models/language/pooling_mteb_test
 - label: Multi-Modal Processor Test # 44min
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/processing
 - label: Multi-Modal Models Test (Standard) # 60min
  timeout_in_minutes: 80
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
@ -671,8 +553,10 @@ steps:
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pip freeze | grep -E 'torch'
-    - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
+    - pytest -v -s models/multimodal/processing
-    - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
+    - pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model
    - pytest -v -s models/multimodal/test_tensor_schema.py -m core_model  # Needs mp_method="spawn"
    - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 - label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
@ -682,7 +566,7 @@ steps:
  - tests/models/multimodal
  commands:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-    - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+    - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
 - label: Multi-Modal Models Test (Extended) 2
  mirror_hardwares: [amdexperimental]
@ -704,8 +588,7 @@ steps:
    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
    - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
- label: Quantized Models Test # 45 min
+- label: Quantized Models Test
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
@ -735,8 +618,7 @@ steps:
    - python3 examples/offline_inference/audio_language.py --model-type whisper
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
- label: Blackwell Test # 38 min
+- label: Blackwell Test
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
@ -747,7 +629,6 @@ steps:
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/fusion.py
  - vllm/compilation/fusion_attn.py
@ -758,28 +639,21 @@ steps:
    # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
    - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
    - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
-    - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
+    - pytest -v -s tests/kernels/test_cutlass_mla_decode.py
    - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
    # Quantization
    - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
    - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
    - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
    - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
    - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
    - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
    # Fusion
    - pytest -v -s tests/compile/test_fusion_all_reduce.py
    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
    - pytest -v -s tests/kernels/moe/test_flashinfer.py
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
 #####  1 GPU test  #####
 #####  multi gpus test  #####
 - label: Distributed Comm Ops Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -791,7 +665,6 @@ steps:
  - pytest -v -s distributed/test_shm_broadcast.py
 - label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -815,8 +688,7 @@ steps:
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
- label: Distributed Tests (2 GPUs) # 110min
+- label: Distributed Tests (2 GPUs) # 40min
  timeout_in_minutes: 150
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -847,8 +719,7 @@ steps:
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
  # test sequence parallel
  - pytest -v -s distributed/test_sequence_parallel.py
  # this test fails consistently.
@ -858,7 +729,6 @@ steps:
  - pytest -v -s models/multimodal/generation/test_maverick.py
 - label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
@ -871,11 +741,6 @@ steps:
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
@ -884,8 +749,7 @@ steps:
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Pipeline + Context Parallelism Test # 45min
+- label: Pipeline Parallelism Test # 45min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
@ -898,10 +762,8 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
- label: LoRA TP Test (Distributed) # 17 min
+- label: LoRA TP Test (Distributed)
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  num_gpus: 4
  source_file_dependencies:
@ -915,15 +777,13 @@ steps:
    # requires multi-GPU testing for validation.
    - pytest -v -s -x lora/test_chatglm3_tp.py
    - pytest -v -s -x lora/test_llama_tp.py
-    - pytest -v -s -x lora/test_llm_with_multi_loras.py
+    - pytest -v -s -x lora/test_multi_loras_with_tp.py
 - label: Weight Loading Multiple GPU Test  # 33min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
@ -971,10 +831,3 @@ steps:
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 - label: Qwen MoE EP Test # optional
  gpu: h200
  optional: true
  num_gpus: 2
  commands:
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
--- a/.github/.bc-linter.yml
+++ b/.github/.bc-linter.yml
@ -1,24 +0,0 @@
 # doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
 version: 1
 paths:
 # We temporarily disable globally, and will only enable with `annotations.include`
 # include:
 #   - "vllm/v1/attention/*.py"
 #   - "vllm/v1/core/*.py"
 exclude:
  - "**/*.py"
 scan:
  functions: true        # check free functions and methods
  classes: true          # check classes/dataclasses
  public_only: true      # ignore names starting with "_" at any level
 annotations:
  include:               # decorators that force‑include a symbol
    - name: "bc_linter_include"  # matched by simple name or dotted suffix
      propagate_to_members: false # for classes, include methods/inner classes
  exclude:               # decorators that force‑exclude a symbol
    - name: "bc_linter_skip"     # matched by simple name or dotted suffix
      propagate_to_members: true  # for classes, exclude methods/inner classes
 excluded_violations: []  # e.g. ["ParameterRenamed", "FieldTypeChanged"]
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -5,21 +5,18 @@
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
-/vllm/model_executor/model_loader @22quinn
+/vllm/multimodal @DarkLight1337 @ywang96
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
 /vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm @chaunceyjiang
+/vllm/reasoning @aarnphm
-/vllm/entrypoints @aarnphm @chaunceyjiang
+/vllm/entrypoints @aarnphm
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
 /vllm/distributed/kv_transfer @NickLucche
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # Any change to the VllmConfig changes can have a large user-facing impact,
@ -28,11 +25,8 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/structured_output @mgoin @russellb @aarnphm
 /vllm/v1/spec_decode @benchislett @luccafong
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @heheda12345
 /vllm/v1/kv_cache_interface.py @heheda12345
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@ -40,20 +34,18 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
 /tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
+/tests/multimodal @DarkLight1337 @ywang96
 /tests/prefix_caching @comaniac @KuntaiDu
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
 /tests/v1/core @heheda12345
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
 # Docs
 /docs @hmellor
@ -75,9 +67,6 @@ mkdocs.yaml @hmellor
 /vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
 /vllm/model_executor/models/qwen* @sighingnow
 # MTP-specific files
 /vllm/model_executor/models/deepseek_mtp.py @luccafong
 # Mistral-specific files
 /vllm/model_executor/models/mistral*.py @patrickvonplaten
 /vllm/model_executor/models/mixtral*.py @patrickvonplaten
@ -90,15 +79,4 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
 /vllm/attention/ops/triton_unified_attention.py @tdoublep
 # ROCm related: specify owner with write access to notify AMD folks for careful code review
 /docker/Dockerfile.rocm* @gshtras
 /vllm/v1/attention/backends/rocm*.py @gshtras
 /vllm/v1/attention/backends/mla/rocm*.py @gshtras
 /vllm/attention/ops/rocm*.py @gshtras
 /vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
 # TPU
 /vllm/v1/worker/tpu* @NickLucche
 /vllm/platforms/tpu.py @NickLucche
 /vllm/v1/sample/tpu @NickLucche
 /vllm/tests/v1/tpu @NickLucche
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -7,6 +7,8 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
 ## Test Result
 ## (Optional) Documentation Update
 ---
 <details>
 <summary> Essential Elements of an Effective PR Description Checklist </summary>
@ -15,7 +17,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
 - [ ] The test plan, such as providing test command.
 - [ ] The test results, such as pasting the results comparison before and after, or e2e results
 - [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
 - [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
 </details>
 **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@ -273,20 +273,6 @@ pull_request_rules:
      users:
        - "sangstar"
 - name: assign reviewer for modelopt changes
  conditions:
    - or:
        - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
        - files~=^vllm/model_executor/layers/quantization/__init__\.py$
        - files~=^tests/models/quantization/test_modelopt\.py$
        - files~=^tests/quantization/test_modelopt\.py$
        - files~=^tests/models/quantization/test_nvfp4\.py$
        - files~=^docs/features/quantization/modelopt\.md$
  actions:
    assign:
      users:
        - "Edwardf0t1"
 - name: remove 'needs-rebase' label when conflict is resolved
  conditions:
      - -conflict
--- a/.github/scale-config.yml
+++ b/.github/scale-config.yml
@ -1,21 +0,0 @@
 # scale-config.yml:
 #   Powers what instance types are available for GHA auto-scaled
 #   runners. Runners listed here will be available as self hosted
 #   runners, configuration is directly pulled from the main branch.
 # runner_types:
 #   runner_label:
 #     instance_type: m4.large
 #     os: linux
 #     # min_available defaults to the global cfg in the ALI Terraform
 #     min_available: undefined
 #     # when max_available value is not defined, no max runners is enforced
 #     max_available: undefined
 #     disk_size: 50
 #     is_ephemeral: true
 runner_types:
  linux.2xlarge:
    disk_size: 150
    instance_type: c5.2xlarge
    is_ephemeral: true
    os: linux
--- a/.github/workflows/add_label_automerge.yml
+++ b/.github/workflows/add_label_automerge.yml
@ -10,7 +10,7 @@ jobs:
        runs-on: ubuntu-latest
        steps:
            -   name: Add label
-                uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+                uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
                with:
                    script: |
                        github.rest.issues.addLabels({
--- a/.github/workflows/bc-lint.yml
+++ b/.github/workflows/bc-lint.yml
@ -1,27 +0,0 @@
 name: BC Lint
 on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
 jobs:
  bc_lint:
    if: github.repository_owner == 'vllm-project'
    runs-on: ubuntu-latest
    steps:
      - name: Run BC Lint Action
        uses: pytorch/test-infra/.github/actions/bc-lint@main
        with:
          repo: ${{ github.event.pull_request.head.repo.full_name }}
          base_sha: ${{ github.event.pull_request.base.sha }}
          head_sha: ${{ github.event.pull_request.head.sha }}
          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
          config_dir: .github
 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
  cancel-in-progress: true
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@ -16,7 +16,7 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Set up Python
-        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+        uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.12'
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@ -1,309 +0,0 @@
 name: Label issues based on keywords
 on:
  issues:
    types: [opened, edited, reopened]
 permissions:
  issues: write          # needed so the workflow can add labels
  contents: read
 concurrency:
  group: issue-labeler-${{ github.event.issue.number }}
  cancel-in-progress: true
 jobs:
  add-labels:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
        with:
          script: |
            // Label rules: add new labels and their search terms here.
            // Each top-level key is a label name. Every entry carries a `searchIn`
            // of "title", "body", or "both" that limits where the entry is looked for.
            const labelConfig = {
              rocm: {
                // Whole-word terms (compiled with \b word boundaries on both sides)
                keywords: [
                  { term: "composable kernel", searchIn: "both" },
                  { term: "rccl", searchIn: "body" },       // body only
                  { term: "migraphx", searchIn: "title" },  // title only
                  { term: "hipgraph", searchIn: "both" },
                  { term: "ROCm System Management Interface", searchIn: "body" },
                ],
                // Partial-match terms (may occur anywhere, including inside longer words)
                substrings: [
                  { term: "VLLM_ROCM_", searchIn: "both" },
                  { term: "aiter", searchIn: "title" },
                  { term: "rocm", searchIn: "title" },
                  { term: "amd", searchIn: "title" },
                  { term: "hip-", searchIn: "both" },
                  { term: "gfx", searchIn: "both" },
                  { term: "cdna", searchIn: "both" },
                  { term: "rdna", searchIn: "both" },
                  { term: "torch_hip", searchIn: "body" },  // body only
                  { term: "_hip", searchIn: "both" },
                  { term: "hip_", searchIn: "both" },
                  // ROCm tools and libraries
                  { term: "hipify", searchIn: "both" },
                ],
                // Raw regular expressions, for anything the two lists above cannot express
                regexPatterns: [
                  {
                    pattern: "\\bmi\\d{3}[a-z]*\\b",
                    description: "AMD GPU names (mi + 3 digits + optional letters)",
                    flags: "gi",
                    searchIn: "both",  // "title", "body", or "both"
                  },
                ],
              },
            };
            // Build a global, case-insensitive RegExp that matches `term` literally.
            //   term - text to look for; regex metacharacters in it are escaped
            //   type - 'keyword'   : term must sit between \b word boundaries
            //          'substring' : term may occur anywhere in the text
            // Throws an Error for any other `type`.
            function createSearchRegex(term, type) {
              // Neutralise regex metacharacters so the term is matched verbatim
              const literal = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

              if (type === 'keyword') {
                return new RegExp(`\\b${literal}\\b`, "gi");
              }
              if (type === 'substring') {
                return new RegExp(literal, "gi");
              }
              throw new Error(`Unknown search type: ${type}`);
            }
            // Scan `text` line by line for each configured entry and report the hits.
            //   text           - string to search; it is split on '\n'
            //   searchTerms    - entries that are either a plain string or an object with
            //                    term / searchIn / pattern / description / flags
            //   searchType     - 'regex' compiles the entry's pattern (flags default to "gi");
            //                    any other value is handed to createSearchRegex with the term
            //   searchLocation - tag for where `text` came from; an entry whose searchIn is
            //                    neither 'both' nor this value is skipped
            // Returns one summary object per entry that matched at least once; each summary
            // holds the per-line match records and their count.
            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
              const textLines = text.split('\n');
              const results = [];

              for (const entry of searchTerms) {
                // A bare string is shorthand for { term: <string>, searchIn: 'both' }
                const isPlainString = typeof entry === 'string';
                const term = isPlainString ? entry : entry.term;
                const searchIn = isPlainString ? 'both' : (entry.searchIn || 'both');
                const pattern = isPlainString ? undefined : entry.pattern;
                const description = isPlainString ? undefined : entry.description;
                const flags = isPlainString ? undefined : entry.flags;

                // Entry is scoped to a different location than the one being searched
                const appliesHere = searchIn === 'both' || searchIn === searchLocation;
                if (!appliesHere) {
                  continue;
                }

                const regex = searchType === 'regex'
                  ? new RegExp(pattern, flags || "gi")
                  : createSearchRegex(term, searchType);

                const hits = [];
                for (let i = 0; i < textLines.length; i++) {
                  const line = textLines[i];
                  const found = line.match(regex);
                  if (!found) {
                    continue;
                  }

                  for (const hit of found) {
                    // Long lines are cut down to a window of 30 characters either side
                    // of the first occurrence of the matched text; short lines are
                    // shown whole (trimmed).
                    let context;
                    if (line.length > 100) {
                      const at = line.toLowerCase().indexOf(hit.toLowerCase());
                      context = line.substring(Math.max(0, at - 30), at + hit.length + 30) + '...';
                    } else {
                      context = line.trim();
                    }

                    hits.push({
                      match: hit,
                      lineNumber: i + 1,
                      lineContent: line.trim(),
                      searchType: searchType,
                      searchLocation: searchLocation,
                      originalTerm: term || pattern,
                      description: description,
                      context: context
                    });
                  }
                }

                if (hits.length === 0) {
                  continue;
                }
                results.push({
                  term: term || (description || pattern),
                  searchType: searchType,
                  searchLocation: searchLocation,
                  searchIn: searchIn,
                  pattern: pattern,
                  matches: hits,
                  count: hits.length
                });
              }

              return results;
            }
            // Evaluates one label rule against the triggering issue. The issue title
            // and body are searched with the rule's keywords, substrings, and regex
            // patterns; when anything matches, the label is attached unless the issue
            // already carries it. Resolves to true only when the label was newly
            // added, and to false otherwise.
            async function processLabel(labelName, config) {
              const issue = context.payload.issue;
              const body = issue.body || "";
              const title = issue.title || "";
              core.notice(`Processing label: ${labelName}`);
              core.notice(`Issue Title: "${title}"`);
              core.notice(`Issue Body length: ${body.length} characters`);

              // A rule may omit any of the three term lists; treat missing ones as empty.
              const keywords = config.keywords || [];
              const substrings = config.substrings || [];
              const regexPatterns = config.regexPatterns || [];
              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);

              // Every location is searched in the same order: keyword, substring, regex.
              const termGroups = [
                [keywords, 'keyword'],
                [substrings, 'substring'],
                [regexPatterns, 'regex'],
              ];
              const allMatches = [];
              const collectFrom = (text, location) => {
                const found = termGroups.map(
                  ([terms, searchType]) => findMatchingTermsWithLines(text, terms, searchType, location)
                );
                for (const group of found) {
                  allMatches.push(...group);
                }
              };

              // Whitespace-only titles/bodies are not searched at all.
              if (title.trim()) {
                core.notice(`Searching in title: "${title}"`);
                collectFrom(title, 'title');
              }
              if (body.trim()) {
                core.notice(`Searching in body (${body.length} characters)`);
                collectFrom(body, 'body');
              }

              const shouldAddLabel = allMatches.length > 0;
              let reason = '';
              if (shouldAddLabel) {
                core.notice(`Found ${allMatches.length} matching term(s):`);
                for (const termMatch of allMatches) {
                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                  const searchInText = termMatch.searchIn;
                  if (termMatch.searchType === 'regex') {
                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  } else {
                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  }
                  // One numbered entry per individual occurrence of the term.
                  for (const [index, match] of termMatch.matches.entries()) {
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
                    if (match.description) {
                      core.notice(`       Description: ${match.description}`);
                    }
                    core.notice(`       Context: ${match.context}`);
                    // Skip the full line when the context already shows it verbatim.
                    if (match.lineContent !== match.context) {
                      core.notice(`       Full line: ${match.lineContent}`);
                    }
                  }
                }

                // Sums the per-term occurrence counts, optionally restricted to
                // entries whose `field` equals `value`.
                const tally = (field, value) => {
                  let sum = 0;
                  for (const entry of allMatches) {
                    if (field === undefined || entry[field] === value) {
                      sum += entry.count;
                    }
                  }
                  return sum;
                };
                const totalMatches = tally();
                const titleMatches = tally('searchLocation', 'title');
                const bodyMatches = tally('searchLocation', 'body');
                const keywordMatches = tally('searchType', 'keyword');
                const substringMatches = tally('searchType', 'substring');
                const regexMatches = tally('searchType', 'regex');
                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
              }

              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
              core.notice(`Reason: ${reason || 'No matching terms found'}`);

              if (!shouldAddLabel) {
                core.notice(`No matching terms found for label "${labelName}".`);
                return false;
              }

              // Avoid a redundant API call when the issue already has the label.
              const existingLabels = new Set(issue.labels.map(l => l.name));
              if (existingLabels.has(labelName)) {
                core.notice(`Label "${labelName}" already present.`);
                return false;
              }

              await github.rest.issues.addLabels({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                labels: [labelName],
              });
              core.notice(`Label "${labelName}" added. ${reason}`);
              return true;
            }
            // Start one processLabel call per configured rule without awaiting them
            // individually, wait for all of them to settle, then report how many
            // resolved to true (i.e. how many labels were newly added).
            const processLabels = [];
            for (const [labelName, config] of Object.entries(labelConfig)) {
              processLabels.push(processLabel(labelName, config));
            }
            const labelsAdded = await Promise.all(processLabels);
            const numLabelsAdded = labelsAdded.filter(Boolean).length;
            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
--- a/.github/workflows/lint-and-deploy.yaml
+++ b/.github/workflows/lint-and-deploy.yaml
@ -0,0 +1,89 @@
 name: Lint and Deploy Charts
 on: pull_request
 concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
 permissions:
  contents: read
 jobs:
  lint-and-deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0
      - name: Set up Helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
        with:
          version: v3.14.4
       # Python is required because `ct lint` runs Yamale and yamllint, both of which need Python.
      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.13'
      - name: Set up chart-testing
        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
        with:
          version: v3.10.1
      - name: Run chart-testing (lint)
        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm
      - name: Setup minio
        run: |
          docker network create vllm-net
          docker run -d -p 9000:9000 --name minio --net vllm-net \
                     -e "MINIO_ACCESS_KEY=minioadmin" \
                     -e "MINIO_SECRET_KEY=minioadmin" \
                     -v /tmp/data:/data \
                     -v /tmp/config:/root/.minio \
                     minio/minio server /data
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          export AWS_EC2_METADATA_DISABLED=true
          mkdir opt-125m
          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
      - name: Create kind cluster
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
      - name: Build the Docker image vllm cpu
        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .
      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
          docker pull amazon/aws-cli:2.6.4
          kind load docker-image  amazon/aws-cli:2.6.4 --name chart-testing
          kind load docker-image vllm-cpu-env:latest --name chart-testing
          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
          kubectl create ns ns-vllm
      - name: Run chart-testing (install)
        run: |
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
          sleep 10
          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
                  --header "Content-Type: application/json" \
                  --data '{
                          "model": "opt-125m",
                          "prompt": "San Francisco is a",
                          "max_tokens": 7,
                          "temperature": 0
                  }'):$CODE"
          echo "$CODE"
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@ -17,7 +17,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
      with:
        python-version: "3.12"
    - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -0,0 +1,111 @@
 # This workflow uploads a Python package as a release asset.
 # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
 name: Create Release
 on:
  push:
    tags:
      - v*
 # Needed to create release and upload assets
 permissions:
  contents: write
 jobs:
  release:
    # Retrieve tag and create release
    name: Create Release
    runs-on: ubuntu-latest
    outputs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      - name: Extract branch info
        shell: bash
        run: |
          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"
      - name: Create Release
        id: create_release
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          script: |
            const script = require('.github/workflows/scripts/create_release.js')
            await script(github, context, core)
  # NOTE(simon): Wheels are no longer built with GitHub Actions. See Buildkite's release workflow.
  # wheel:
  #   name: Build Wheel
  #   runs-on: ${{ matrix.os }}
  #   needs: release
  #   strategy:
  #     fail-fast: false
  #     matrix:
  #         os: ['ubuntu-20.04']
  #         python-version: ['3.9', '3.10', '3.11', '3.12']
  #         pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
  #         cuda-version: ['11.8', '12.1']
  #   steps:
  #     - name: Checkout
  #       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
  #     - name: Setup ccache
  #       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
  #       with:
  #         create-symlink: true
  #         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
  #     - name: Set up Linux Env
  #       if: ${{ runner.os == 'Linux' }}
  #       run: |
  #         bash -x .github/workflows/scripts/env.sh
  #     - name: Set up Python
  #       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
  #       with:
  #           python-version: ${{ matrix.python-version }}
  #     - name: Install CUDA ${{ matrix.cuda-version }}
  #       run: |
  #         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
  #     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
  #       run: |
  #         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
  #     - name: Build wheel
  #       shell: bash
  #       env:
  #         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
  #       run: |
  #         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
  #         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
  #         asset_name=${wheel_name//"linux"/"manylinux1"}
  #         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
  #         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"
  #     - name: Upload Release Asset
  #       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
  #       env:
  #         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  #       with:
  #         upload_url: ${{ needs.release.outputs.upload_url }}
  #         asset_path: ./dist/${{ env.wheel_name }}
  #         asset_name: ${{ env.asset_name }}
  #         asset_content_type: application/*
      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
      # - name: Publish package
      #   uses: pypa/gh-action-pypi-publish@release/v1.8
      #   with:
      #     repository-url: https://test.pypi.org/legacy/
      #     password: ${{ secrets.PYPI_API_TOKEN }}
      #     skip-existing: true
--- a/.github/workflows/reminder_comment.yml
+++ b/.github/workflows/reminder_comment.yml
@ -9,46 +9,19 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Remind to run full CI on PR
-        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        with:
          script: |
-            try {
+            github.rest.issues.createComment({
-              // Get the PR author
+              owner: context.repo.owner,
-              const prAuthor = context.payload.pull_request.user.login;
+              repo: context.repo.repo,
-              
+              issue_number: context.issue.number,
-              // Check if this is the author's first PR in this repository
+              body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-              // Use GitHub's search API to find all PRs by this author
+                '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-              const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
+                'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
-                q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
+                'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-                per_page: 100  
+                'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-              });
+                '🚀'
-              
+            })
              const authorPRCount = searchResults.total_count;
              console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
              // Only post comment if this is the first PR (only one PR by this author)
              if (authorPRCount === 1) {
                console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
                await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
                  '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
                  'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
                  'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
                  'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
                  'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
                  'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
                  '🚀'
                });
              } else {
                console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
              }
            } catch (error) {
              console.error('Error checking PR history or posting comment:', error);
              // Don't fail the workflow, just log the error
            }
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -13,7 +13,7 @@ jobs:
      actions: write
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
--- a/.gitignore
+++ b/.gitignore
@ -4,7 +4,7 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
-# triton jit
+# triton jit 
 .triton
 # Byte-compiled / optimized / DLL files
@ -177,14 +177,6 @@ cython_debug/
 # VSCode
 .vscode/
 # Claude
 CLAUDE.md
 .claude/
 # Codex
 AGENTS.md
 .codex/
 # DS Store
 .DS_Store
@ -217,4 +209,4 @@ shellcheck*/
 csrc/moe/marlin_moe_wna16/kernel_*
 # Ignore ep_kernels_workspace folder
-ep_kernels_workspace/
+ep_kernels_workspace/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -21,7 +21,7 @@ repos:
  - id: ruff-format
    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.35.5
+  rev: v1.34.0
  hooks:
  - id: typos
 - repo: https://github.com/PyCQA/isort
--- a/.yapfignore
+++ b/.yapfignore
@ -1,2 +1 @@
 collect_env.py
 vllm/model_executor/layers/fla/ops/*.py
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -30,7 +30,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
@ -45,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
 #
 # Try to find python package with an executable that exactly matches
@ -357,7 +357,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
    list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
    set(MARLIN_SRCS
       "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
       "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
       "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin.cu"
       "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
       "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
@ -541,7 +543,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
@ -560,7 +561,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
    set(SRCS
      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
      "csrc/quantization/fp4/nvfp4_experts_quant.cu"
      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
      "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
@ -752,33 +752,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                     "found in CUDA target architectures")
    endif()
  endif()
  # Only build W4A8 kernels if we are building for something compatible with sm90a
  cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
    set(SRCS
       "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
    set_gencode_flags_for_srcs(
      SRCS "${SRCS}"
      CUDA_ARCHS "${W4A8_ARCHS}")
    list(APPEND VLLM_EXT_SRC "${SRCS}")
    message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
  else()
    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
        AND W4A8_ARCHS)
      message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
                     "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                     "later if you intend on running w4a16 quantized models on "
                     "Hopper.")
    else()
      message(STATUS "Not building W4A8 kernels as no compatible archs "
                     "found in CUDA target architectures")
    endif()
  endif()
 # if CUDA endif
 endif()
@ -819,9 +792,7 @@ set(VLLM_MOE_EXT_SRC
  "csrc/moe/topk_softmax_kernels.cu")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  list(APPEND VLLM_MOE_EXT_SRC
+  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
    "csrc/moe/moe_wna16.cu"
    "csrc/moe/grouped_topk_kernels.cu")
 endif()
 if(VLLM_GPU_LANG STREQUAL "CUDA")
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -2,6 +2,7 @@ include LICENSE
 include requirements/common.txt
 include requirements/cuda.txt
 include requirements/rocm.txt
 include requirements/neuron.txt
 include requirements/cpu.txt
 include CMakeLists.txt
--- a/README.md
+++ b/README.md
@ -14,25 +14,18 @@ Easy, fast, and cheap LLM serving for everyone
 | <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
 </p>
 ---
 Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
 ---
 *Latest News* 🔥
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
+- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
+- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
 - [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
 <details>
 <summary>Previous News</summary>
 - [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
 - [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
 - [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
 - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
 - [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
 - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
--- a/SECURITY.md
+++ b/SECURITY.md
@ -42,9 +42,4 @@ For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we ma
 * If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
 * Organizations and vendors who either ship or use vLLM, are eligible to join the prenotification group if they meet at least one of the following qualifications
    * Substantial internal deployment leveraging the upstream vLLM project.
    * Established internal security teams and comprehensive compliance measures.
    * Active and consistent contributions to the upstream vLLM project.
 * We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@ -32,14 +32,6 @@ become available.
        <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
        <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
      </td>
    </tr>
        <tr>
      <td><strong>ShareGPT4Video (Video)</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td>
        <code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
      </td>
    </tr>
    <tr>
      <td><strong>BurstGPT</strong></td>
@ -59,12 +51,6 @@ become available.
      <td style="text-align: center;">✅</td>
      <td><code>synthetic</code></td>
    </tr>
    <tr>
      <td><strong>RandomMultiModal (Image/Video)</strong></td>
      <td style="text-align: center;">🟡</td>
      <td style="text-align: center;">🚧</td>
      <td><code>synthetic</code> </td>
    </tr>
    <tr>
      <td><strong>Prefix Repetition</strong></td>
      <td style="text-align: center;">✅</td>
@ -95,24 +81,6 @@ become available.
      <td style="text-align: center;">✅</td>
      <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
    </tr>
    <tr>
      <td><strong>HuggingFace-MTBench</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>philschmid/mt-bench</code></td>
    </tr>
    <tr>
      <td><strong>HuggingFace-Blazedit</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>vdaita/edit_5k_char</code>, <code>vdaita/edit_10k_char</code></td>
    </tr>
    <tr>
      <td><strong>Spec Bench</strong></td>
      <td style="text-align: center;">✅</td>
      <td style="text-align: center;">✅</td>
      <td><code>wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl</code></td>
    </tr>
    <tr>
      <td><strong>Custom</strong></td>
      <td style="text-align: center;">✅</td>
@ -128,12 +96,7 @@ become available.
 🚧: to be supported
-**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`.
+**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
 For local `dataset-path`, please set `hf-name` to its Hugging Face ID like
 ```bash
 --dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
 ```
 ## 🚀 Example - Online Benchmark
@ -231,7 +194,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
  --backend openai-chat \
  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -257,43 +219,6 @@ vllm bench serve \
    --num-prompts 2048
 ```
 ### Spec Bench Benchmark with Speculative Decoding
 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
    --speculative-config $'{"method": "ngram",
    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
    "prompt_lookup_min": 2}'
 ```
 [SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
 Run all categories:
 ``` bash
 # Download the dataset using:
 # wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
 vllm bench serve \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name spec_bench \
    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
    --num-prompts -1
 ```
 Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
 Run only a specific category like "summarization":
 ``` bash
 vllm bench serve \
    --model meta-llama/Meta-Llama-3-8B-Instruct \
    --dataset-name spec_bench \
    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
    --num-prompts -1 \
    --spec-bench-category "summarization"
 ```
 ### Other HuggingFaceDataset Examples
 ```bash
@ -305,7 +230,6 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
 ```bash
 vllm bench serve \
  --backend openai-chat \
  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -320,7 +244,6 @@ vllm bench serve \
 ```bash
 vllm bench serve \
  --backend openai-chat \
  --endpoint-type openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
@ -350,18 +273,6 @@ vllm bench serve \
    --num-prompts 80
 ```
 `vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
 ``` bash
 vllm bench serve \
    --model Qwen/QwQ-32B \
    --dataset-name hf \
    --dataset-path vdaita/edit_5k_char \
    --num-prompts 90 \
    --blazedit-min-distance 0.01 \
    --blazedit-max-distance 0.99
 ```
 ### Running With Sampling Parameters
 When using OpenAI-compatible backends such as `vllm`, optional sampling
@ -698,7 +609,7 @@ vllm bench serve \
  --prefix-repetition-prefix-len 512 \
  --prefix-repetition-suffix-len 128 \
  --prefix-repetition-num-prefixes 5 \
-  --prefix-repetition-output-len 128
+  --prefix-repetition-output-len 128 
 ```
 </details>
@ -761,7 +672,7 @@ python -m vllm.entrypoints.openai.api_server \
 Send requests with images:
 ```bash
-vllm bench serve \
+python benchmarks/benchmark_serving.py \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dataset-name sharegpt \
@ -773,102 +684,4 @@ vllm bench serve \
  --endpoint /v1/chat/completions
 ```
 ### Videos (ShareGPT4Video)
 Start vLLM:
 ```bash
 python -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dtype bfloat16 \
  --limit-mm-per-prompt '{"video": 1}' \
  --allowed-local-media-path /path/to/sharegpt4video/videos
 ```
 Send requests with videos:
 ```bash
 vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-7B-Instruct \
  --dataset-name sharegpt \
  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
  --num-prompts 100 \
  --save-result \
  --result-dir ~/vllm_benchmark_results \
  --save-detailed \
  --endpoint /v1/chat/completions
 ```
 ### Synthetic Random Images (random-mm)
 Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
 Notes:
 - Works only with the online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
 - Video sampling is not yet implemented.
 Start the server (example):
 ```bash
 vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
  --dtype bfloat16 \
  --max-model-len 16384 \
  --limit-mm-per-prompt '{"image": 3, "video": 0}' \
  --mm-processor-kwargs max_pixels=1003520
 ```
 Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
 Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
 ```bash
 vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2.5-VL-3B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name random-mm \
  --num-prompts 100 \
  --max-concurrency 10 \
  --random-prefix-len 25 \
  --random-input-len 300 \
  --random-output-len 40 \
  --random-range-ratio 0.2 \
  --random-mm-base-items-per-request 2 \
  --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
  --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
  --request-rate inf \
  --ignore-eos \
  --seed 42
 ```
 The number of items per request can be controlled by passing multiple image buckets:
 ```bash
  --random-mm-base-items-per-request 2 \
  --random-mm-num-mm-items-range-ratio 0.5 \
  --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
  --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
 ```
 Flags specific to `random-mm`:
 - `--random-mm-base-items-per-request`: base number of multimodal items per request.
 - `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
 - `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
 - `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
 Behavioral notes:
 - If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
 How sampling works:
 - Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
 - For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
 - If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing.
 This should be seen as an edge case, and this behavior can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number. Note that doing so might result in errors due to the engine config `--limit-mm-per-prompt`.
 - The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
 </details>
--- a/benchmarks/auto_tune/README.md
+++ b/benchmarks/auto_tune/README.md
@ -31,12 +31,6 @@ cd vllm
 You must set the following variables at the top of the script before execution.
   Note: You can also override the default values below via environment variables when running the script.
 ```bash
 MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
 ```
 | Variable | Description | Example Value |
 | --- | --- | --- |
 | `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
--- a/benchmarks/auto_tune/auto_tune.sh
+++ b/benchmarks/auto_tune/auto_tune.sh
@ -5,41 +5,25 @@
 TAG=$(date +"%Y_%m_%d_%H_%M")
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
+BASE="$SCRIPT_DIR/../../.."
-BASE=${BASE:-"$SCRIPT_DIR/../../.."}
+MODEL="meta-llama/Llama-3.1-8B-Instruct"
-MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
+SYSTEM="TPU"
-SYSTEM=${SYSTEM:-"TPU"}
+TP=1
-TP=${TP:-1}
+DOWNLOAD_DIR=""
-DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
+INPUT_LEN=4000
-INPUT_LEN=${INPUT_LEN:-4000}
+OUTPUT_LEN=16
-OUTPUT_LEN=${OUTPUT_LEN:-16}
+MAX_MODEL_LEN=4096
-MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
+MIN_CACHE_HIT_PCT=0
-MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
+MAX_LATENCY_ALLOWED_MS=100000000000
-MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
+NUM_SEQS_LIST="128 256"
-NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
+NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
 NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
 LOG_FOLDER="$BASE/auto-benchmark/$TAG"
 RESULT="$LOG_FOLDER/result.txt"
 PROFILE_PATH="$LOG_FOLDER/profile"
-echo "====================== AUTO TUNE PARAMETERS ===================="
+echo "result file: $RESULT"
-echo "SCRIPT_DIR=$SCRIPT_DIR"
+echo "model: $MODEL"
 echo "BASE=$BASE"
 echo "MODEL=$MODEL"
 echo "SYSTEM=$SYSTEM"
 echo "TP=$TP"
 echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
 echo "INPUT_LEN=$INPUT_LEN"
 echo "OUTPUT_LEN=$OUTPUT_LEN"
 echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
 echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
 echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
 echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
 echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
 echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
 echo "RESULT_FILE=$RESULT"
 echo "====================== AUTO TUNEPARAMETERS ===================="
 rm -rf $LOG_FOLDER
 rm -rf $PROFILE_PATH
@ -229,7 +213,7 @@ run_benchmark() {
    pkill -if vllm
    sleep 10
-    echo "===================="
+    printf '=%.0s' $(seq 1 20)
    return 0
 }
--- a/benchmarks/benchmark_block_pool.py
+++ b/benchmarks/benchmark_block_pool.py
@ -57,7 +57,7 @@ def invoke_main() -> None:
        "--num-iteration",
        type=int,
        default=1000,
-        help="Number of iterations to run to stabilize final data readings",
+        help="Number of iterations to run to stablize final data readings",
    )
    parser.add_argument(
        "--allocate-blocks",
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@ -293,41 +293,6 @@ def process_image(image: Any) -> Mapping[str, Any]:
    )
 def process_video(video: Any) -> Mapping[str, Any]:
    """
    Process a single video input and return a multimedia content dictionary.
    Supports the following input types:
    1. Dictionary with raw video bytes: a dict with a 'bytes' key containing
       raw video data. The bytes are base64-encoded and embedded in a
       "data:video/mp4;base64,..." URL.
    2. String input: treated as a URL or local file path. "file://" is
       prepended unless the string already starts with "http://",
       "https://" or "file://".
    Returns:
        A dictionary of the form
        {"type": "video_url", "video_url": {"url": <url>}}.
    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, dict) and "bytes" in video:
        video_bytes = video["bytes"]
        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
        return {
            "type": "video_url",
            "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
        }
    if isinstance(video, str):
        # "https://" has to be listed explicitly: an HTTPS URL does not start
        # with "http://" and would otherwise be rewritten to "file://https://...".
        video_url = (
            video
            if video.startswith(("http://", "https://", "file://"))
            else f"file://{video}"
        )
        return {"type": "video_url", "video_url": {"url": video_url}}
    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
    )
 # -----------------------------------------------------------------------------
 # Random Dataset Implementation (Synthetic Data)
 # -----------------------------------------------------------------------------
@ -403,7 +368,7 @@ class RandomDataset(BenchmarkDataset):
            # [6880, 6881] -> ['Ġcalls', 'here'] ->
            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
            # To avoid uncontrolled change of the prompt length,
-            # the encoded sequence is truncated before being decoded again.
+            # the encoded sequence is truncated before being decode again.
            total_input_len = prefix_len + int(input_lens[i])
            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
                :total_input_len
@ -486,10 +451,9 @@ class ShareGPTDataset(BenchmarkDataset):
                skip_min_output_len_check=output_len is not None,
            ):
                continue
            # TODO: Also support ShareGPT4Video.
            if image_path := entry.get("image"):
                mm_content = process_image(image_path)
            elif video_path := entry.get("video"):
                mm_content = process_video(video_path)
            else:
                mm_content = None
            if enable_multimodal_chat:
@ -958,10 +922,8 @@ class InstructCoderDataset(HuggingFaceDataset):
        for i, item in enumerate(self.data):
            if len(sampled_requests) >= num_requests:
                break
-            prompt = (
+            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
-                f"{item['input']}\n\n{item['instruction']} Just output "
+            the code, do not include any explanation."
                "the code, do not include any explanation."
            )
            # apply template
            prompt = tokenizer.apply_chat_template(
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -1,17 +1,191 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import sys
+"""Benchmark the latency of processing a single batch of requests."""
 import argparse
 import dataclasses
 import json
 import os
 import time
 from typing import Any, Optional
 import numpy as np
 from tqdm import tqdm
 from typing_extensions import deprecated
 import vllm.envs as envs
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
 def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
 ) -> None:
    """
    Convert the latency results to PyTorch benchmark records and save them.
    The records are written to the path of `args.output_json` with its
    extension replaced by ".pytorch.json". Nothing is written when the
    conversion yields no records.
    """
    latency_metrics = {"latency": results["latencies"]}
    summary = {}
    for field in ("avg_latency", "percentiles"):
        summary[field] = results[field]
    records = convert_to_pytorch_benchmark_format(
        args=args, metrics=latency_metrics, extra_info=summary
    )
    if not records:
        return
    stem = os.path.splitext(args.output_json)[0]
    write_to_json(f"{stem}.pytorch.json", records)
@deprecated(
    "benchmark_latency.py is deprecated and will be removed in a "
    "future version. Please use 'vllm bench latency' instead.",
 )
 def main(args: argparse.Namespace):
    """
    Measure the end-to-end latency of generating one fixed-size batch.
    Builds an LLM from the CLI engine arguments, runs warmup iterations on a
    batch of random token-id prompts, and then either profiles a single run
    (when `args.profile` is set) or times `args.num_iters` runs, prints the
    average and percentile latencies, and optionally writes them to
    `args.output_json`.
    """
    print(args)
    engine_args = EngineArgs.from_cli_args(args)
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(**dataclasses.asdict(engine_args))
    assert llm.llm_engine.model_config.max_model_len >= (
        args.input_len + args.output_len
    ), (
        "Please ensure that max_model_len is greater than"
        " the sum of input_len and output_len."
    )
    # ignore_eos together with max_tokens makes every sequence generate
    # exactly args.output_len tokens.
    sampling_params = SamplingParams(
        n=args.n,
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=args.output_len,
        detokenize=not args.disable_detokenize,
    )
    print(sampling_params)
    # Random token ids in [0, 10000), one row of input_len tokens per prompt.
    dummy_prompt_token_ids = np.random.randint(
        10000, size=(args.batch_size, args.input_len)
    )
    dummy_prompts: list[PromptType] = [
        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
    ]
    def llm_generate():
        # Run one generation pass over the dummy batch, using either regular
        # sampling or beam search depending on --use-beam-search.
        if not args.use_beam_search:
            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
        else:
            llm.beam_search(
                dummy_prompts,
                BeamSearchParams(
                    beam_width=args.n,
                    max_tokens=args.output_len,
                    ignore_eos=True,
                ),
            )
    def run_to_completion(profile_dir: Optional[str] = None):
        # profile_dir only acts as a switch here: when it is truthy the run is
        # wrapped in start_profile()/stop_profile() and nothing is returned;
        # otherwise the elapsed wall-clock time in seconds is returned.
        if profile_dir:
            llm.start_profile()
            llm_generate()
            llm.stop_profile()
        else:
            start_time = time.perf_counter()
            llm_generate()
            end_time = time.perf_counter()
            latency = end_time - start_time
            return latency
    print("Warming up...")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        run_to_completion(profile_dir=None)
    if args.profile:
        # Profiling mode: run a single profiled pass and skip the timed loop.
        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
        print(f"Profiling (results will be saved to '{profile_dir}')...")
        run_to_completion(profile_dir=profile_dir)
        return
    # Benchmark.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
        latencies.append(run_to_completion(profile_dir=None))
    latencies = np.array(latencies)
    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    print(f"Avg latency: {np.mean(latencies)} seconds")
    for percentage, percentile in zip(percentages, percentiles):
        print(f"{percentage}% percentile latency: {percentile} seconds")
    # Output JSON results if specified
    if args.output_json:
        results = {
            "avg_latency": np.mean(latencies),
            "latencies": latencies.tolist(),
            "percentiles": dict(zip(percentages, percentiles.tolist())),
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
        save_to_pytorch_benchmark_format(args, results)
 def create_argument_parser():
    """
    Build the command-line parser for the latency benchmark.
    Registers the benchmark-specific options (batch shape, iteration counts,
    profiling, JSON output, detokenization) and then adds the engine options
    via `EngineArgs.add_cli_args`.
    Returns:
        The configured parser, with prefix caching disabled by default.
    """
    parser = FlexibleArgumentParser(
        description="Benchmark the latency of processing a single batch of "
        "requests till completion."
    )
    # Shape of the synthetic batch: prompt length, generation length and
    # number of prompts.
    parser.add_argument("--input-len", type=int, default=32)
    parser.add_argument("--output-len", type=int, default=128)
    parser.add_argument("--batch-size", type=int, default=8)
    parser.add_argument(
        "--n",
        type=int,
        default=1,
        help="Number of generated sequences per prompt.",
    )
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument(
        "--num-iters-warmup",
        type=int,
        default=10,
        help="Number of iterations to run for warmup.",
    )
    parser.add_argument(
        "--num-iters", type=int, default=30, help="Number of iterations to run."
    )
    parser.add_argument(
        "--profile",
        action="store_true",
        help="profile the generation process of a single batch",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the latency results in JSON format.",
    )
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help=(
            "Do not detokenize responses (i.e. do not include "
            "detokenization time in the latency measurement)"
        ),
    )
    # Engine options are appended after the benchmark-specific ones.
    parser = EngineArgs.add_cli_args(parser)
    # V1 enables prefix caching by default which skews the latency
    # numbers. We need to disable prefix caching by default.
    parser.set_defaults(enable_prefix_caching=False)
    return parser
 if __name__ == "__main__":
-    print("""DEPRECATED: This script has been moved to the vLLM CLI.
+    parser = create_argument_parser()
-
+    args = parser.parse_args()
-Please use the following command instead:
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
-    vllm bench latency
+        raise OSError(
-
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-For help with the new command, run:
+            "Please set it to a valid path to use torch profiler."
-    vllm bench latency --help
+        )
-
+    main(args)
 Alternatively, you can run the new command directly with:
    python -m vllm.entrypoints.cli.main bench latency --help
 """)
    sys.exit(1)
--- a/benchmarks/benchmark_ngram_proposer.py
+++ b/benchmarks/benchmark_ngram_proposer.py
@ -77,7 +77,7 @@ def invoke_main() -> None:
        "--num-iteration",
        type=int,
        default=100,
-        help="Number of iterations to run to stabilize final data readings",
+        help="Number of iterations to run to stablize final data readings",
    )
    parser.add_argument(
        "--num-req", type=int, default=128, help="Number of requests in the batch"
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@ -998,7 +998,7 @@ def create_argument_parser():
        "--percentile-metrics",
        type=str,
        default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentiles. "
+        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
        'Default value is "ttft,tpot,itl".',
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -1,17 +1,742 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import sys
+"""Benchmark offline inference throughput."""
 import argparse
 import dataclasses
 import json
 import os
 import random
 import time
 import warnings
 from typing import Any, Optional, Union
 import torch
 import uvloop
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
 from typing_extensions import deprecated
 from benchmark_dataset import (
    AIMODataset,
    BurstGPTDataset,
    ConversationDataset,
    InstructCoderDataset,
    RandomDataset,
    SampleRequest,
    ShareGPTDataset,
    SonnetDataset,
    VisionArenaDataset,
 )
 from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
 )
 from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
 def run_vllm(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
 ) -> tuple[float, Optional[list[RequestOutput]]]:
    """
    Time an offline `LLM.generate` pass over `requests`.
    Each request becomes a token-id prompt when its prompt carries
    "prompt_token_ids" and a text prompt otherwise, paired with sampling
    parameters that force `expected_output_len` generated tokens.
    Returns:
        A tuple of the elapsed seconds and the generation outputs. The
        outputs stay None on the beam-search path, which is currently
        switched off inside this function.
    """
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        req.prompt_len + req.expected_output_len
        <= llm.llm_engine.model_config.max_model_len
        for req in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of"
        " prompt_len and expected_output_len for all requests."
    )
    # Translate every sampled request into an engine prompt plus its
    # per-request sampling parameters.
    prompts: list[Union[TextPrompt, TokensPrompt]] = []
    sampling_params: list[SamplingParams] = []
    for req in requests:
        if "prompt_token_ids" in req.prompt:
            engine_prompt = TokensPrompt(
                prompt_token_ids=req.prompt["prompt_token_ids"],
                multi_modal_data=req.multi_modal_data,
            )
        else:
            engine_prompt = TextPrompt(
                prompt=req.prompt, multi_modal_data=req.multi_modal_data
            )
        prompts.append(engine_prompt)
        params = SamplingParams(
            n=n,
            temperature=1.0,
            top_p=1.0,
            ignore_eos=True,
            max_tokens=req.expected_output_len,
            detokenize=not disable_detokenize,
        )
        sampling_params.append(params)
    lora_requests: Optional[list[LoRARequest]] = None
    if engine_args.enable_lora:
        lora_requests = [req.lora_request for req in requests]
    use_beam_search = False
    outputs = None
    if use_beam_search:
        assert lora_requests is None, "BeamSearch API does not support LoRA"
        prompts = [req.prompt for req in requests]
        # output_len should be the same for all requests.
        output_len = requests[0].expected_output_len
        for req in requests:
            assert req.expected_output_len == output_len
        beam_params = BeamSearchParams(
            beam_width=n,
            max_tokens=output_len,
            ignore_eos=True,
        )
        start = time.perf_counter()
        llm.beam_search(prompts, beam_params)
        end = time.perf_counter()
    else:
        start = time.perf_counter()
        outputs = llm.generate(
            prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
        )
        end = time.perf_counter()
    elapsed = end - start
    return elapsed, outputs
 def run_vllm_chat(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
 ) -> tuple[float, list[RequestOutput]]:
    """
    Time a full `LLM.chat` pass over `requests`.
    This entry point is recommended ONLY for benchmarking multimodal models,
    since it properly handles multimodal inputs and chat formatting. Use
    run_vllm() for non-multimodal models.
    Returns:
        A tuple of the elapsed seconds and the chat outputs.
    """
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        req.prompt_len + req.expected_output_len
        <= llm.llm_engine.model_config.max_model_len
        for req in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of "
        "prompt_len and expected_output_len for all requests."
    )
    prompts = [req.prompt for req in requests]
    # One SamplingParams per request so that each one generates exactly its
    # expected number of output tokens.
    sampling_params: list[SamplingParams] = [
        SamplingParams(
            n=n,
            temperature=1.0,
            top_p=1.0,
            ignore_eos=True,
            max_tokens=req.expected_output_len,
            detokenize=not disable_detokenize,
        )
        for req in requests
    ]
    began = time.perf_counter()
    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
    finished = time.perf_counter()
    return finished - began, outputs
 async def run_vllm_async(
    requests: list[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
    disable_detokenize: bool = False,
 ) -> float:
    """
    Time generation for `requests` through the async engine client.
    All requests are submitted to the client and their output streams are
    drained concurrently; the streamed results themselves are discarded.
    Returns:
        Elapsed seconds from just before the first request is submitted
        until every output stream is exhausted.
    """
    from vllm import SamplingParams
    async with build_async_engine_client_from_engine_args(
        engine_args,
        disable_frontend_multiprocessing=disable_frontend_multiprocessing,
    ) as llm:
        model_config = await llm.get_model_config()
        assert all(
            model_config.max_model_len
            >= (request.prompt_len + request.expected_output_len)
            for request in requests
        ), (
            "Please ensure that max_model_len is greater than the sum of"
            " prompt_len and expected_output_len for all requests."
        )
        # Add the requests to the engine.
        prompts: list[Union[TextPrompt, TokensPrompt]] = []
        sampling_params: list[SamplingParams] = []
        lora_requests: list[Optional[LoRARequest]] = []
        for request in requests:
            # Use a token-id prompt when the sampled prompt carries
            # "prompt_token_ids", otherwise a text prompt.
            prompts.append(
                TokensPrompt(
                    prompt_token_ids=request.prompt["prompt_token_ids"],
                    multi_modal_data=request.multi_modal_data,
                )
                if "prompt_token_ids" in request.prompt
                else TextPrompt(
                    prompt=request.prompt, multi_modal_data=request.multi_modal_data
                )
            )
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    detokenize=not disable_detokenize,
                )
            )
            lora_requests.append(request.lora_request)
        generators = []
        # The timer starts before the generators are created, so request
        # submission is part of the measured time.
        start = time.perf_counter()
        for i, (prompt, sp, lr) in enumerate(
            zip(prompts, sampling_params, lora_requests)
        ):
            generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        # Drain every stream; only the completion time matters here.
        async for i, res in all_gens:
            pass
        end = time.perf_counter()
        return end - start
 def run_hf(
    requests: list[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
    disable_detokenize: bool = False,
 ) -> float:
    """
    Time generation for `requests` with a HuggingFace causal LM on CUDA.
    Requests are grouped greedily, in order, into padded batches that are
    generated one after another.
    Returns:
        Elapsed seconds covering batching, generation and (unless
        `disable_detokenize` is set) decoding of all requests. Model loading
        happens before the timer starts.
    """
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
    )
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()
    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: list[str] = []
    # Running maxima over the requests currently held in `batch`.
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt = requests[i].prompt
        prompt_len = requests[i].prompt_len
        output_len = requests[i].expected_output_len
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        # Keep accumulating only while the batch has room, more requests
        # remain, and adding the next request keeps the longest prompt plus
        # the longest output within 2048 tokens. Otherwise fall through and
        # flush the batch.
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            next_prompt_len = requests[i + 1].prompt_len
            next_output_len = requests[i + 1].expected_output_len
            if (
                max(max_prompt_len, next_prompt_len)
                + max(max_output_len, next_output_len)
            ) <= 2048:
                # We can add more requests to the batch.
                continue
        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
        # Every sequence in the batch may generate up to the batch-wide
        # maximum output length.
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        if not disable_detokenize:
            # Include the decoding time.
            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))
        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start
 def run_mii(
    requests: list[SampleRequest],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
 ) -> float:
    """
    Time a single `generate` call over all prompts with the MII backend.
    The server is started before the timer and terminated after it, so only
    the generation call is measured.
    Returns:
        Elapsed seconds of the generation call.
    """
    from mii import client, serve
    engine = serve(model, tensor_parallel=tensor_parallel_size)
    prompt_batch = []
    for req in requests:
        prompt_batch.append(req.prompt)
    began = time.perf_counter()
    engine.generate(prompt_batch, max_new_tokens=output_len)
    elapsed = time.perf_counter() - began
    # Shut the server down through a fresh client handle.
    client(model).terminate_server()
    return elapsed
 def save_to_pytorch_benchmark_format(
    args: argparse.Namespace, results: dict[str, Any]
 ) -> None:
    """
    Convert the throughput results to PyTorch benchmark records and save them.
    The records are written to the path of `args.output_json` with its
    extension replaced by ".pytorch.json". Nothing is written when the
    conversion yields no records.
    """
    throughput_metrics = {
        "requests_per_second": [results["requests_per_second"]],
        "tokens_per_second": [results["tokens_per_second"]],
    }
    run_info = {}
    for field in ("elapsed_time", "num_requests", "total_num_tokens"):
        run_info[field] = results[field]
    records = convert_to_pytorch_benchmark_format(
        args=args, metrics=throughput_metrics, extra_info=run_info
    )
    if not records:
        return
    # Don't use json suffix here as we don't want CI to pick it up
    stem = os.path.splitext(args.output_json)[0]
    write_to_json(f"{stem}.pytorch.json", records)
 def get_requests(args, tokenizer):
    """
    Build the dataset selected by the CLI arguments and sample requests from it.
    Args:
        args: Parsed CLI namespace; the dataset is chosen from
            `args.dataset_name` and `args.dataset_path`.
        tokenizer: Tokenizer handed to the dataset's `sample` method.
    Returns:
        Whatever the selected dataset's `sample` method returns.
    Raises:
        ValueError: If the dataset name is unknown, or if the dataset name is
            "hf" and the dataset path is not one of the supported paths.
    """
    # Common parameters for all dataset types.
    common_kwargs = {
        "dataset_path": args.dataset_path,
        "random_seed": args.seed,
    }
    sample_kwargs = {
        "tokenizer": tokenizer,
        "lora_path": args.lora_path,
        "max_loras": args.max_loras,
        "num_requests": args.num_prompts,
        "input_len": args.input_len,
        "output_len": args.output_len,
    }
    # A missing dataset path falls back to the random dataset regardless of
    # the dataset name.
    if args.dataset_path is None or args.dataset_name == "random":
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
        dataset_cls = RandomDataset
    elif args.dataset_name == "sharegpt":
        dataset_cls = ShareGPTDataset
        if args.backend == "vllm-chat":
            sample_kwargs["enable_multimodal_chat"] = True
    elif args.dataset_name == "sonnet":
        assert tokenizer.chat_template or tokenizer.default_chat_template, (
            "Tokenizer/model must have chat template for sonnet dataset."
        )
        dataset_cls = SonnetDataset
        sample_kwargs["prefix_len"] = args.prefix_len
        sample_kwargs["return_prompt_formatted"] = True
    elif args.dataset_name == "burstgpt":
        dataset_cls = BurstGPTDataset
    elif args.dataset_name == "hf":
        common_kwargs["no_stream"] = args.no_stream
        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = VisionArenaDataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = InstructCoderDataset
            common_kwargs["dataset_split"] = "train"
        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = ConversationDataset
            common_kwargs["dataset_subset"] = args.hf_subset
            common_kwargs["dataset_split"] = args.hf_split
            sample_kwargs["enable_multimodal_chat"] = True
        elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
            dataset_cls = AIMODataset
            common_kwargs["dataset_subset"] = None
            common_kwargs["dataset_split"] = "train"
        else:
            # Without this branch dataset_cls would stay unbound and the
            # final call would fail with an UnboundLocalError.
            raise ValueError(
                f"Unsupported dataset path for the hf dataset: {args.dataset_path}"
            )
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values
    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
    return dataset_cls(**common_kwargs).sample(**sample_kwargs)
@deprecated(
    "benchmark_throughput.py is deprecated and will be removed in a "
    "future version. Please use 'vllm bench throughput' instead.",
)
def main(args: argparse.Namespace):
    """Run the throughput benchmark selected by ``args`` and report it.

    The function samples requests with ``get_requests``, dispatches them to
    the backend named by ``args.backend``, prints request/token throughput,
    and, when ``args.output_json`` is set, writes the summary to that path
    and forwards it to ``save_to_pytorch_benchmark_format``.

    Args:
        args: Parsed command-line namespace. ``args.seed`` is set to 0 in
            place when it is ``None``.

    Raises:
        ValueError: If ``args.backend`` is not one of the handled backends.
    """
    # Default the seed to 0 when none was supplied.
    if args.seed is None:
        args.seed = 0
    print(args)
    random.seed(args.seed)

    # Build the workload.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code
    )
    requests = get_requests(args, tokenizer)
    is_multi_modal = any(req.multi_modal_data is not None for req in requests)

    # Only the synchronous vllm and vllm-chat paths hand back request outputs.
    request_outputs: Optional[list[RequestOutput]] = None
    backend = args.backend
    if backend == "vllm" and args.async_engine:
        elapsed_time = uvloop.run(
            run_vllm_async(
                requests,
                args.n,
                AsyncEngineArgs.from_cli_args(args),
                args.disable_frontend_multiprocessing,
                args.disable_detokenize,
            )
        )
    elif backend == "vllm":
        elapsed_time, request_outputs = run_vllm(
            requests,
            args.n,
            EngineArgs.from_cli_args(args),
            args.disable_detokenize,
        )
    elif backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(
            requests,
            args.model,
            tokenizer,
            args.n,
            args.hf_max_batch_size,
            args.trust_remote_code,
            args.disable_detokenize,
        )
    elif backend == "mii":
        elapsed_time = run_mii(
            requests, args.model, args.tensor_parallel_size, args.output_len
        )
    elif backend == "vllm-chat":
        elapsed_time, request_outputs = run_vllm_chat(
            requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
        )
    else:
        raise ValueError(f"Unknown backend: {args.backend}")

    if request_outputs:
        # Note: with the vllm and vllm-chat backends,
        # we have request_outputs, which we use to count tokens.
        counted = [ro for ro in request_outputs if isinstance(ro, RequestOutput)]
        total_prompt_tokens = sum(len(ro.prompt_token_ids or ()) for ro in counted)
        total_output_tokens = sum(
            len(o.token_ids) for ro in counted for o in ro.outputs if o
        )
        total_num_tokens = total_prompt_tokens + total_output_tokens
    else:
        # No outputs available: fall back to the lengths recorded on the
        # sampled requests.
        total_output_tokens = sum(r.expected_output_len for r in requests)
        total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
        total_prompt_tokens = total_num_tokens - total_output_tokens

    if is_multi_modal and args.backend != "vllm-chat":
        print(
            "\033[91mWARNING\033[0m: Multi-modal request with "
            f"{args.backend} backend detected. The "
            "following metrics are not accurate because image tokens are not"
            " counted. See vllm-project/vllm/issues/9778 for details."
        )
        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
        # vllm-chat backend counts the image tokens now

    num_requests = len(requests)
    requests_per_second = num_requests / elapsed_time
    tokens_per_second = total_num_tokens / elapsed_time
    print(
        f"Throughput: {requests_per_second:.2f} requests/s, "
        f"{tokens_per_second:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
    )
    print(f"Total num prompt tokens:  {total_prompt_tokens}")
    print(f"Total num output tokens:  {total_output_tokens}")

    # Output JSON results if specified
    if args.output_json:
        results = {
            "elapsed_time": elapsed_time,
            "num_requests": num_requests,
            "total_num_tokens": total_num_tokens,
            "requests_per_second": requests_per_second,
            "tokens_per_second": tokens_per_second,
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
        save_to_pytorch_benchmark_format(args, results)
 def validate_args(args):
    """
    Validate command-line arguments.

    Mutates ``args`` in place: copies the legacy ``--dataset`` value into
    ``dataset_path``, defaults ``tokenizer`` to ``model``, and switches
    ``dataset_name`` to ``"random"`` when no dataset path was given.

    Emits a warning for options that are ignored by the selected dataset.

    Raises:
        ValueError: On an unsupported backend, a missing ``input_len`` for
            the random dataset, an unsupported hf dataset path, invalid
            LoRA / backend-specific combinations, or
            ``data_parallel_size > 1``.
        AssertionError: When an hf dataset is paired with the wrong backend.
    """
    # === Deprecation and Defaulting ===
    if args.dataset is not None:
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next release. "
            "Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2,
        )
        args.dataset_path = args.dataset
    if not getattr(args, "tokenizer", None):
        args.tokenizer = args.model
    # === Backend Validation ===
    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
    if args.backend not in valid_backends:
        raise ValueError(f"Unsupported backend: {args.backend}")
    # === Dataset Configuration ===
    if not args.dataset and not args.dataset_path:
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
        if args.input_len is None:
            raise ValueError("input_len must be provided for a random dataset")
    # === Dataset Name Specific Checks ===
    # --hf-subset and --hf-split: only used
    # when dataset_name is 'hf'
    if args.dataset_name != "hf" and (
        getattr(args, "hf_subset", None) is not None
        or getattr(args, "hf_split", None) is not None
    ):
        # Adjacent literals instead of a backslash continuation: the latter
        # embeds the source indentation in the emitted message.
        warnings.warn(
            "--hf-subset and --hf-split will be ignored "
            "since --dataset-name is not 'hf'.",
            stacklevel=2,
        )
    elif args.dataset_name == "hf":
        if args.dataset_path in (
            VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
            | ConversationDataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm-chat", (
                f"{args.dataset_path} needs to use vllm-chat as the backend."
            )  # noqa: E501
        elif args.dataset_path in (
            InstructCoderDataset.SUPPORTED_DATASET_PATHS
            | AIMODataset.SUPPORTED_DATASET_PATHS
        ):
            assert args.backend == "vllm", (
                f"{args.dataset_path} needs to use vllm as the backend."
            )  # noqa: E501
        else:
            raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
    # --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn(
            "--random-range-ratio will be ignored since "
            "--dataset-name is not 'random'.",
            stacklevel=2,
        )
    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
    # set.
    if (
        args.dataset_name not in {"random", "sonnet", None}
        and args.prefix_len is not None
    ):
        warnings.warn(
            "--prefix-len will be ignored since --dataset-name "
            "is not 'random', 'sonnet', or not set.",
            stacklevel=2,
        )
    # === LoRA Settings ===
    if getattr(args, "enable_lora", False) and args.backend != "vllm":
        raise ValueError("LoRA benchmarking is only supported for vLLM backend")
    if getattr(args, "enable_lora", False) and args.lora_path is None:
        raise ValueError("LoRA path must be provided when enable_lora is True")
    # === Backend-specific Validations ===
    if args.backend == "hf" and args.hf_max_batch_size is None:
        raise ValueError("HF max batch size is required for HF backend")
    if args.backend != "hf" and args.hf_max_batch_size is not None:
        raise ValueError("HF max batch size is only for HF backend.")
    if (
        args.backend in {"hf", "mii"}
        and getattr(args, "quantization", None) is not None
    ):
        raise ValueError("Quantization is only for vLLM backend.")
    if args.backend == "mii" and args.dtype != "auto":
        raise ValueError("dtype must be auto for MII backend.")
    if args.backend == "mii" and args.n != 1:
        raise ValueError("n must be 1 for MII backend.")
    if args.backend == "mii" and args.tokenizer != args.model:
        raise ValueError("Tokenizer must be the same as the model for MII backend.")
    # --data-parallel is not supported currently.
    # https://github.com/vllm-project/vllm/issues/16222
    if args.data_parallel_size > 1:
        raise ValueError(
            "Data parallel is not supported in offline benchmark, "
            "please use benchmark serving instead"
        )
 def create_argument_parser():
    """Build the command-line parser for the throughput benchmark.

    Registers the benchmark-specific options (backend, dataset selection,
    request sizing, output, LoRA, random/hf dataset knobs) and then lets
    ``AsyncEngineArgs.add_cli_args`` add the engine options.

    Returns:
        The parser returned by ``AsyncEngineArgs.add_cli_args``.
    """
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend",
        type=str,
        choices=["vllm", "hf", "mii", "vllm-chat"],
        default="vllm",
    )
    parser.add_argument(
        "--dataset-name",
        type=str,
        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
        help="Name of the dataset to benchmark on.",
        default="sharegpt",
    )
    parser.add_argument(
        "--no-stream",
        action="store_true",
        help="Do not load the dataset in streaming mode.",
    )
    parser.add_argument(
        "--dataset",
        type=str,
        default=None,
        # Adjacent literals only: a backslash continuation inside the string
        # would embed the source indentation in the help text.
        help="Path to the ShareGPT dataset, will be deprecated in "
        "the next release. The dataset is expected to "
        "be a json in form of list[dict[..., conversations: "
        "list[dict[..., value: <prompt_or_response>]]]]",
    )
    parser.add_argument(
        "--dataset-path", type=str, default=None, help="Path to the dataset"
    )
    parser.add_argument(
        "--input-len",
        type=int,
        default=None,
        help="Input prompt length for each request",
    )
    parser.add_argument(
        "--output-len",
        type=int,
        default=None,
        help="Output length for each request. Overrides the "
        "output length from the dataset.",
    )
    parser.add_argument(
        "--n", type=int, default=1, help="Number of generated sequences per prompt."
    )
    parser.add_argument(
        "--num-prompts", type=int, default=1000, help="Number of prompts to process."
    )
    parser.add_argument(
        "--hf-max-batch-size",
        type=int,
        default=None,
        help="Maximum batch size for HF backend.",
    )
    parser.add_argument(
        "--output-json",
        type=str,
        default=None,
        help="Path to save the throughput results in JSON format.",
    )
    parser.add_argument(
        "--async-engine",
        action="store_true",
        default=False,
        help="Use vLLM async engine rather than LLM class.",
    )
    parser.add_argument(
        "--disable-frontend-multiprocessing",
        action="store_true",
        default=False,
        help="Disable decoupled async engine frontend.",
    )
    parser.add_argument(
        "--disable-detokenize",
        action="store_true",
        help=(
            "Do not detokenize the response (i.e. do not include "
            "detokenization time in the measurement)"
        ),
    )
    # LoRA
    parser.add_argument(
        "--lora-path",
        type=str,
        default=None,
        help="Path to the LoRA adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.",
    )
    parser.add_argument(
        "--prefix-len",
        type=int,
        default=None,
        help="Number of prefix tokens to be used in RandomDataset "
        "and SonnetDataset. For RandomDataset, the total input "
        "length is the sum of prefix-len (default: "
        f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
        "sampled from [input_len * (1 - range_ratio), "
        "input_len * (1 + range_ratio)]. For SonnetDataset, "
        f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
        "controls how much of the input is fixed lines versus "
        "random lines, but the total input length remains approximately "
        "input_len tokens.",
    )
    # random dataset
    parser.add_argument(
        "--random-range-ratio",
        type=float,
        default=None,
        help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
        "for sampling input/output length, "
        "used only for RandomDataset. Must be in the range [0, 1) to "
        "define a symmetric sampling range "
        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
    )
    # hf dataset
    parser.add_argument(
        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
    )
    parser.add_argument(
        "--hf-split", type=str, default=None, help="Split of the HF dataset."
    )
    parser = AsyncEngineArgs.add_cli_args(parser)
    return parser
 if __name__ == "__main__":
-    print("""DEPRECATED: This script has been moved to the vLLM CLI.
+    parser = create_argument_parser()
-
+    args = parser.parse_args()
-Please use the following command instead:
+    if args.tokenizer is None:
-    vllm bench throughput
+        args.tokenizer = args.model
-
+    validate_args(args)
-For help with the new command, run:
+    main(args)
    vllm bench throughput --help
 Alternatively, you can run the new command directly with:
    python -m vllm.entrypoints.cli.main bench throughput --help
 """)
    sys.exit(1)
--- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh
@ -62,7 +62,7 @@ benchmark() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  CUDA_VISIBLE_DEVICES=1 python3 \
@ -72,7 +72,7 @@ benchmark() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  wait_for_server 8100
  wait_for_server 8200
--- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
+++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh
@ -69,7 +69,7 @@ launch_disagg_prefill() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  CUDA_VISIBLE_DEVICES=1 python3 \
    -m vllm.entrypoints.openai.api_server \
@ -78,7 +78,7 @@ launch_disagg_prefill() {
    --max-model-len 10000 \
    --gpu-memory-utilization 0.6 \
    --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+    '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
  wait_for_server 8100
  wait_for_server 8200
--- a/benchmarks/kernels/bench_block_fp8_gemm.py
+++ b/benchmarks/kernels/bench_block_fp8_gemm.py
@ -1,145 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    apply_w8a8_block_fp8_linear,
 )
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    CUTLASS_BLOCK_FP8_SUPPORTED,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton as vllm_triton
 assert current_platform.is_cuda(), (
    "Only support benchmarking w8a8 block fp8 kernel on CUDA device."
 )
 # DeepSeek-V3 weight shapes
 DEEPSEEK_V3_SHAPES = [
    (512 + 64, 7168),
    (2112, 7168),
    ((128 + 64) * 128, 7168),
    (128 * (128 + 128), 512),
    (7168, 16384),
    (7168, 18432),
    (18432 * 2, 7168),
    (24576, 1536),
    (12288, 7168),
    (4096, 7168),
    (7168, 2048),
 ]
 def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
    """Build runner function for w8a8 block fp8 matmul."""
    factor_for_scale = 1e-2
    fp8_info = torch.finfo(torch.float8_e4m3fn)
    fp8_max, fp8_min = fp8_info.max, fp8_info.min
    # Create random FP8 tensors
    A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
    B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
    B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
    # Create scales
    block_n, block_k = block_size[0], block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k
    Bs = (
        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
        * factor_for_scale
    )
    # SM90 CUTLASS requires row-major format for scales
    if use_cutlass and current_platform.is_device_capability(90):
        Bs = Bs.T.contiguous()
    def run():
        if use_cutlass:
            return apply_w8a8_block_fp8_linear(
                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
            )
        else:
            return apply_w8a8_block_fp8_linear(
                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
            )
    return run
 # Providers that are always benchmarked; CUTLASS is added only when the
 # imported support flag says it is available.
 available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
 plot_title = "BF16 vs W8A8 Block FP8 GEMMs"
 if CUTLASS_BLOCK_FP8_SUPPORTED:
    available_providers.append("w8a8-block-fp8-cutlass")
 @vllm_triton.testing.perf_report(
    vllm_triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
        line_vals=available_providers,
        line_names=available_providers,
        ylabel="TFLOP/s (larger is better)",
        plot_name=plot_title,
        args={},
    )
 )
 def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
    """Measure GEMM throughput for one provider at one batch size.

    Args:
        batch_size: Used as the M dimension of the ``(M, K) x (N, K)^T`` GEMM.
        provider: One of ``"torch-bf16"``, ``"w8a8-block-fp8-triton"`` or
            ``"w8a8-block-fp8-cutlass"``.
        N, K: Weight dimensions.
        block_size: Scale block shape passed to the fp8 runner.

    Returns:
        TFLOP/s computed from the 0.5, 0.8 and 0.2 latency quantiles, in
        that order (the slower quantile yields the lower throughput).

    Raises:
        ValueError: If ``provider`` is not recognised.
    """
    M = batch_size
    device = "cuda"
    quantiles = [0.5, 0.2, 0.8]

    # Select the callable to time; the timing call itself is shared.
    if provider == "torch-bf16":
        a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
        b = torch.randn((N, K), device=device, dtype=torch.bfloat16)

        def bench_fn():
            return torch.nn.functional.linear(a, b)

    elif provider in ("w8a8-block-fp8-triton", "w8a8-block-fp8-cutlass"):
        bench_fn = build_w8a8_block_fp8_runner(
            M,
            N,
            K,
            block_size,
            device,
            use_cutlass=(provider == "w8a8-block-fp8-cutlass"),
        )
    else:
        raise ValueError(f"Unknown provider: {provider}")

    ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
        bench_fn, quantiles=quantiles
    )

    def to_tflops(t_ms):
        # 2*M*N*K floating point operations per GEMM, time given in ms.
        return (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)

    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
 if __name__ == "__main__":
    # Sweep every DeepSeek-V3 weight shape with a fixed 128x128 scale block.
    block_size = (128, 128)
    for shape_n, shape_k in DEEPSEEK_V3_SHAPES:
        print(f"\nBenchmarking DeepSeek-V3, N={shape_n} K={shape_k}")
        print(f"TFLOP/s comparison (block_size={block_size}):")
        # Optional plotting arguments, currently disabled:
        #   show_plots=False,
        #   save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
        benchmark_tflops.run(
            print_data=True,
            N=shape_n,
            K=shape_k,
            block_size=block_size,
        )
    print("\nBenchmark finished!")
--- a/benchmarks/kernels/benchmark_activation.py
+++ b/benchmarks/kernels/benchmark_activation.py
@ -1,104 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # benchmark custom activation op performance
 import itertools
 import torch
 import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
 intermediate_size = [3072, 9728, 12288]
 configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))
 def benchmark_activation(
    batch_size: int,
    seq_len: int,
    intermediate_size: int,
    provider: str,
    func_name: str,
    dtype: torch.dtype,
 ):
    """Time one activation op on a ``(batch_size * seq_len, intermediate_size)`` input.

    Args:
        batch_size: Multiplied by ``seq_len`` to get the number of rows.
        seq_len: See ``batch_size``.
        intermediate_size: Number of columns of the input tensor.
        provider: ``"custom"`` times ``layer(x)``; ``"compiled"`` times the
            ``torch.compile``-d ``layer.forward_native``.
        func_name: Key into ``CustomOp.op_registry`` (``"gelu_and_mul_tanh"``
            maps to ``"gelu_and_mul"`` with ``approximate="tanh"``).
        dtype: Dtype of the random input tensor.

    Returns:
        ``(ms, max_ms, min_ms)`` from the 0.5, 0.8 and 0.2 latency quantiles.

    Raises:
        ValueError: If ``provider`` is neither ``"custom"`` nor ``"compiled"``.
    """
    # Reject unknown providers up front; previously this surfaced only as an
    # UnboundLocalError on ``fn`` after the layer had been built and compiled.
    if provider not in ("custom", "compiled"):
        raise ValueError(f"Unknown provider: {provider}")

    device = "cuda"
    num_tokens = batch_size * seq_len
    dim = intermediate_size
    current_platform.seed_everything(42)
    torch.set_default_device(device)

    # Some ops need constructor arguments; the rest are built with defaults.
    if func_name == "gelu_and_mul":
        layer = CustomOp.op_registry[func_name](approximate="none")
    elif func_name == "gelu_and_mul_tanh":
        layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
    elif func_name == "fatrelu_and_mul":
        threshold = 0.5
        layer = CustomOp.op_registry[func_name](threshold)
    else:
        layer = CustomOp.op_registry[func_name]()

    x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
    compiled_layer = torch.compile(layer.forward_native)

    if provider == "custom":
        fn = lambda: layer(x)
    else:  # provider == "compiled"
        fn = lambda: compiled_layer(x)

    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
        fn, quantiles=[0.5, 0.2, 0.8]
    )
    # NOTE(review): the values are latencies in ms, so (ms, max_ms, min_ms)
    # puts the larger bound second; confirm this is the order the perf_report
    # consumer expects. Kept as-is to preserve existing output.
    return ms, max_ms, min_ms
 if __name__ == "__main__":
    # Activation ops selectable from the command line.
    func_choices = [
        "mul_and_silu",
        "silu_and_mul",
        "gelu_and_mul",
        "gelu_and_mul_tanh",
        "fatrelu_and_mul",
        "swigluoai_and_mul",
        "gelu_new",
        "gelu_fast",
        "quick_gelu",
    ]
    dtype_choices = ["half", "bfloat16", "float"]

    parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
    parser.add_argument(
        "--func-name", type=str, choices=func_choices, default="silu_and_mul"
    )
    parser.add_argument(
        "--dtype", type=str, choices=dtype_choices, default="bfloat16"
    )
    args = parser.parse_args()
    assert args

    func_name = args.func_name
    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]

    # Sweep every (batch_size, seq_len, intermediate_size) combination and
    # draw one line per provider.
    bench_config = triton.testing.Benchmark(
        x_names=["batch_size", "seq_len", "intermediate_size"],
        x_vals=configs,
        line_arg="provider",
        line_vals=["custom", "compiled"],
        line_names=["Custom OP", "Compiled"],
        styles=[("blue", "-"), ("green", "-")],
        ylabel="ms",
        plot_name=f"{func_name}-op-performance",
        args={},
    )
    perf_report = triton.testing.perf_report(bench_config)

    # Bind the CLI-selected op and dtype; the sweep supplies the rest.
    bench_entry = lambda batch_size, seq_len, intermediate_size, provider: (
        benchmark_activation(
            batch_size, seq_len, intermediate_size, provider, func_name, dtype
        )
    )
    perf_report(bench_entry).run(print_data=True)
--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@ -1,486 +0,0 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark script for device communicators:
 CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
 and SymmMemCommunicator (multimem, two-shot).
 Usage:
    torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
 Example:
    torchrun --nproc_per_node=2 benchmark_device_communicators.py
    --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
 """
 import json
 import os
 import time
 from contextlib import nullcontext
 from typing import Callable, Optional
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
 from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
 from vllm.utils import FlexibleArgumentParser
 logger = init_logger(__name__)
 # Default sequence lengths to benchmark
 DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]
 # Fixed hidden size and dtype for all benchmarks
 HIDDEN_SIZE = 8192
 BENCHMARK_DTYPE = torch.bfloat16
 # CUDA graph settings
 CUDA_GRAPH_CAPTURE_CYCLES = 10
 class CommunicatorBenchmark:
    """Benchmark class for testing device communicators.

    On construction it tries to create a CustomAllreduce, a
    PyNcclCommunicator and two SymmMemCommunicator variants (multimem and
    two-shot). ``benchmark_allreduce`` then times an allreduce on each
    communicator that was created and reports latency per call.
    """
    def __init__(
        self,
        rank: int,
        world_size: int,
        device: torch.device,
        cpu_group: ProcessGroup,
        sequence_lengths: list[int],
    ):
        """Store the process context and initialize all communicators.

        Args:
            rank: Rank of this process; used here only in log messages.
            world_size: Number of participating processes (stored, not used
                elsewhere in this class).
            device: Device passed to every communicator and used for the
                benchmark tensors.
            cpu_group: Process group handed to every communicator.
            sequence_lengths: Sequence lengths that will be benchmarked; the
                largest one sizes the communicator buffers.
        """
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.cpu_group = cpu_group
        # Calculate max_size_override based on largest sequence length
        max_seq_len = max(sequence_lengths)
        max_tensor_elements = max_seq_len * HIDDEN_SIZE
        # Byte size of the largest benchmark tensor, plus one.
        # NOTE(review): the +1 presumably keeps the largest tensor strictly
        # below the communicators' size limit - confirm against their checks.
        self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1
        # Initialize communicators
        self.custom_allreduce = None
        self.pynccl_comm = None
        # Not assigned again anywhere in this class.
        self.symm_mem_comm = None
        self.symm_mem_comm_multimem = None
        self.symm_mem_comm_two_shot = None
        self._init_communicators()
    def _init_communicators(self):
        """Initialize all available communicators.

        Each communicator is constructed in its own try/except; a failure is
        logged as a warning and the corresponding attribute is left as None.
        """
        try:
            self.custom_allreduce = CustomAllreduce(
                group=self.cpu_group,
                device=self.device,
                max_size=self.max_size_override,
            )
            # Unlike the communicators below, a disabled CustomAllreduce is
            # kept (not reset to None); only the log message differs.
            if not self.custom_allreduce.disabled:
                logger.info("Rank %s: CustomAllreduce initialized", self.rank)
            else:
                logger.info("Rank %s: CustomAllreduce disabled", self.rank)
        except Exception as e:
            logger.warning(
                "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
            )
            self.custom_allreduce = None
        try:
            self.pynccl_comm = PyNcclCommunicator(
                group=self.cpu_group, device=self.device
            )
            if not self.pynccl_comm.disabled:
                logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
            else:
                logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
                self.pynccl_comm = None
        except Exception as e:
            logger.warning(
                "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
            )
            self.pynccl_comm = None
        # Initialize variants for SymmMemCommunicator
        try:
            self.symm_mem_comm_multimem = SymmMemCommunicator(
                group=self.cpu_group,
                device=self.device,
                force_multimem=True,
                max_size_override=self.max_size_override,
            )
            if not self.symm_mem_comm_multimem.disabled:
                logger.info(
                    "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
                )
            else:
                # Disabled variant is dropped without a log message.
                self.symm_mem_comm_multimem = None
        except Exception as e:
            logger.warning(
                "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
                self.rank,
                e,
            )
            self.symm_mem_comm_multimem = None
        try:
            self.symm_mem_comm_two_shot = SymmMemCommunicator(
                group=self.cpu_group,
                device=self.device,
                force_multimem=False,
                max_size_override=self.max_size_override,
            )
            if not self.symm_mem_comm_two_shot.disabled:
                logger.info(
                    "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
                )
            else:
                # Disabled variant is dropped without a log message.
                self.symm_mem_comm_two_shot = None
        except Exception as e:
            logger.warning(
                "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
                self.rank,
                e,
            )
            self.symm_mem_comm_two_shot = None
    def benchmark_allreduce(
        self, sequence_length: int, num_warmup: int, num_trials: int
    ) -> dict[str, float]:
        """Benchmark allreduce operations for all available communicators.

        Args:
            sequence_length: Number of rows of the benchmark tensor.
            num_warmup: Untimed graph replays before measurement.
            num_trials: Timed graph replays.

        Returns:
            Mapping from communicator name to the latency returned by
            ``benchmark_allreduce_single``. Communicators whose eligibility
            check rejects the tensor are omitted.
        """
        results = {}
        # Define communicators with their benchmark functions
        # Each entry is (name, allreduce_fn, should_use_fn, capture context,
        # value for VLLM_CUSTOM_ALLREDUCE_ALGO or None).
        # The lambdas bind ``comm`` through a default argument because the
        # ``comm`` variable is reassigned for each communicator below.
        communicators = []
        if self.custom_allreduce is not None:
            comm = self.custom_allreduce
            # CustomAllreduce one-shot
            # The one-shot and two-shot entries call the same methods; they
            # differ only in the environment variable value set before the run.
            communicators.append(
                (
                    "ca_1stage",
                    lambda t, c=comm: c.custom_all_reduce(t),
                    lambda t, c=comm: c.should_custom_ar(t),
                    comm.capture(),
                    "1stage",  # env variable value
                )
            )
            # CustomAllreduce two-shot
            communicators.append(
                (
                    "ca_2stage",
                    lambda t, c=comm: c.custom_all_reduce(t),
                    lambda t, c=comm: c.should_custom_ar(t),
                    comm.capture(),
                    "2stage",  # env variable value
                )
            )
        if self.pynccl_comm is not None:
            comm = self.pynccl_comm
            communicators.append(
                (
                    "pynccl",
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t: True,  # Always available if initialized
                    nullcontext(),
                    None,  # no env variable needed
                )
            )
        if self.symm_mem_comm_multimem is not None:
            comm = self.symm_mem_comm_multimem
            communicators.append(
                (
                    "symm_mem_multimem",
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t, c=comm: c.should_use_symm_mem(t),
                    nullcontext(),
                    None,  # no env variable needed
                )
            )
        if self.symm_mem_comm_two_shot is not None:
            comm = self.symm_mem_comm_two_shot
            communicators.append(
                (
                    "symm_mem_two_shot",
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t, c=comm: c.should_use_symm_mem(t),
                    nullcontext(),
                    None,  # no env variable needed
                )
            )
        # Benchmark each communicator
        for name, allreduce_fn, should_use_fn, context, env_var in communicators:
            # Set environment variable if needed
            # NOTE(review): how VLLM_CUSTOM_ALLREDUCE_ALGO is consumed is not
            # visible here; the variable is not restored after the loop, so
            # it keeps whatever the last iteration left.
            if env_var is not None:
                os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
            else:
                # Clear the environment variable to avoid interference
                os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)
            latency = self.benchmark_allreduce_single(
                sequence_length,
                allreduce_fn,
                should_use_fn,
                context,
                num_warmup,
                num_trials,
            )
            if latency is not None:
                results[name] = latency
        return results
    def benchmark_allreduce_single(
        self,
        sequence_length: int,
        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
        should_use_fn: Callable[[torch.Tensor], bool],
        context,
        num_warmup: int,
        num_trials: int,
    ) -> Optional[float]:
        """Benchmark method with CUDA graph optimization.

        Captures ``CUDA_GRAPH_CAPTURE_CYCLES`` allreduce calls into one CUDA
        graph and times ``num_trials`` replays of it.

        Args:
            sequence_length: Number of rows of the
                ``(sequence_length, HIDDEN_SIZE)`` test tensor.
            allreduce_fn: Performs the allreduce on a tensor.
            should_use_fn: Returns whether the communicator accepts the tensor.
            context: Context manager entered around graph capture.
            num_warmup: Untimed graph replays before measurement.
            num_trials: Timed graph replays.

        Returns:
            Wall-clock milliseconds per allreduce call, or None when
            ``should_use_fn`` rejects the tensor.

        Raises:
            RuntimeError: Wraps any exception raised while benchmarking.
        """
        try:
            # Create test tensor (2D: sequence_length x hidden_size)
            tensor = torch.randn(
                sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
            )
            if not should_use_fn(tensor):
                return None
            torch.cuda.synchronize()
            # Warmup and capture run on a separate, newly created stream.
            stream = torch.cuda.Stream()
            with torch.cuda.stream(stream):
                # Work on a clone so the original test tensor is not the
                # buffer passed to the allreduce.
                graph_input = tensor.clone()
                # Warmup before capture
                for _ in range(3):
                    allreduce_fn(graph_input)
                # Capture the graph using context manager
                with context:
                    graph = torch.cuda.CUDAGraph()
                    with torch.cuda.graph(graph):
                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                            allreduce_fn(graph_input)
            torch.cuda.synchronize()
            for _ in range(num_warmup):
                graph.replay()
            torch.cuda.synchronize()
            # Second synchronize immediately follows the first, just before
            # timing starts.
            torch.cuda.synchronize()
            start_time = time.perf_counter()
            for _ in range(num_trials):
                graph.replay()
            # Wait for all replays to finish before reading the clock.
            torch.cuda.synchronize()
            end_time = time.perf_counter()
            # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
            return (
                (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
            )
        except Exception as e:
            logger.error("CUDA graph benchmark failed: %s", e)
            raise RuntimeError(
                f"CUDA graph benchmark failed for communicator: {e}"
            ) from e
 def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
    """Calculate speedup information for a single tensor size."""
    if not comm_results:
        return "N/A"
    # Find the fastest communicator
    fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
    fastest_time = comm_results[fastest_comm]
    # Calculate speedup vs PyNccl if available
    if "pynccl" in comm_results:
        pynccl_time = comm_results["pynccl"]
        speedup = pynccl_time / fastest_time
        return f"{fastest_comm} ({speedup:.2f}x)"
    else:
        return f"{fastest_comm} (N/A)"
 def print_results(
    results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
 ):
    """Write the collected latencies to stdout as a fixed-width table.

    One row is printed for every entry of ``sequence_lengths`` that is also a
    key of ``results``. There is one column per communicator name found
    anywhere in ``results`` (sorted), showing "N/A" where that communicator
    has no timing for the row, plus a final column with the summary produced
    by ``_calculate_speedup_info``.
    """
    rule = "=" * 130
    print(f"\n{rule}")
    print("Device Communicator Benchmark Results")
    print(
        f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
        f"Hidden Size: {HIDDEN_SIZE}"
    )
    print(rule)
    # Union of communicator names over all tensor sizes, in sorted order
    comm_names = sorted({name for timings in results.values() for name in timings})
    # Header row
    header_cells = [f"{'Tensor Shape':<20}", f"{'Tensor Size':<15}"]
    header_cells.extend(f"{name:<20}" for name in comm_names)
    header_cells.append(f"{'Best (Speedup vs PyNccl)':<30}")
    header = "".join(header_cells)
    print(header)
    print("-" * len(header))
    # One row per requested sequence length that has results
    for seq_len in sequence_lengths:
        if seq_len not in results:
            continue
        timings = results[seq_len]
        # Tensor footprint: element count times bytes per element, shown in MB
        size_mb = seq_len * HIDDEN_SIZE * BENCHMARK_DTYPE.itemsize / (1024 * 1024)
        size_text = f"{size_mb:.2f} MB"
        shape_text = f"({seq_len}, {HIDDEN_SIZE})"
        cells = [f"{shape_text:<20}", f"{size_text:<15}"]
        for name in comm_names:
            if name in timings:
                cells.append(f"{timings[name]:<20.3f}")
            else:
                cells.append(f"{'N/A':<20}")
        best_text = _calculate_speedup_info(timings)
        cells.append(f"{best_text:<30}")
        print("".join(cells))
    print(rule)
    print("All times are in milliseconds (ms) per allreduce operation")
    print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
 def main():
    """Command-line entry point for the device communicator benchmark.

    Parses the CLI options, joins (or creates) a gloo process group, binds
    this rank to ``cuda:<rank>``, benchmarks every requested sequence length
    with a barrier after each one, and lets rank 0 print the table and
    optionally dump the results to a JSON file.
    """
    arg_parser = FlexibleArgumentParser(description="Benchmark device communicators")
    arg_parser.add_argument(
        "--sequence-lengths",
        nargs="+",
        type=int,
        default=DEFAULT_SEQUENCE_LENGTHS,
        help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
    )
    arg_parser.add_argument(
        "--num-warmup", default=5, type=int, help="Number of warmup iterations"
    )
    arg_parser.add_argument(
        "--num-trials", default=50, type=int, help="Number of benchmark trials"
    )
    arg_parser.add_argument(
        "--output-json", type=str, help="Output results to JSON file"
    )
    cli = arg_parser.parse_args()
    # Distributed setup
    if not dist.is_initialized():
        dist.init_process_group(backend="gloo")
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    # Bind this process to its GPU
    device = torch.device(f"cuda:{rank}")
    torch.cuda.set_device(device)
    # Separate gloo group handed to the benchmark as its CPU group
    cpu_group = dist.new_group(backend="gloo")
    # Disable USE_SYMM_MEM to avoid affecting the max_sizes
    # in symm_mem and custom_all_reduce for benchmark
    os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
    benchmark = CommunicatorBenchmark(
        rank, world_size, device, cpu_group, cli.sequence_lengths
    )
    is_reporter = rank == 0
    all_results = {}
    for seq_len in cli.sequence_lengths:
        if is_reporter:
            logger.info(
                "Benchmarking sequence length: %s (tensor shape: %s x %s)",
                seq_len,
                seq_len,
                HIDDEN_SIZE,
            )
        all_results[seq_len] = benchmark.benchmark_allreduce(
            sequence_length=seq_len,
            num_warmup=cli.num_warmup,
            num_trials=cli.num_trials,
        )
        # Keep the ranks in lock-step between tensor sizes
        dist.barrier()
    # Reporting happens on rank 0 only
    if is_reporter:
        print_results(all_results, cli.sequence_lengths, world_size)
        if cli.output_json:
            # Attach the speedup summary to each size's timings
            per_size = {
                seq_len: {
                    "timings": timings,
                    "speedup_info": _calculate_speedup_info(timings),
                }
                for seq_len, timings in all_results.items()
            }
            payload = {
                "world_size": world_size,
                "dtype": str(BENCHMARK_DTYPE),
                "hidden_size": HIDDEN_SIZE,
                "sequence_lengths": cli.sequence_lengths,
                "num_warmup": cli.num_warmup,
                "num_trials": cli.num_trials,
                "cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
                "results": per_size,
            }
            with open(cli.output_json, "w") as out_file:
                json.dump(payload, out_file, indent=2)
            logger.info("Results saved to %s", cli.output_json)
    # Tear down the extra group unless it is the default world group
    if cpu_group != dist.group.WORLD:
        dist.destroy_process_group(cpu_group)
 # Run the benchmark only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@ -80,11 +80,6 @@ def bench_run(
        a, score, topk, renormalize=False
    )
    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
@ -116,10 +111,6 @@ def bench_run(
        w2: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        ab_strides1: torch.Tensor,
        ab_strides2: torch.Tensor,
        c_strides1: torch.Tensor,
        c_strides2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        per_act_token: bool,
@ -134,10 +125,6 @@ def bench_run(
                topk_ids,
                w1_scale,
                w2_scale,
                ab_strides1,
                ab_strides2,
                c_strides1,
                c_strides2,
                per_act_token,
                a1_scale=None,
            )
@ -149,10 +136,6 @@ def bench_run(
        w2_q: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        ab_strides1: torch.Tensor,
        ab_strides2: torch.Tensor,
        c_strides1: torch.Tensor,
        c_strides2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
    ):
@ -167,10 +150,6 @@ def bench_run(
                topk_ids,
                w1_scale,
                w2_scale,
                ab_strides1,
                ab_strides2,
                c_strides1,
                c_strides2,
                per_act_token,
                a1_scale=None,
            )
@ -215,10 +194,6 @@ def bench_run(
            w2_q,
            w1_scale,
            w2_scale,
            ab_strides1,
            ab_strides2,
            c_strides1,
            c_strides2,
            topk_weights,
            topk_ids,
        )
@ -256,10 +231,6 @@ def bench_run(
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
        "per_act_token": per_act_token,
        "ab_strides1": ab_strides1,
        "ab_strides2": ab_strides2,
        "c_strides1": c_strides1,
        "c_strides2": c_strides2,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
@ -318,10 +289,6 @@ def bench_run(
        w2_q,
        w1_scale,
        w2_scale,
        ab_strides1,
        ab_strides2,
        c_strides1,
        c_strides2,
        topk_weights,
        topk_ids,
        per_act_token,
@ -330,7 +297,7 @@ def bench_run(
    results.append(
        benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@ -637,7 +637,7 @@ def bench_optype(
    # Clear LoRA optimization hash-maps.
    _LORA_A_PTR_DICT.clear()
    _LORA_B_PTR_DICT.clear()
-    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
    for kwargs in kwargs_list:
        op_type.bench_fn()(**kwargs)
    torch.cuda.synchronize()
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@ -253,7 +253,28 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
    else:
        assert bt.a.dtype == torch.int8
        assert bt.wtype == scalar_types.uint4b8
-        raise NotImplementedError("QQQ is not supported anymore")
+
        if bt.w_ch_s is not None:
            s_ch = bt.w_ch_s.to(torch.float32)
        else:
            s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device)
        if bt.w_tok_s is not None:
            s_tok = bt.w_tok_s.to(torch.float32)
        else:
            s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device)
        fn = lambda: ops.marlin_qqq_gemm(
            a=bt.a,
            b_q_weight=w_q,
            s_group=w_s,
            s_tok=s_tok,
            s_ch=s_ch,
            workspace=workspace.scratch,
            size_m=bt.a.shape[0],
            size_n=bt.w_ref.shape[1],
            size_k=bt.w_ref.shape[0],
        )
    return fn
@ -284,25 +305,6 @@ def machete_create_bench_fn(
    )
 def cutlass_w4a8_create_bench_fn(
    bt: BenchmarkTensors, out_type=torch.dtype, schedule=None
 ) -> Callable:
    """Build a zero-argument callable that runs ``ops.cutlass_w4a8_mm`` on ``bt``.

    The quantized weight is re-laid-out and encoded once, and the group scales
    are cast to float8_e4m3fn and packed once; the returned callable only
    performs the matmul. ``out_type`` is accepted but not used here.
    """
    # make col major
    col_major_weight = bt.w_q.t().contiguous().t()
    encoded_weight = ops.cutlass_encode_and_reorder_int4b(col_major_weight)
    # expects fp8 scales
    fp8_group_scales = bt.w_g_s.to(torch.float8_e4m3fn)
    packed_scales = ops.cutlass_pack_scale_fp8(fp8_group_scales)
    def run_w4a8():
        return ops.cutlass_w4a8_mm(
            a=bt.a,
            b_q=encoded_weight,
            b_group_scales=packed_scales,
            b_group_size=bt.group_size,
            b_channel_scales=bt.w_ch_s,
            a_token_scales=bt.w_tok_s,
            maybe_schedule=schedule,
        )
    return run_w4a8
 # impl
 # bench
@ -404,20 +406,6 @@ def bench(
        )
    )
    # cutlass w4a8
    if types.act_type == torch.float8_e4m3fn and group_size == 128:
        timers.append(
            bench_fns(
                label,
                sub_label,
                f"cutlass w4a8 ({name_type_string})",
                [
                    cutlass_w4a8_create_bench_fn(bt, out_type=types.output_type)
                    for bt in benchmark_tensors
                ],
            )
        )
    if sweep_schedules:
        global _SWEEP_SCHEDULES_RESULTS
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@ -419,10 +419,8 @@ class BenchmarkWorker:
        )
        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
        # is the intermediate size after silu_and_mul.
        block_n = block_quant_shape[0] if block_quant_shape else None
        block_k = block_quant_shape[1] if block_quant_shape else None
        op_config = get_moe_configs(
-            num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k
+            num_experts, shard_intermediate_size // 2, dtype_str
        )
        if op_config is None:
            config = get_default_config(
@ -432,7 +430,6 @@ class BenchmarkWorker:
                hidden_size,
                topk,
                dtype_str,
                block_quant_shape,
            )
        else:
            config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
@ -594,11 +591,7 @@ def main(args: argparse.Namespace):
        E = config.n_routed_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
-    elif config.architectures[0] in (
+    elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
        "Qwen2MoeForCausalLM",
        "Qwen3MoeForCausalLM",
        "Qwen3NextForCausalLM",
    ):
        E = config.num_experts
        topk = config.num_experts_per_tok
        intermediate_size = config.moe_intermediate_size
@ -682,11 +675,7 @@ def main(args: argparse.Namespace):
        is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
        search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
        print(f"Start tuning over {len(search_space)} configurations...")
-        if use_deep_gemm:
+
            raise ValueError(
                "Tuning with --use-deep-gemm is not supported as it only tunes Triton "
                "kernels. Please remove the flag."
            )
        start = time.time()
        configs = _distribute(
            "tune",
--- a/benchmarks/kernels/benchmark_polynorm.py
+++ b/benchmarks/kernels/benchmark_polynorm.py
@ -1,155 +0,0 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 import torch
 from vllm import _custom_ops as vllm_ops
 from vllm.triton_utils import triton
 def polynorm_naive(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-6,
 ):
    """Reference PolyNorm written with plain PyTorch ops.

    The input is flattened to 2D over its last dimension and promoted to
    float32. The result is
    ``weight[0] * n(x**3) + weight[1] * n(x**2) + weight[2] * n(x) + bias``,
    where ``n(t) = t / sqrt(mean(t**2, last dim) + eps)``. It is cast to
    ``weight.dtype`` and reshaped to the input's original shape.
    """
    input_shape = x.shape
    rows = x.view(-1, x.shape[-1])
    def rms_scale(t, eps: float):
        # Divide by the root-mean-square taken over the last dimension
        mean_square = t.pow(2).mean(-1, keepdim=True)
        return t / torch.sqrt(mean_square + eps)
    rows = rows.float()
    cubic_term = weight[0] * rms_scale(rows**3, eps)
    square_term = weight[1] * rms_scale(rows**2, eps)
    linear_term = weight[2] * rms_scale(rows, eps)
    # Same left-to-right summation order as the single-expression form
    combined = cubic_term + square_term + linear_term + bias
    return combined.to(weight.dtype).view(input_shape)
 def polynorm_vllm(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-6,
 ):
    """PolyNorm via the ``vllm_ops.poly_norm`` custom op.

    The input is flattened to 2D over its last dimension, the op writes into
    a freshly allocated buffer of the same shape and dtype, and that buffer is
    returned reshaped to the input's original shape.
    """
    input_shape = x.shape
    flat_input = x.view(-1, x.shape[-1])
    result = torch.empty_like(flat_input)
    # The op fills `result` in place
    vllm_ops.poly_norm(result, flat_input, weight, bias, eps)
    return result.view(input_shape)
 def calculate_diff(batch_size, seq_len, hidden_dim):
    """Compare the naive and vLLM PolyNorm outputs on one random CUDA input.

    Uses bfloat16 tensors with all-ones weight (3 values) and bias (1 value),
    and prints whether the outputs agree within atol=1e-2 / rtol=1e-2.
    """
    tensor_opts = dict(dtype=torch.bfloat16, device="cuda")
    x = torch.randn(batch_size, seq_len, hidden_dim, **tensor_opts)
    weight = torch.ones(3, **tensor_opts)
    bias = torch.ones(1, **tensor_opts)
    reference = polynorm_naive(x, weight, bias)
    candidate = polynorm_vllm(x, weight, bias)
    outputs_match = torch.allclose(reference, candidate, atol=1e-2, rtol=1e-2)
    if not outputs_match:
        print("❌ Implementations differ")
        return
    print("✅ All implementations match")
 # Sweep axes for the performance benchmark.
 # Batch sizes: 1, 4, 16, 64
 batch_size_range = [1 << exponent for exponent in range(0, 7, 2)]
 # Sequence lengths: 64, 128, 256, 512, 1024
 seq_length_range = [1 << exponent for exponent in range(6, 11)]
 dim_range = [2048, 4096]
 # Every (dim, batch_size, seq_len) combination, with dim varying slowest
 configs = [
    combo for combo in itertools.product(dim_range, batch_size_range, seq_length_range)
 ]
 def get_benchmark():
    """Build the Triton perf-report function that sweeps ``configs``.

    The returned object compares the "naive" and "vllm" PolyNorm providers
    over every (dim, batch_size, seq_len) combination and reports times in
    microseconds.
    """
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["dim", "batch_size", "seq_len"],
            x_vals=[list(_) for _ in configs],
            line_arg="provider",
            line_vals=["naive", "vllm"],
            line_names=["Naive", "vLLM"],
            styles=[("blue", "-"), ("red", "-")],
            ylabel="us",
            plot_name="polynorm-perf",
            args={},
        )
    )
    def benchmark(dim, batch_size, seq_len, provider):
        """Time one provider on one configuration; returns (median, low, high) in us."""
        dtype = torch.bfloat16
        # The benchmarked tensor's last dimension is four times `dim`
        hidden_dim = dim * 4
        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
        weight = torch.ones(3, dtype=dtype, device="cuda")
        bias = torch.ones(1, dtype=dtype, device="cuda")
        # do_bench returns one value per quantile: median, 20th and 80th percentile
        quantiles = [0.5, 0.2, 0.8]
        if provider == "naive":
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: polynorm_naive(x, weight, bias),
                quantiles=quantiles,
            )
        else:
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: polynorm_vllm(x, weight, bias),
                quantiles=quantiles,
            )
        # perf_report unpacks the result as (y_mean, y_min, y_max). The metric
        # is a plain latency, so the low quantile must come before the high
        # one; the swapped order only suits inverted metrics such as GB/s.
        return 1000 * ms, 1000 * min_ms, 1000 * max_ms
    return benchmark
 # Script entry point: correctness check first, then the performance sweep.
 if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    # (flag, type, default, help) for every command-line option
    for flag, value_type, default_value, help_text in (
        ("--batch-size", int, 4, "Batch size"),
        ("--seq-len", int, 128, "Sequence length"),
        ("--hidden-dim", int, 8192, "Intermediate size of MLP"),
        (
            "--save-path",
            str,
            "./configs/polnorm/",
            "Path to save polnorm benchmark results",
        ),
    ):
        parser.add_argument(
            flag, type=value_type, default=default_value, help=help_text
        )
    args = parser.parse_args()
    # Correctness check on a single shape
    calculate_diff(
        batch_size=args.batch_size, seq_len=args.seq_len, hidden_dim=args.hidden_dim
    )
    # Performance sweep; results are printed and saved under save_path
    benchmark = get_benchmark()
    benchmark.run(print_data=True, save_path=args.save_path)
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@ -1,77 +0,0 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import time
 import torch
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    silu_mul_fp8_quant_deep_gemm,
 )
 from vllm.platforms import current_platform
 def benchmark(E, T, H, G=128, runs=50):
    """Time ``silu_mul_fp8_quant_deep_gemm`` for one (E, T, H) configuration.

    Builds a random bfloat16 input of shape (E, T, 2 * H) and per-expert token
    counts drawn from [T // 2, T), runs 10 warmup calls, then averages the
    wall-clock time of ``runs`` calls.

    Returns:
        (avg_time_ms, gflops, memory_bandwidth_gb_per_s), where throughput is
        computed only over the tokens counted in ``tokens_per_expert``.
    """
    current_platform.seed_everything(42)
    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
    tokens_per_expert = torch.randint(
        T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
    )
    # Warmup, synchronizing after every call
    for _ in range(10):
        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
        torch.cuda.synchronize()
    # Timed section: launch all runs, then wait once
    torch.cuda.synchronize()
    t_begin = time.perf_counter()
    for _ in range(runs):
        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - t_begin
    avg_time = elapsed / runs * 1000
    avg_seconds = avg_time / 1000
    # Only the valid tokens count as work
    actual_tokens = tokens_per_expert.sum().item()
    # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
    ops_per_element = 8
    total_ops = actual_tokens * H * ops_per_element
    gflops = total_ops / avg_seconds / 1e9
    # Bytes moved per token: 2*H bfloat16 inputs (2 bytes each), H fp8
    # outputs (1 byte each), and H // G float32 scales (4 bytes each)
    bytes_per_token = (2 * H * 2) + (H * 1) + ((H // G) * 4)
    total_bytes = actual_tokens * bytes_per_token
    memory_bw = total_bytes / avg_seconds / 1e9
    return avg_time, gflops, memory_bw
 # (E, T, H) configurations to benchmark
 configs = [
    (8, 32, 1024),
    (16, 64, 2048),
    (32, 128, 4096),
    # DeepSeekV3 Configs
    (256, 16, 7168),
    (256, 32, 7168),
    (256, 64, 7168),
    (256, 128, 7168),
    (256, 256, 7168),
    (256, 512, 7168),
    (256, 1024, 7168),
 ]
 print(f"GPU: {torch.cuda.get_device_name()}")
 print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
 print("-" * 50)
 for E, T, H in configs:
    config_label = f"E={E:3d},T={T:4d},H={H:4d}"
    try:
        time_ms, gflops, gbps = benchmark(E, T, H)
    except Exception as exc:
        # Keep sweeping the remaining configurations, but report why this
        # one failed instead of discarding the error.
        print(f"{config_label} FAILED ({type(exc).__name__}: {exc})")
    else:
        print(f"{config_label} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@ -9,11 +9,8 @@ from typing import Optional
 import flashinfer
 import torch
 from vllm.utils import round_up
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
 FP4_DTYPE = torch.uint8
 def to_float8(x, dtype=torch.float8_e4m3fn):
@ -64,13 +61,13 @@ def benchmark_decode(
    else:
        raise ValueError(f"Invalid kv_layout: {kv_layout}")
-    # Always using 1.0 scale to reflect the real perf in benchmarking
+    query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype)
    q_scale = 1.0
    ref_query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype)
    if q_quant_dtype == FP8_DTYPE:
-        query, _ = to_float8(ref_query)
+        query, q_scale = to_float8(query)
        ref_query = query.to(dtype) * q_scale
    else:
-        query = ref_query
+        q_scale = 1.0
        ref_query = query
    kv_lens = torch.randint(1, max_seq_len, (batch_size,), dtype=torch.int32)
    kv_lens[-1] = max_seq_len
@ -78,13 +75,14 @@ def benchmark_decode(
    seq_lens = kv_lens
    max_seq_len = torch.max(seq_lens).item()
-    # Always using 1.0 scale to reflect the real perf in benchmarking
+    kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
    k_scale = v_scale = 1.0
    ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
    if kv_quant_dtype == FP8_DTYPE:
-        kv_cache, _ = to_float8(ref_kv_cache)
+        kv_cache, kv_scale = to_float8(kv_cache)
        ref_kv_cache = kv_cache.to(dtype) * kv_scale
    else:
-        kv_cache = ref_kv_cache
+        kv_scale = 1.0
        ref_kv_cache = kv_cache
    k_scale = v_scale = kv_scale
    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
    block_tables = torch.randint(
@ -112,7 +110,7 @@ def benchmark_decode(
    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
        workspace_buffer,
        kv_layout,
-        use_tensor_cores=True,
+        use_tensor_cores=((num_qo_heads // num_kv_heads) > 4),
    )
    wrapper.plan(
        kv_indptr,
@ -144,31 +142,11 @@ def benchmark_decode(
        return sum(times) / len(times), torch.std(torch.tensor(times))
    o_scale = 1.0
    o_sf_scale = None
    output_baseline = torch.empty(ref_query.shape, dtype=dtype)
-    if o_quant_dtype == FP4_DTYPE:
+    output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
        o_sf_scale = 500.0
        output_trtllm = flashinfer.utils.FP4Tensor(
            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
            torch.empty(
                (
                    round_up(query.shape[0], 128),
                    round_up(query.shape[1] * query.shape[2] // 16, 4),
                ),
                dtype=torch.float8_e4m3fn,
            ),
        )
    else:
        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
    def baseline_decode():
-        return wrapper.run(
+        return wrapper.run(ref_query, ref_kv_cache, out=output_baseline)
            ref_query,
            ref_kv_cache,
            k_scale=k_scale,
            v_scale=v_scale,
            out=output_baseline,
        )
    def trtllm_decode():
        return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
@ -180,7 +158,6 @@ def benchmark_decode(
            max_seq_len=max_seq_len,
            bmm1_scale=q_scale * k_scale * sm_scale,
            bmm2_scale=v_scale / o_scale,
            o_sf_scale=o_sf_scale,
            out=output_trtllm,
        )
@ -259,9 +236,7 @@ if __name__ == "__main__":
        # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
        (None, None, None),
        (None, FP8_DTYPE, None),
        (FP8_DTYPE, FP8_DTYPE, None),
        (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
    ]
    for quant_dtype in quant_dtypes:
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@ -9,11 +9,8 @@ from typing import Optional
 import flashinfer
 import torch
 from vllm.utils import round_up
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
 FP4_DTYPE = torch.uint8
 def to_float8(x, dtype=torch.float8_e4m3fn):
@ -75,15 +72,13 @@ def benchmark_prefill(
        ]
    )
-    # Always using 1.0 scale to reflect the real perf in benchmarking
+    query = torch.randn(torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype)
    q_scale = 1.0
    ref_query = torch.randn(
        torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype
    )
    if q_quant_dtype == FP8_DTYPE:
-        query, _ = to_float8(ref_query)
+        query, q_scale = to_float8(query)
        ref_query = query.to(dtype) * q_scale
    else:
-        query = ref_query
+        q_scale = 1.0
        ref_query = query
    kv_lens = torch.randint(0, max_kv_len, (batch_size,), dtype=torch.int32)
    kv_lens[-1] = max_kv_len
@ -91,13 +86,14 @@ def benchmark_prefill(
    seq_lens = kv_lens + q_lens
    max_seq_len = torch.max(seq_lens).item()
-    # Always using 1.0 scale to reflect the real perf in benchmarking
+    kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
    k_scale = v_scale = 1.0
    ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
    if kv_quant_dtype == FP8_DTYPE:
-        kv_cache, _ = to_float8(ref_kv_cache)
+        kv_cache, kv_scale = to_float8(kv_cache)
        ref_kv_cache = kv_cache.to(dtype) * kv_scale
    else:
-        kv_cache = ref_kv_cache
+        kv_scale = 1.0
        ref_kv_cache = kv_cache
    k_scale = v_scale = kv_scale
    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
    block_tables = torch.randint(
@ -156,31 +152,11 @@ def benchmark_prefill(
        return sum(times) / len(times), torch.std(torch.tensor(times))
    o_scale = 1.0
    o_sf_scale = None
    output_baseline = torch.empty(ref_query.shape, dtype=dtype)
-    if o_quant_dtype == FP4_DTYPE:
+    output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
        o_sf_scale = 500.0
        output_trtllm = flashinfer.utils.FP4Tensor(
            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
            torch.empty(
                (
                    round_up(query.shape[0], 128),
                    round_up(query.shape[1] * query.shape[2] // 16, 4),
                ),
                dtype=torch.float8_e4m3fn,
            ),
        )
    else:
        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
    def baseline_prefill():
-        return wrapper.run(
+        return wrapper.run(ref_query, ref_kv_cache, out=output_baseline)
            ref_query,
            ref_kv_cache,
            k_scale=k_scale,
            v_scale=v_scale,
            out=output_baseline,
        )
    def trtllm_prefill():
        return flashinfer.prefill.trtllm_batch_context_with_kv_cache(
@ -196,7 +172,6 @@ def benchmark_prefill(
            batch_size=batch_size,
            cum_seq_lens_q=q_indptr,
            cum_seq_lens_kv=kv_indptr,
            o_sf_scale=o_sf_scale,
            out=output_trtllm,
        )
@ -274,9 +249,7 @@ if __name__ == "__main__":
    quant_dtypes = [
        # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
        (None, None, None),
        (FP8_DTYPE, FP8_DTYPE, None),
        (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
    ]
    for quant_dtype in quant_dtypes:
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@ -11,8 +11,8 @@ from datetime import datetime
 from typing import Any
 import torch
 import tqdm
 import triton
 from tqdm import tqdm
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    _w8a8_block_fp8_matmul,
@ -141,7 +141,6 @@ def get_weight_shapes(tp_size):
    # cannot TP
    total = [
        (512 + 64, 7168),
        (2112, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (7168, 16384),
--- a/benchmarks/kernels/weight_shapes.py
+++ b/benchmarks/kernels/weight_shapes.py
@ -95,10 +95,4 @@ WEIGHT_SHAPES = {
        ([2048, 2816], 1),
        ([1408, 2048], 0),
    ],
    "CohereLabs/c4ai-command-a-03-2025": [
        ([12288, 14336], 1),
        ([12288, 12288], 0),
        ([12288, 73728], 1),
        ([36864, 12288], 0),
    ],
 }
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@ -962,7 +962,7 @@ async def main_mp(
    # At this point all the clients finished,
    # collect results (TTFT, TPOT, etc.) from all the clients.
-    # This needs to happen before calling join on the clients
+    # This needs to happens before calling join on the clients
    # (result_queue should be emptied).
    while not result_queue.empty():
        client_metrics.append(result_queue.get())
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@ -1,7 +1,6 @@
 include(FetchContent)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -88,7 +87,6 @@ is_avx512_disabled(AVX512_DISABLED)
 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
    message(STATUS "Apple Silicon Detected")
    set(APPLE_SILICON_FOUND TRUE)
    set(ENABLE_NUMA OFF)
    check_sysctl(hw.optional.neon ASIMD_FOUND)
    check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
@ -184,17 +182,17 @@ endif()
 #
 # Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
 # Flag to enable ACL kernels for AARCH64 platforms
-if (VLLM_BUILD_ACL STREQUAL "ON")
+if ( VLLM_BUILD_ACL STREQUAL "ON")
    set(USE_ACL ON)
 else()
    set(USE_ACL OFF)
 endif()
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND)
    FetchContent_Declare(
        oneDNN
        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-        GIT_TAG v3.9
+        GIT_TAG  v3.8.1
        GIT_PROGRESS TRUE
        GIT_SHALLOW TRUE
    )
@ -206,7 +204,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
        endif()
        set(ONEDNN_AARCH64_USE_ACL "ON")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
-    endif()
+        endif()
    set(ONEDNN_LIBRARY_TYPE "STATIC")
    set(ONEDNN_BUILD_DOC "OFF")
@ -219,23 +217,38 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
    set(ONEDNN_VERBOSE "OFF")
    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
    FetchContent_MakeAvailable(oneDNN)
-    add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
+    
-    target_include_directories(
+    list(APPEND LIBS dnnl)
-        dnnl_ext
+elseif(POWER10_FOUND)
-        PUBLIC ${oneDNN_SOURCE_DIR}/include
+    FetchContent_Declare(
-        PUBLIC ${oneDNN_BINARY_DIR}/include
+        oneDNN
-        PRIVATE ${oneDNN_SOURCE_DIR}/src
+        GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
        GIT_TAG v3.7.2
        GIT_PROGRESS TRUE
        GIT_SHALLOW TRUE
    )
-    target_link_libraries(dnnl_ext dnnl)
+
-    target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
+    set(ONEDNN_LIBRARY_TYPE "STATIC")
-    list(APPEND LIBS dnnl_ext)
+    set(ONEDNN_BUILD_DOC "OFF")
-    set(USE_ONEDNN ON)
+    set(ONEDNN_BUILD_EXAMPLES "OFF")
-else()
+    set(ONEDNN_BUILD_TESTS "OFF")
-    set(USE_ONEDNN OFF)
+    set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
    set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
    set(ONEDNN_BUILD_GRAPH "OFF")
    set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
    set(ONEDNN_ENABLE_ITT_TASKS "OFF")
    set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
    set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
    set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
    set(DNNL_CPU_RUNTIME "OMP")
    FetchContent_MakeAvailable(oneDNN)
    list(APPEND LIBS dnnl)
 endif()
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@ -262,6 +275,7 @@ set(VLLM_EXT_SRC
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
    set(VLLM_EXT_SRC
        "csrc/cpu/quant.cpp"
        "csrc/cpu/shm.cpp"
        ${VLLM_EXT_SRC})
    if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
@ -275,11 +289,14 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
            ${VLLM_EXT_SRC})
        add_compile_definitions(-DCPU_CAPABILITY_AVX512)
    endif()
-endif()
+elseif(POWER10_FOUND)
 if(USE_ONEDNN)
    set(VLLM_EXT_SRC
-        "csrc/cpu/dnnl_kernels.cpp"
+        "csrc/cpu/quant.cpp"
        ${VLLM_EXT_SRC})
 endif()
 if (ASIMD_FOUND)
    set(VLLM_EXT_SRC
        "csrc/cpu/quant.cpp"
        ${VLLM_EXT_SRC})
 endif()
--- a/cmake/external_projects/flashmla.cmake
+++ b/cmake/external_projects/flashmla.cmake
@ -19,7 +19,7 @@ else()
  FetchContent_Declare(
        flashmla
        GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
-        GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de
+        GIT_TAG 0e43e774597682284358ff2c54530757b654b8d1
        GIT_PROGRESS TRUE
        CONFIGURE_COMMAND ""
        BUILD_COMMAND ""
@ -37,14 +37,13 @@ cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
 if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
    set(FlashMLA_SOURCES
        ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
        ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu
        ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
        ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
-        ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu)
+        ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
        ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu)
    set(FlashMLA_INCLUDES
        ${flashmla_SOURCE_DIR}/csrc/cutlass/include
-        ${flashmla_SOURCE_DIR}/csrc)
+        ${flashmla_SOURCE_DIR}/csrc/include)
    set_gencode_flags_for_srcs(
        SRCS "${FlashMLA_SOURCES}"
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@ -38,7 +38,7 @@ else()
  FetchContent_Declare(
          vllm-flash-attn
          GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-          GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a
+          GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
          GIT_PROGRESS TRUE
          # Don't share the vllm-flash-attn build between build types
          BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
--- a/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
+++ b/csrc/attention/mla/sm100_cutlass_mla_kernel.cu
@ -36,14 +36,12 @@ limitations under the License.
 #if !defined(CUDA_VERSION) || CUDA_VERSION < 12040
 // Fallback definition used when the enclosing preprocessor guard selects it
 // (CUDA_VERSION undefined or below 12040). It keeps the symbol available but
 // unconditionally fails through TORCH_CHECK; none of the arguments are read.
 void sm100_cutlass_mla_decode(
    torch::Tensor const& out,
    torch::Tensor const& lse,
    torch::Tensor const& q_nope,
    torch::Tensor const& q_pe,
    torch::Tensor const& kv_c_and_k_pe_cache,
    torch::Tensor const& seq_lens,
    torch::Tensor const& page_table,
    torch::Tensor const& workspace,
    double sm_scale,
    int64_t num_kv_splits) {
  // Always raises: the real kernel is only built for CUDA >= 12.4.
  TORCH_CHECK(false, "CUDA version must be >= 12.4 for cutlass_mla_decode");
 }
@ -66,11 +64,11 @@ struct IsPersistent {
  static const bool value = v;
 };
-template <typename T, typename TOut, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
+template <typename T, bool IsPaged128, typename PersistenceOption = IsPersistent<true>>
 struct MlaSm100 {
  using Element = T;
  using ElementAcc = float;
-  using ElementOut = TOut;
+  using ElementOut = T;
  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
  using TileShapeH = cute::tuple_element_t<0, TileShape>;
@ -101,7 +99,6 @@ struct MlaSm100 {
 template <typename T>
 typename T::Fmha::Arguments args_from_options(
    at::Tensor const& out,
    at::Tensor const& lse,
    at::Tensor const& q_nope,
    at::Tensor const& q_pe,
    at::Tensor const& kv_c_and_k_pe_cache,
@ -165,15 +162,12 @@ typename T::Fmha::Arguments args_from_options(
       stride_PT,
       page_count_total,
       page_size},
-      {static_cast<ElementOut*>(out.data_ptr()),
+      {static_cast<ElementOut*>(out.data_ptr()), stride_O, static_cast<ElementAcc*>(nullptr), stride_LSE},
       stride_O,
       static_cast<ElementAcc*>(lse.defined() ? lse.data_ptr() : nullptr),
       stride_LSE},
      hw_info,
      // TODO(trevor-m): Change split_kv back to -1 when
      // https://github.com/NVIDIA/cutlass/issues/2274 is fixed. Split_kv=1 will
      // perform worse with larger context length and smaller batch sizes.
-      static_cast<int>(num_kv_splits), // split_kv
+      num_kv_splits, // split_kv
      nullptr,       // is_var_split_kv
  };
  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
@ -184,10 +178,9 @@ typename T::Fmha::Arguments args_from_options(
  return arguments;
 }
-template <typename Element, typename ElementOut, bool IsPaged128, typename PersistenceOption>
+template <typename Element, bool IsPaged128, typename PersistenceOption>
 void runMla(
    at::Tensor const& out,
    at::Tensor const& lse,
    at::Tensor const& q_nope,
    at::Tensor const& q_pe,
    at::Tensor const& kv_c_and_k_pe_cache,
@ -197,9 +190,9 @@ void runMla(
    double sm_scale,
    int64_t num_kv_splits,
    cudaStream_t stream) {
-  using MlaSm100Type = MlaSm100<Element, ElementOut, IsPaged128, PersistenceOption>;
+  using MlaSm100Type = MlaSm100<Element, IsPaged128, PersistenceOption>;
  typename MlaSm100Type::Fmha fmha;
-  auto arguments = args_from_options<MlaSm100Type>(out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
+  auto arguments = args_from_options<MlaSm100Type>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, sm_scale, num_kv_splits);
  CUTLASS_CHECK(fmha.can_implement(arguments));
@ -221,7 +214,6 @@ void runMla(
 void sm100_cutlass_mla_decode(
    torch::Tensor const& out,
    torch::Tensor const& lse,
    torch::Tensor const& q_nope,
    torch::Tensor const& q_pe,
    torch::Tensor const& kv_c_and_k_pe_cache,
@ -241,14 +233,14 @@ void sm100_cutlass_mla_decode(
  DISPATCH_BOOL(page_size == 128, IsPaged128, [&] {
    DISPATCH_BOOL(num_kv_splits <= 1, NotManualSplitKV, [&] {
      if (in_dtype == at::ScalarType::Half) {
-        runMla<cutlass::half_t, cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+        runMla<cutlass::half_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else if (in_dtype == at::ScalarType::BFloat16) {
-        runMla<cutlass::bfloat16_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+        runMla<cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-        runMla<cutlass::float_e4m3_t, cutlass::bfloat16_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
+        runMla<cutlass::float_e4m3_t, IsPaged128, IsPersistent<NotManualSplitKV>>(
-          out, lse, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
+          out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, workspace, sm_scale, num_kv_splits, stream);
      } else {
        TORCH_CHECK(false, "Unsupported input data type of MLA");
      }
@ -261,7 +253,7 @@ void sm100_cutlass_mla_decode(
 int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_batches, int64_t sm_count, int64_t num_kv_splits) {
  // Workspace size depends on ElementAcc and ElementLSE (same as ElementAcc)
  // which are float, so Element type here doesn't matter.
-  using MlaSm100Type = MlaSm100<cutlass::half_t, cutlass::half_t, true>;
+  using MlaSm100Type = MlaSm100<cutlass::half_t, true>;
  // Get split kv. Requires problem shape and sm_count only.
  typename MlaSm100Type::Fmha::Arguments arguments;
@ -272,7 +264,7 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba
  // Assumes device 0 when getting sm_count.
  arguments.hw_info.sm_count =
      sm_count <= 0 ? cutlass::KernelHardwareInfo::query_device_multiprocessor_count(/*device_id=*/0) : sm_count;
-  arguments.split_kv = static_cast<int>(num_kv_splits);
+  arguments.split_kv = num_kv_splits;
  MlaSm100Type::Fmha::set_split_kv(arguments);
  return MlaSm100Type::Fmha::get_workspace_size(arguments);
--- a/csrc/cache.h
+++ b/csrc/cache.h
@ -40,19 +40,9 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
                 const double scale, const std::string& kv_cache_dtype);
-void gather_and_maybe_dequant_cache(
+void gather_cache(
    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
-    int64_t batch_size, const std::string& kv_cache_dtype,
+    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
    torch::Tensor const& scale,
    std::optional<torch::Tensor> seq_starts = std::nullopt);
 // TODO(hc): cp_gather_cache needs to support scaled KV cache in the future.
 void cp_gather_cache(
    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@ -1,7 +1,6 @@
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAException.h>
 #include "cuda_utils.h"
 #include "cuda_compat.h"
@ -625,9 +624,9 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
 namespace vllm {
 // grid is launched with dimensions (batch, num_splits)
-template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
+template <typename scalar_t>
-__global__ void gather_and_maybe_dequant_cache(
+__global__ void gather_cache(
-    const cache_t* __restrict__ src_cache,    // [NUM_BLOCKS, BLOCK_SIZE,
+    const scalar_t* __restrict__ src_cache,   // [NUM_BLOCKS, BLOCK_SIZE,
                                              // ENTRIES...]
    scalar_t* __restrict__ dst,               // [TOT_TOKENS, ENTRIES...]
    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
@ -635,7 +634,6 @@ __global__ void gather_and_maybe_dequant_cache(
    const int32_t block_size, const int32_t entry_size,
    const int64_t block_table_stride, const int64_t cache_block_stride,
    const int64_t cache_entry_stride, const int64_t dst_entry_stride,
    const float* __restrict__ scale,
    const int32_t* __restrict__ seq_starts) {  // Optional: starting offsets per
                                               // batch
@ -677,16 +675,10 @@ __global__ void gather_and_maybe_dequant_cache(
    if (partial_block_size) full_blocks_end -= 1;
  }
-  auto copy_entry = [&](const cache_t* __restrict__ _src,
+  auto copy_entry = [&](const scalar_t* __restrict__ _src,
                        scalar_t* __restrict__ _dst) {
-    for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
+    for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
-      if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      _dst[i] = _src[i];
        _dst[i] = static_cast<scalar_t>(_src[i]);
      } else {
        _dst[i] =
            fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
      }
    }
  };
  for (int pid = split_start; pid < full_blocks_end; ++pid) {
@ -713,144 +705,8 @@ __global__ void gather_and_maybe_dequant_cache(
 }  // namespace vllm
 // Macro to dispatch the kernel based on the data type.
-// SCALAR_T is the data type of the destination tensor.
+#define CALL_GATHER_CACHE(CPY_DTYPE)                                    \
-// CACHE_T is the stored data type of kv-cache.
+  vllm::gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>(            \
 // KV_DTYPE is the real data type of kv-cache.
 #define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE)                      \
  vllm::gather_and_maybe_dequant_cache<SCALAR_T, CACHE_T, KV_DTYPE>         \
      <<<grid, block, 0, stream>>>(                                         \
          reinterpret_cast<CACHE_T*>(src_cache.data_ptr()),                 \
          reinterpret_cast<SCALAR_T*>(dst.data_ptr()),                      \
          block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
          block_size, entry_size, block_table_stride, cache_block_stride,   \
          cache_entry_stride, dst_entry_stride,                             \
          reinterpret_cast<const float*>(scale.data_ptr()), seq_starts_ptr);
 // Gather sequences from the cache into the destination tensor.
 //  - cu_seq_lens contains the cumulative sequence lengths for each batch
 //  - block_table contains the cache block indices for each sequence
 //  - Optionally, seq_starts (if provided) offsets the starting block index by
 //  (seq_starts[bid] / page_size)
 // Host-side launcher: validates the index tensors, derives the strides and
 // launch geometry, then dispatches the gather kernel through
 // DISPATCH_BY_KV_CACHE_DTYPE / CALL_GATHER_CACHE.
 //
 // NOTE: CALL_GATHER_CACHE refers to the locals below by name (grid, block,
 // stream, block_size, entry_size, the *_stride values, seq_starts_ptr) as
 // well as to the parameters src_cache, dst, block_table, cu_seq_lens and
 // scale, so these identifiers must not be renamed.
 void gather_and_maybe_dequant_cache(
    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
    torch::Tensor const& cu_seq_lens,  // [BATCH+1]
    int64_t batch_size, const std::string& kv_cache_dtype,
    torch::Tensor const& scale,
    std::optional<torch::Tensor> seq_starts = std::nullopt) {
  // Make src_cache's device current and launch on the current CUDA stream.
  at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // block_size is dimension 1 of the cache; entry_size is the element count
  // of all trailing dimensions (dim 2 onward) flattened together.
  int32_t block_size = src_cache.size(1);
  int32_t entry_size = src_cache.flatten(2, -1).size(2);
  // The index tensors are read as int32_t via data_ptr<int32_t>() below, so
  // their dtype is checked up front.
  TORCH_CHECK(block_table.dtype() == torch::kInt32,
              "block_table must be int32");
  TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
              "cu_seq_lens must be int32");
  if (seq_starts.has_value()) {
    TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
                "seq_starts must be int32");
  }
  // Every tensor handed to the kernel must live on src_cache's device.
  TORCH_CHECK(src_cache.device() == dst.device(),
              "src_cache and dst must be on the same device");
  TORCH_CHECK(src_cache.device() == block_table.device(),
              "src_cache and block_table must be on the same device");
  TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
              "src_cache and cu_seq_lens must be on the same device");
  if (seq_starts.has_value()) {
    TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
                "src_cache and seq_starts must be on the same device");
  }
  // Strides are taken from the tensors rather than assumed contiguous.
  int64_t block_table_stride = block_table.stride(0);
  int64_t cache_block_stride = src_cache.stride(0);
  int64_t cache_entry_stride = src_cache.stride(1);
  int64_t dst_entry_stride = dst.stride(0);
  // Decide on the number of splits based on the batch size.
  // (batch_size > 128 -> 2 splits, > 64 -> 4 splits, otherwise 16.)
  int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
  // One thread block of 1024 threads per (batch entry, split) pair.
  dim3 grid(batch_size, num_splits);
  dim3 block(1024);
  // nullptr tells the kernel that no per-batch starting offsets were given.
  const int32_t* seq_starts_ptr =
      seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
  // DISPATCH_BY_KV_CACHE_DTYPE is defined outside this view; presumably it
  // selects the SCALAR_T/CACHE_T/KV_DTYPE arguments for CALL_GATHER_CACHE
  // from dst.dtype() and kv_cache_dtype -- confirm against its definition.
  DISPATCH_BY_KV_CACHE_DTYPE(dst.dtype(), kv_cache_dtype, CALL_GATHER_CACHE);
 }
 namespace vllm {
 // Copies the cache entries of each sequence from a paged source cache into a
 // densely packed destination buffer.
 //
 // Work decomposition, as read from the indexing below: blockIdx.x selects
 // the batch entry, blockIdx.y selects one of gridDim.y contiguous slices of
 // that sequence's tokens, and the threads of a block step through the
 // entry_size elements of a token with a stride of blockDim.x.
 //
 // Parameters:
 //  - src_cache / dst: source cache and destination buffer (same element
 //    type; entries are copied verbatim).
 //  - block_table: per-batch list of cache block ids; row `bid` starts at
 //    block_table + bid * block_table_stride.
 //  - cu_seq_lens: cumulative sequence lengths; sequence `bid` occupies
 //    destination rows [cu_seq_lens[bid], cu_seq_lens[bid + 1]).
 //  - seq_starts: optional (may be nullptr) per-batch starting slot offset.
 //
 // Note(hc): The cp_gather_cache allows seq_starts to no longer be divisible by
 // block_size.
 template <typename scalar_t>
 __global__ void cp_gather_cache(
    const scalar_t* __restrict__ src_cache,   // [NUM_BLOCKS, BLOCK_SIZE,
                                              // ENTRY_SIZE]
    scalar_t* __restrict__ dst,               // [TOT_TOKENS, ENTRY_SIZE]
    const int32_t* __restrict__ block_table,  // [BATCH, BLOCK_INDICES]
    const int32_t* __restrict__ cu_seq_lens,  // [BATCH+1]
    const int32_t block_size, const int32_t entry_size,
    const int64_t block_table_stride, const int64_t cache_block_stride,
    const int64_t cache_entry_stride, const int64_t dst_entry_stride,
    const int32_t* __restrict__ seq_starts  // Optional: starting offsets per
                                            // batch
 ) {
  const int64_t bid = blockIdx.x;  // Batch ID
  const int32_t num_splits = gridDim.y;
  const int32_t split = blockIdx.y;
  const int32_t seq_start = cu_seq_lens[bid];
  const int32_t seq_end = cu_seq_lens[bid + 1];
  const int32_t seq_len = seq_end - seq_start;
  const int32_t tot_slots = seq_len;
  const int32_t split_slots = cuda_utils::ceil_div(tot_slots, num_splits);

  // Token range [split_start, split_end) handled by this split.
  const int32_t split_start = split * split_slots;
  const int32_t split_end = min((split + 1) * split_slots, tot_slots);

  // Trailing splits of short (or empty) sequences have nothing to copy.
  const bool is_active_split = (split_start < tot_slots);
  if (!is_active_split) return;

  // Adjust the pointer for the block_table for this batch.
  // The product of two 64-bit values is kept in 64 bits; storing it in an
  // int32_t would silently truncate for large batch * stride combinations.
  const int64_t batch_offset = bid * block_table_stride;

  // If seq_starts is provided, shift the first slot by the per-batch offset.
  int32_t offset = split_start;
  if (seq_starts != nullptr) {
    offset += seq_starts[bid];
  }
  // Split the slot index into (index into the block table, slot in block).
  int32_t offset_div = offset / block_size;
  offset = offset % block_size;
  const int32_t* batch_block_table = block_table + batch_offset;

  // Adjust dst pointer based on the cumulative sequence lengths.
  dst += seq_start * dst_entry_stride;

  // Copies one entry; each thread handles elements threadIdx.x,
  // threadIdx.x + blockDim.x, ...
  auto copy_entry = [&](const scalar_t* __restrict__ _src,
                        scalar_t* __restrict__ _dst) {
    for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
      _dst[i] = _src[i];
  };

  for (int pid = split_start; pid < split_end; ++pid) {
    auto block_id = batch_block_table[offset_div];
    auto block_start_ptr = src_cache + block_id * cache_block_stride;
    auto block_dst_ptr = dst + pid * dst_entry_stride;
    copy_entry(block_start_ptr + offset * cache_entry_stride, block_dst_ptr);
    offset += 1;
    // bump to next block
    if (offset == block_size) {
      offset_div += 1;
      offset = 0;
    }
  }
 }
 }  // namespace vllm
 // Macro to dispatch the kernel based on the data type.
 #define CALL_CP_GATHER_CACHE(CPY_DTYPE)                                 \
  vllm::cp_gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>(         \
      reinterpret_cast<CPY_DTYPE*>(src_cache.data_ptr()),               \
      reinterpret_cast<CPY_DTYPE*>(dst.data_ptr()),                     \
      block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
@ -860,9 +716,9 @@ __global__ void cp_gather_cache(
 // Gather sequences from the cache into the destination tensor.
 //  - cu_seq_lens contains the cumulative sequence lengths for each batch
 //  - block_table contains the cache block indices for each sequence
-//  - Optionally, seq_starts (if provided) offsets the starting slot index by
+//  - Optionally, seq_starts (if provided) offsets the starting block index by
-//  seq_starts[bid]
+//  (seq_starts[bid] / page_size)
-void cp_gather_cache(
+void gather_cache(
    torch::Tensor const& src_cache,    // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
    torch::Tensor const& dst,          // [TOT_TOKENS, ENTRIES...]
    torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES]
@ -913,11 +769,11 @@ void cp_gather_cache(
      seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
  if (dtype_bits == 32) {
-    CALL_CP_GATHER_CACHE(uint32_t);
+    CALL_GATHER_CACHE(uint32_t);
  } else if (dtype_bits == 16) {
-    CALL_CP_GATHER_CACHE(uint16_t);
+    CALL_GATHER_CACHE(uint16_t);
  } else if (dtype_bits == 8) {
-    CALL_CP_GATHER_CACHE(uint8_t);
+    CALL_GATHER_CACHE(uint8_t);
  } else {
    TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
  }
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@ -89,7 +89,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
  explicit FP16Vec16(const FP32Vec16&);
-  void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
+  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
  void save(void* ptr, const int elem_num) const {
    constexpr uint32_t M = 0xFFFFFFFF;
@ -126,7 +126,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
  explicit BF16Vec16(const FP32Vec16&);
-  void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
+  void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
  void save(void* ptr, const int elem_num) const {
    constexpr uint32_t M = 0xFFFFFFFF;
@ -180,8 +180,8 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
            (__m128i)vec8_data.reg, 1)) {}
  void save(void* ptr) const {
-    _mm256_storeu_si256((__m256i*)ptr, reg_low);
+    *reinterpret_cast<__m256i*>(ptr) = reg_low;
-    _mm256_storeu_si256((__m256i*)ptr + 1, reg_high);
+    *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high;
  }
 };
 #endif
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@ -1,523 +0,0 @@
 #include <list>
 #include <optional>
 #include "common/memory_desc.hpp"
 #include "common/memory.hpp"
 #include "dnnl_helper.h"
 // Returns the shared oneDNN CPU engine (index 0). The engine is a
 // function-local static, so it is created on the first call and reused by
 // every later call.
 static dnnl::engine& default_engine() {
  static dnnl::engine cpu_engine(dnnl::engine::kind::cpu, 0);
  return cpu_engine;
 }
 // Returns the shared oneDNN stream bound to default_engine(). Like the
 // engine, it is a function-local static created on first use.
 static dnnl::stream& default_stream() {
  static dnnl::stream cpu_stream(default_engine());
  return cpu_stream;
 }
 // Destroys a matmul handler that is passed around as an integer.
 // `handler` is reinterpreted as a DNNLMatMulPrimitiveHandler* and deleted;
 // presumably it was produced by casting a heap-allocated handler to int64_t
 // elsewhere -- confirm against the creation path.
 void release_dnnl_matmul_handler(int64_t handler) {
  delete reinterpret_cast<DNNLMatMulPrimitiveHandler*>(handler);
 }
 // Starts with no buffer and immediately reserves an initial scratchpad of
 // 128 allocation units.
 DNNLScratchPadManager::DNNLScratchPadManager() : size_(0), ptr_(nullptr) {
  const size_t initial_size = allocation_unit * 128;
  this->realloc(initial_size);
 }
 // Grows the scratchpad so that it holds at least `new_size` bytes.
 //
 // The request is first passed through round() (defined elsewhere;
 // presumably it rounds up to a multiple of allocation_unit -- confirm).
 // The buffer only ever grows: a request that fits in the current buffer is
 // a no-op. Existing contents are NOT preserved across a grow.
 void DNNLScratchPadManager::realloc(size_t new_size) {
  new_size = round(new_size);
  if (new_size <= size_) {
    return;
  }
  // Release the previous buffer before replacing it; overwriting ptr_
  // directly would leak it on every grow. std::free(nullptr) is a no-op, so
  // the very first allocation needs no special case.
  std::free(ptr_);
  ptr_ = std::aligned_alloc(64, new_size);
  size_ = new_size;
 }
 // Returns the single shared scratchpad manager, created on first call as a
 // function-local static.
 DNNLScratchPadManager* DNNLScratchPadManager::get_dnnl_scratchpad_manager() {
  static DNNLScratchPadManager instance;
  return &instance;
 }
 // Fixed-capacity cache with least-recently-used eviction.
 //
 // Entries live in a std::list ordered from most recently used (front) to
 // least recently used (back); an unordered_map from key to list iterator
 // provides the lookup. KT must be hashable and equality-comparable; VT is
 // returned by value from get_or_create().
 template <typename KT, typename VT>
 class DNNLPrimitiveCache {
 public:
  using cache_value_t = std::pair<KT, VT>;
  using result_value_t = VT;
  using container_t = std::list<cache_value_t>;
  using value_iterator_t = typename container_t::iterator;
  using map_t = std::unordered_map<KT, value_iterator_t>;
  using creator_t = VT (*)();

 public:
  // `capacity` is the maximum number of entries and must be positive.
  // The initial bucket count is capped at 256. The explicit <size_t> keeps
  // std::min well-formed on platforms where size_t is not `unsigned long`.
  DNNLPrimitiveCache(size_t capacity)
      : capacity_(capacity),
        values_(),
        key_to_value_(std::min<size_t>(256, capacity)) {
    assert(capacity > 0);
  }

  // Returns the value cached under `key`, marking it most recently used.
  // On a miss, `creator()` is invoked once, its result is inserted (evicting
  // the least recently used entry when the cache is full) and returned.
  template <typename F>
  result_value_t get_or_create(const KT& key, F&& creator) {
    if (std::optional<value_iterator_t> hit = get_value(key)) {
      return (*hit)->second;
    }
    return add_value({key, creator()})->second;
  }

  // Number of entries currently cached.
  size_t size() const { return values_.size(); }

 private:
  // Debug helper: prints the list and the map contents to stdout. It
  // requires VT to expose .get() (e.g. a smart pointer) and KT to be
  // streamable; it is only instantiated if called.
  void dump_data() {
    std::stringstream ss;
    ss << "table_id: " << std::hex << reinterpret_cast<size_t>(this) << std::dec
       << "\n";
    ss << "container: [";
    for (auto&& iter : values_) {
      ss << "(" << iter.first << ", " << std::hex
         << reinterpret_cast<size_t>(iter.second.get()) << "), " << std::dec;
    }
    ss << "]\n";

    ss << "map: [";
    for (auto&& iter : key_to_value_) {
      ss << "(" << iter.first << ", " << iter.second->first << ", " << std::hex
         << reinterpret_cast<size_t>(iter.second->second.get()) << std::dec
         << "), ";
    }
    ss << "]\n";
    std::printf("%s\n", ss.str().c_str());
  }

  // Inserts `new_value` at the front of the list. When the cache is already
  // at capacity the entry at the back (least recently used) is dropped first.
  value_iterator_t add_value(cache_value_t&& new_value) {
    if (size() == capacity_) {
      cache_value_t& last_item = values_.back();
      key_to_value_.erase(last_item.first);
      values_.pop_back();
    }

    auto& added_value_ = values_.emplace_front(std::move(new_value));
    key_to_value_.emplace(added_value_.first, values_.begin());
    return values_.begin();
  }

  // Looks `key` up and, on a hit, moves its entry to the front of the list.
  // Returns an empty optional on a miss.
  std::optional<value_iterator_t> get_value(const KT& key) {
    // Fast path: the most recently used entry is asked for again, so the
    // hash lookup and the splice can be skipped.
    if (key_to_value_.size() > 0 && key == values_.begin()->first) {
      return values_.begin();
    }

    auto value_map_iterator = key_to_value_.find(key);
    if (value_map_iterator != key_to_value_.end()) {
      // std::list::splice keeps iterators valid, so the iterator stored in
      // the map still refers to the (now front) entry.
      values_.splice(values_.begin(), values_, value_map_iterator->second);
      return value_map_iterator->second;
    } else {
      return {};
    }
  }

 private:
  const size_t capacity_;
  container_t values_;
  map_t key_to_value_;
 };
 // Base-handler constructor: records the layout of the B operand (sizes and
 // strides along N and K), its data type `b_type`, the output type and the
 // primitive cache size taken from `args`. No oneDNN objects are created here.
 DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler(
    const Args& args, dnnl::memory::data_type b_type)
    : b_n_size_(args.b_n_size),
      b_n_stride_(args.b_n_stride),
      b_k_size_(args.b_k_size),
      b_k_stride_(args.b_k_stride),
      b_type_(b_type),
      c_type_(args.c_type),
      // Presumably sizes the slot table with 8 entries; the code visible in
      // this file reads slots 0-5 -- confirm the container type in the header.
      runtime_memory_ptrs_(8),
      primitive_cache_size_(args.primitive_cache_size) {
  // A zero-sized primitive cache is treated as a programming error.
  assert(primitive_cache_size_ > 0);
 }
 void DNNLMatMulPrimitiveHandler::prepack_weight(
    void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) {
  dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
                                   {b_k_stride_, b_n_stride_});
  dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr);
  dnnl::memory packed_weight(b_target_mem_desc, default_engine());
  {
    dnnl::reorder(original_weight, packed_weight)
        .execute(default_stream(), original_weight, packed_weight);
    default_stream().wait();
  }
  memory_cache_[DNNL_ARG_WEIGHTS] = packed_weight;
  b_target_mem_desc_ = b_target_mem_desc;
 }
 void DNNLMatMulPrimitiveHandler::set_runtime_memory_ptr(
    size_t index, dnnl_memory* memory_ptr) {
  dnnl::impl::memory_storage_t* mem_storage_ptr = memory_ptr->memory_storage();
  dnnl_memory_desc* mem_desc = const_cast<dnnl_memory_desc*>(memory_ptr->md());
  runtime_memory_ptrs_[index] = {mem_storage_ptr, mem_desc};
 }
 // Returns the (storage, descriptor) pointer pair stored in slot `index`.
 // No bounds checking is performed.
 std::pair<dnnl::impl::memory_storage_t*, dnnl_memory_desc*>
 DNNLMatMulPrimitiveHandler::get_runtime_memory_ptr(size_t index) {
  auto slot = runtime_memory_ptrs_[index];
  return slot;
 }
 namespace std {
 // Hash support for the cache keys used by the matmul handlers. Each
 // specialization XOR-folds the std::hash of every field that takes part in
 // the key's operator==.

 // W8A8 class-level key: weight shape, quantization strategies, zero-point
 // usage and output type.
 template <>
 struct hash<W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey> {
  size_t operator()(
      const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
    size_t digest = hash<dnnl_dim_t>()(val.b_n_size);
    digest ^= hash<dnnl_dim_t>()(val.b_k_size);
    digest ^= hash<int>()(static_cast<int>(val.a_qs));
    digest ^= hash<int>()(static_cast<int>(val.b_qs));
    digest ^= hash<bool>()(val.use_azp);
    digest ^= hash<int>()(static_cast<int>(val.c_type));
    return digest;
  }
 };

 // W8A8 per-M key: row count, bias usage and bias type.
 template <>
 struct hash<W8A8MatMulPrimitiveHandler::MSizeCacheKey> {
  size_t operator()(
      const W8A8MatMulPrimitiveHandler::MSizeCacheKey& val) const {
    size_t digest = hash<dnnl_dim_t>()(val.a_m_size);
    digest ^= hash<bool>()(val.use_bias);
    digest ^= hash<int>()(static_cast<int>(val.bias_type));
    return digest;
  }
 };

 // Plain matmul class-level key: weight shape only.
 template <>
 struct hash<MatMulPrimitiveHandler::ClassMatmulCacheKey> {
  size_t operator()(
      const MatMulPrimitiveHandler::ClassMatmulCacheKey& val) const {
    size_t digest = hash<dnnl_dim_t>()(val.b_n_size);
    digest ^= hash<dnnl_dim_t>()(val.b_k_size);
    return digest;
  }
 };

 // Plain matmul per-M key: row count, row stride, bias usage and bias type.
 template <>
 struct hash<MatMulPrimitiveHandler::MSizeCacheKey> {
  size_t operator()(const MatMulPrimitiveHandler::MSizeCacheKey& val) const {
    size_t digest = hash<dnnl_dim_t>()(val.a_m_size);
    digest ^= hash<dnnl_dim_t>()(val.a_m_stride);
    digest ^= hash<bool>()(val.use_bias);
    digest ^= hash<int>()(static_cast<int>(val.bias_type));
    return digest;
  }
 };
 }  // namespace std
 // Two W8A8 class-level keys are equal when weight shape, both quantization
 // strategies, zero-point usage and output type all match.
 bool operator==(const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
                const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
  if (!(l.b_n_size == r.b_n_size)) return false;
  if (!(l.b_k_size == r.b_k_size)) return false;
  if (!(l.a_qs == r.a_qs)) return false;
  if (!(l.b_qs == r.b_qs)) return false;
  if (!(l.use_azp == r.use_azp)) return false;
  return l.c_type == r.c_type;
 }
 // Two W8A8 per-M keys are equal when bias usage, row count and bias type
 // all match.
 bool operator==(const W8A8MatMulPrimitiveHandler::MSizeCacheKey& l,
                const W8A8MatMulPrimitiveHandler::MSizeCacheKey& r) {
  if (!(l.use_bias == r.use_bias)) return false;
  if (!(l.a_m_size == r.a_m_size)) return false;
  return l.bias_type == r.bias_type;
 }
 // Two plain-matmul class-level keys are equal when the weight shape matches.
 bool operator==(const MatMulPrimitiveHandler::ClassMatmulCacheKey& l,
                const MatMulPrimitiveHandler::ClassMatmulCacheKey& r) {
  if (!(l.b_n_size == r.b_n_size)) return false;
  return l.b_k_size == r.b_k_size;
 }
 // Two plain-matmul per-M keys are equal when row count, row stride, bias
 // usage and bias type all match.
 bool operator==(const MatMulPrimitiveHandler::MSizeCacheKey& l,
                const MatMulPrimitiveHandler::MSizeCacheKey& r) {
  if (!(l.a_m_size == r.a_m_size)) return false;
  if (!(l.a_m_stride == r.a_m_stride)) return false;
  if (!(l.use_bias == r.use_bias)) return false;
  return l.bias_type == r.bias_type;
 }
 // Returns the per-M primitive cache shared by all W8A8 handlers that have
 // the same class-level `key`. The class-level cache holds up to 128 entries;
 // on a miss a new per-M cache with capacity `cache_size` is created.
 static std::shared_ptr<W8A8MatMulPrimitiveHandler::MSizeCache>
 get_w8a8_class_primitive_cache(
    const W8A8MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
    int64_t cache_size) {
  static W8A8MatMulPrimitiveHandler::ClassMatmulCache class_cache(128);
  assert(cache_size > 0);

  // Only invoked by get_or_create() when `key` is not cached yet.
  auto make_m_size_cache = [&]() {
    return std::make_shared<W8A8MatMulPrimitiveHandler::MSizeCache>(cache_size);
  };
  return class_cache.get_or_create(key, make_m_size_cache);
 }
 // Builds an int8 (s8 weight) matmul handler: validates the quantization
 // configuration, packs the weight into the layout requested by a primitive
 // descriptor created for a runtime-sized M, and sets up the runtime memory
 // cache.
 W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args)
    : DNNLMatMulPrimitiveHandler(
          static_cast<const DNNLMatMulPrimitiveHandler::Args&>(args),
          dnnl::memory::data_type::s8),
      use_azp_(args.use_a_zero_point),
      a_qs_(args.a_quantization_strategy),
      b_qs_(args.b_quantization_strategy),
      m_size_cache_(nullptr) {
  // Activations cannot be quantized per output channel and weights cannot be
  // quantized per token; per-token activations exclude zero points.
  assert(a_qs_ != QuantizationStrategy::PER_OUTPUT_CHANNEL);
  assert(b_qs_ != QuantizationStrategy::PER_TOKEN);
  if (a_qs_ == QuantizationStrategy::PER_TOKEN) {
    assert(!use_azp_);
  }

  // Key with a runtime M dimension and no bias, used only to obtain the
  // weight layout for prepacking.
  const MSizeCacheKey layout_key{.a_m_size = DNNL_RUNTIME_DIM_VAL,
                                 .use_bias = false,
                                 .bias_type = dnnl::memory::data_type::undef};
  const auto layout_desc = create_primitive_desc(layout_key, true);
  prepack_weight(args.b_ptr, layout_desc.weights_desc());

  init_runtime_memory_cache(args);
 }
 // Runs one int8 matmul for the buffers described by `args`.
 //
 // The dnnl::memory objects stored in memory_cache_ are reused across calls:
 // this function only re-points their data handles and patches the M
 // dimension of the src/dst descriptors in place, then executes the cached
 // primitive and waits on the default stream.
 //
 // Slot indices are the ones registered in init_runtime_memory_cache():
 //   0 = src, 1 = dst, 2 = src scales, 3 = src zero points, 4 = bias,
 //   5 = scratchpad.
 void W8A8MatMulPrimitiveHandler::execute(ExecArgs& args) {
   auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
   auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
   a_storage->set_data_handle((void*)args.a_ptr);
   a_mem_desc->dims[0] = args.a_m_size;
   c_storage->set_data_handle((void*)args.c_ptr);
   c_mem_desc->dims[0] = args.a_m_size;
   // Source scales are only passed to oneDNN for per-tensor activation
   // quantization.
   if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
     auto&& [a_scale_storage, a_scale_mem_desc] = get_runtime_memory_ptr(2);
     a_scale_storage->set_data_handle((void*)args.a_scales_ptr);
   }
   // NOTE(review): slot 3 is only registered when a_qs_ is PER_TENSOR; the
   // constructor asserts that PER_TOKEN implies !use_azp_, which keeps this
   // lookup valid as long as asserts reflect the real invariant.
   if (use_azp_) {
     auto&& [a_zero_point_storage, a_zero_point_mem_desc] =
         get_runtime_memory_ptr(3);
     a_zero_point_storage->set_data_handle((void*)args.a_zero_points_ptr);
   }
   if (args.use_bias) {
     auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(4);
     bias_storage->set_data_handle((void*)args.bias_ptr);
   }
   // ExecArgs derives from MSizeCacheKey, so `args` doubles as the cache key.
   // Fetching the primitive may grow the scratchpad (see get_matmul_cache),
   // so the scratchpad pointer is read only afterwards.
   dnnl::matmul matmul = get_matmul_cache(args);
   auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(5);
   scratchpad_storage->set_data_handle(
       DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
   matmul.execute(default_stream(), memory_cache_);
   default_stream().wait();
 }
 // Returns the matmul primitive for `key`, creating and caching it on a miss.
 // The per-M-size cache is fetched lazily on first use from the class-level
 // cache keyed by weight shape, quantization settings and output type.
 dnnl::matmul W8A8MatMulPrimitiveHandler::get_matmul_cache(
     const MSizeCacheKey& key) {
   if (!m_size_cache_) {
     const ClassMatmulCacheKey class_key{.b_n_size = b_n_size_,
                                         .b_k_size = b_k_size_,
                                         .a_qs = a_qs_,
                                         .b_qs = b_qs_,
                                         .use_azp = use_azp_,
                                         .c_type = c_type_};
     m_size_cache_ =
         get_w8a8_class_primitive_cache(class_key, primitive_cache_size_);
   }

   // Builds a new primitive and makes sure the shared scratchpad is large
   // enough for it.
   auto build_primitive = [&]() {
     dnnl::matmul::primitive_desc desc =
         this->create_primitive_desc(key, false);
     DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(
         desc.scratchpad_desc().get_size());
     return dnnl::matmul(desc);
   };
   return m_size_cache_->get_or_create(key, build_primitive);
 }
 // Creates the dnnl::memory objects passed to every execute() call and
 // registers the ones whose data handle / dimensions are patched per call.
 // Registered slots: 0 = src, 1 = dst, 2 = src scales, 3 = src zero points,
 // 4 = bias, 5 = scratchpad. Weight scales are bound once here to
 // args.b_scales_ptr and are not registered as a runtime slot.
 void W8A8MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
   // src/dst are created with a placeholder M of 1 and a null handle;
   // execute() overwrites dims[0] and the data handle.
   memory_cache_[DNNL_ARG_SRC] = dnnl::memory({{1, b_k_size_},
                                               dnnl::memory::data_type::s8,
                                               dnnl::memory::format_tag::ab},
                                              default_engine(), nullptr);
   set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
   memory_cache_[DNNL_ARG_DST] =
       dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
   // For PER_TOKEN, scales will be applied in outside epilogue
   if (a_qs_ == QuantizationStrategy::PER_TENSOR) {
     memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = dnnl::memory(
         {{1}, dnnl::memory::data_type::f32, {1}}, default_engine(), nullptr);
     set_runtime_memory_ptr(
         2, memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC].get());
     if (use_azp_) {
       memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC] = dnnl::memory(
           {{1}, dnnl::memory::data_type::s32, {1}}, default_engine(), nullptr);
       set_runtime_memory_ptr(
           3, memory_cache_[DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC].get());
     }
   }
   // Weight scales: a single value for PER_TENSOR, b_n_size_ values for
   // PER_OUTPUT_CHANNEL.
   if (b_qs_ == QuantizationStrategy::PER_TENSOR) {
     memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] =
         dnnl::memory({{1}, dnnl::memory::data_type::f32, {1}}, default_engine(),
                      (void*)args.b_scales_ptr);
   } else if (b_qs_ == QuantizationStrategy::PER_OUTPUT_CHANNEL) {
     memory_cache_[DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS] =
         dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                      default_engine(), (void*)args.b_scales_ptr);
   }
   // NOTE(review): the bias memory is described as f32 here, while the
   // primitive descriptor uses key.bias_type; presumably only the data handle
   // matters at execute time -- confirm.
   memory_cache_[DNNL_ARG_BIAS] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(4, memory_cache_[DNNL_ARG_BIAS].get());
   // NOTE(review): the scratchpad descriptor ({b_n_size_}, f32) looks like a
   // placeholder; execute() only swaps in the shared scratchpad pointer.
   memory_cache_[DNNL_ARG_SCRATCHPAD] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(5, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }
 // Builds the oneDNN matmul primitive descriptor for the int8 path.
 //   key        - M size plus bias usage/type for this primitive.
 //   first_time - when true the weight layout is left to oneDNN
 //                (format_tag::any); otherwise b_target_mem_desc_ is reused.
 // The scratchpad is always user-managed.
 dnnl::matmul::primitive_desc W8A8MatMulPrimitiveHandler::create_primitive_desc(
     const MSizeCacheKey& key, bool first_time) {
   using data_type = dnnl::memory::data_type;
   using format_tag = dnnl::memory::format_tag;

   dnnl::memory::desc src_md({key.a_m_size, b_k_size_}, data_type::s8,
                             format_tag::ab);
   dnnl::memory::desc weight_md =
       first_time ? dnnl::memory::desc({b_k_size_, b_n_size_}, data_type::s8,
                                       format_tag::any)
                  : b_target_mem_desc_;
   dnnl::memory::desc dst_md({key.a_m_size, b_n_size_}, c_type_,
                             format_tag::ab);

   dnnl::primitive_attr attr;
   attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);

   // For PER_TOKEN, scales will be applied in outside epilogue
   const bool src_per_tensor = (a_qs_ == QuantizationStrategy::PER_TENSOR);
   if (src_per_tensor) {
     attr.set_scales_mask(DNNL_ARG_SRC, 0);
   }
   if (src_per_tensor && use_azp_) {
     attr.set_zero_points_mask(DNNL_ARG_SRC, 0);
   }

   switch (b_qs_) {
     case QuantizationStrategy::PER_TENSOR:
       attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
       break;
     case QuantizationStrategy::PER_OUTPUT_CHANNEL:
       attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
       break;
     default:
       break;
   }

   if (!key.use_bias) {
     return dnnl::matmul::primitive_desc(default_engine(), src_md, weight_md,
                                         dst_md, attr);
   }

   // For PER_TOKEN, bias will be applied in epilogue
   assert(a_qs_ == QuantizationStrategy::PER_TENSOR);
   dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
   return dnnl::matmul::primitive_desc(default_engine(), src_md, weight_md,
                                       bias_md, dst_md, attr);
 }
 // Builds a floating-point matmul handler: forwards the shared arguments to
 // the base class with args.ab_type as the weight data type, prepacks the
 // weight using the layout chosen for a primitive descriptor with runtime M
 // size/stride and no bias, then creates the reusable runtime memory objects.
 MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
     : DNNLMatMulPrimitiveHandler(
           // Bind to the base sub-object by reference (as the W8A8 handler
           // does) rather than slicing a temporary copy of the arguments.
           static_cast<const DNNLMatMulPrimitiveHandler::Args&>(args),
           args.ab_type),
       m_size_cache_(nullptr) {
   // Only f32, bf16 and f16 inputs are accepted. The check reads
   // args.ab_type because neither this class nor its base declares an
   // `ab_type_` member, so an assert on that name cannot compile unless
   // NDEBUG removes it.
   assert(args.ab_type == dnnl::memory::data_type::f32 ||
          args.ab_type == dnnl::memory::data_type::bf16 ||
          args.ab_type == dnnl::memory::data_type::f16);
   prepack_weight(args.b_ptr,
                  create_primitive_desc(
                      MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL,
                                    .a_m_stride = DNNL_RUNTIME_DIM_VAL,
                                    .use_bias = false,
                                    .bias_type = dnnl::memory::data_type::undef},
                      true)
                      .weights_desc());
   init_runtime_memory_cache(args);
 }
 // Returns the per-M-size primitive cache associated with `key`, creating it
 // with capacity `cache_size` on first request. The outer cache is a
 // function-local static constructed with 128.
 static std::shared_ptr<MatMulPrimitiveHandler::MSizeCache>
 get_matul_class_primitive_cache(
     const MatMulPrimitiveHandler::ClassMatmulCacheKey& key,
     int64_t cache_size) {
   using MSizeCache = MatMulPrimitiveHandler::MSizeCache;
   static MatMulPrimitiveHandler::ClassMatmulCache cache(128);
   assert(cache_size > 0);
   // Factory invoked by the cache only when `key` has no entry yet.
   auto make_m_size_cache = [&]() {
     return std::make_shared<MSizeCache>(cache_size);
   };
   return cache.get_or_create(key, make_m_size_cache);
 }
 // Runs one floating-point matmul for the buffers described by `args`.
 //
 // The dnnl::memory objects stored in memory_cache_ are reused across calls:
 // this function re-points their data handles, patches the M dimension of
 // src/dst and the row stride of src in place, then executes the cached
 // primitive and waits on the default stream.
 //
 // Slot indices are the ones registered in init_runtime_memory_cache():
 //   0 = src, 1 = dst, 2 = bias, 3 = scratchpad.
 void MatMulPrimitiveHandler::execute(ExecArgs& args) {
   auto&& [a_storage, a_mem_desc] = get_runtime_memory_ptr(0);
   auto&& [c_storage, c_mem_desc] = get_runtime_memory_ptr(1);
   a_storage->set_data_handle((void*)args.a_ptr);
   a_mem_desc->dims[0] = args.a_m_size;
   // The source rows may be strided; the row stride comes from the caller.
   a_mem_desc->format_desc.blocking.strides[0] = args.a_m_stride;
   c_storage->set_data_handle((void*)args.c_ptr);
   c_mem_desc->dims[0] = args.a_m_size;
   if (args.use_bias) {
     auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2);
     bias_storage->set_data_handle((void*)args.bias_ptr);
   }
   // ExecArgs derives from MSizeCacheKey, so `args` doubles as the cache key.
   // Fetching the primitive may grow the scratchpad (see get_matmul_cache),
   // so the scratchpad pointer is read only afterwards.
   dnnl::matmul matmul = get_matmul_cache(args);
   auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3);
   scratchpad_storage->set_data_handle(
       DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>());
   matmul.execute(default_stream(), memory_cache_);
   default_stream().wait();
 }
 // Returns the matmul primitive for `key`, creating and caching it on a miss.
 // The per-M-size cache is fetched lazily on first use from the class-level
 // cache keyed by the weight dimensions.
 dnnl::matmul MatMulPrimitiveHandler::get_matmul_cache(
     const MSizeCacheKey& key) {
   if (!m_size_cache_) {
     const ClassMatmulCacheKey class_key{.b_n_size = b_n_size_,
                                         .b_k_size = b_k_size_};
     m_size_cache_ =
         get_matul_class_primitive_cache(class_key, primitive_cache_size_);
   }

   // Builds a new primitive and makes sure the shared scratchpad is large
   // enough for it.
   auto build_primitive = [&]() {
     dnnl::matmul::primitive_desc desc =
         this->create_primitive_desc(key, false);
     DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(
         desc.scratchpad_desc().get_size());
     return dnnl::matmul(desc);
   };
   return m_size_cache_->get_or_create(key, build_primitive);
 }
 // Builds the oneDNN matmul primitive descriptor for the floating-point path.
 //   key        - M size/stride plus bias usage/type for this primitive.
 //   first_time - when true the source uses the plain `ab` layout and the
 //                weight layout is left to oneDNN (format_tag::any);
 //                otherwise the source uses strides {a_m_stride, 1} and the
 //                weight reuses b_target_mem_desc_.
 // The scratchpad is always user-managed.
 dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc(
     const MSizeCacheKey& key, bool first_time) {
   using format_tag = dnnl::memory::format_tag;

   dnnl::memory::desc src_md =
       first_time
           ? dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
                                format_tag::ab)
           : dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_,
                                {key.a_m_stride, 1});
   dnnl::memory::desc weight_md =
       first_time ? dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_,
                                       format_tag::any)
                  : b_target_mem_desc_;
   dnnl::memory::desc dst_md({key.a_m_size, b_n_size_}, c_type_,
                             format_tag::ab);

   dnnl::primitive_attr attr;
   attr.set_scratchpad_mode(dnnl::scratchpad_mode::user);

   if (!key.use_bias) {
     return dnnl::matmul::primitive_desc(default_engine(), src_md, weight_md,
                                         dst_md, attr);
   }

   dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1});
   return dnnl::matmul::primitive_desc(default_engine(), src_md, weight_md,
                                       bias_md, dst_md, attr);
 }
 // Creates the dnnl::memory objects passed to every execute() call and
 // registers them as runtime slots whose data handle / dimensions are
 // patched per call: 0 = src, 1 = dst, 2 = bias, 3 = scratchpad.
 // `args` is not read in this function.
 void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) {
   // src/dst are created with a placeholder M of 1 and a null handle;
   // execute() overwrites dims[0], the src row stride and the data handle.
   memory_cache_[DNNL_ARG_SRC] = dnnl::memory(
       {{1, b_k_size_}, b_type_, {b_k_size_, 1}}, default_engine(), nullptr);
   set_runtime_memory_ptr(0, memory_cache_[DNNL_ARG_SRC].get());
   memory_cache_[DNNL_ARG_DST] =
       dnnl::memory({{1, b_n_size_}, c_type_, dnnl::memory::format_tag::ab},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get());
   // NOTE(review): the bias memory is described as f32 here, while the
   // primitive descriptor uses key.bias_type; presumably only the data handle
   // matters at execute time -- confirm.
   memory_cache_[DNNL_ARG_BIAS] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get());
   // NOTE(review): the scratchpad descriptor ({b_n_size_}, f32) looks like a
   // placeholder; execute() only swaps in the shared scratchpad pointer.
   memory_cache_[DNNL_ARG_SCRATCHPAD] =
       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}},
                    default_engine(), nullptr);
   set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get());
 }
--- a/csrc/cpu/dnnl_helper.h
+++ b/csrc/cpu/dnnl_helper.h
@ -1,243 +0,0 @@
 #ifndef DNNL_HELPER_H
 #define DNNL_HELPER_H
 #include <optional>
 #include <cassert>
 #include "oneapi/dnnl/dnnl.hpp"
 // Forward declarations of the c10 half-precision types used only as
 // template arguments of DNNLType below.
 namespace c10 {
 struct BFloat16;
 struct Half;
 }  // namespace c10
 // Forward declarations of oneDNN internal types; this header only refers to
 // them through pointers.
 namespace dnnl {
 namespace impl {
 struct memory_storage_t;
 struct matmul_pd_t;
 struct matmul_desc_t;
 }  // namespace impl
 }  // namespace dnnl
 struct dnnl_memory_desc;
 // Cache template used by the handler classes below; only declared here.
 template <typename KT, typename VT>
 class DNNLPrimitiveCache;
 // Compile-time mapping from a C++ element type to the matching oneDNN data
 // type. The primary template yields data_type::undef for any type without
 // a specialization.
 template <typename T>
 struct DNNLType {
   static constexpr dnnl::memory::data_type type =
       dnnl::memory::data_type::undef;
 };
 // int8_t -> s8
 template <>
 struct DNNLType<int8_t> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
 };
 // int32_t -> s32
 template <>
 struct DNNLType<int32_t> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
 };
 // float -> f32
 template <>
 struct DNNLType<float> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
 };
 // c10::BFloat16 -> bf16
 template <>
 struct DNNLType<c10::BFloat16> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
 };
 // c10::Half -> f16
 template <>
 struct DNNLType<c10::Half> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
 };
 // Returns the oneDNN data type registered in DNNLType for T, after
 // stripping references and cv-qualifiers with std::decay_t.
 template <typename T>
 constexpr inline dnnl::memory::data_type get_dnnl_type() {
   using ElemT = std::decay_t<T>;
   return DNNLType<ElemT>::type;
 }
 // Owns the scratchpad buffer handed to oneDNN primitives that are created
 // with a user-managed scratchpad. Instances are reached through
 // get_dnnl_scratchpad_manager(); the constructor and realloc() are defined
 // out of line.
 class DNNLScratchPadManager {
  public:
   // Allocation granularity in bytes. 4 * 1024 * 1024 bytes is 4 MiB (the
   // value is unchanged; an earlier comment mislabelled it as "4KB").
   static constexpr size_t allocation_unit = 4 * 1024 * 1024;  // 4 MiB
   // Returns the manager used by the matmul handlers.
   static DNNLScratchPadManager* get_dnnl_scratchpad_manager();
   DNNLScratchPadManager();
   // Returns the current buffer pointer reinterpreted as T*.
   template <typename T>
   T* get_data() {
     return reinterpret_cast<T*>(ptr_);
   }
   // Rounds `size` up to the next multiple of allocation_unit
   // (0 stays 0; exact multiples are returned unchanged).
   static constexpr size_t round(size_t size) noexcept {
     return ((size + allocation_unit - 1) / allocation_unit) * allocation_unit;
   }
   // Requests a buffer of at least `new_size` bytes (defined out of line).
   void realloc(size_t new_size);
  private:
   size_t size_;
   void* ptr_;
 };
 // Common base for the oneDNN matmul handlers. It stores the weight shape
 // and data types, the dnnl::memory objects passed to execute(), and a table
 // of raw pointers into those objects so that derived classes can patch
 // data handles and dimensions per call.
 class DNNLMatMulPrimitiveHandler {
  public:
   virtual ~DNNLMatMulPrimitiveHandler() = default;
  protected:
   // Construction arguments shared by all handlers.
   struct Args {
     dnnl_dim_t b_n_size;    // weight N dimension
     dnnl_dim_t b_n_stride;  // weight stride along N
     dnnl_dim_t b_k_size;    // weight K dimension
     dnnl_dim_t b_k_stride;  // weight stride along K
     void* b_ptr;            // original (not yet prepacked) weight data
     dnnl::memory::data_type c_type;  // output data type
     // NOTE(review): stored into the int64_t member primitive_cache_size_.
     size_t primitive_cache_size;
   };
  protected:
   DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type);
   // Prepacks the weight at `original_b_ptr` for the layout described by
   // `b_target_mem_desc` (defined out of line).
   void prepack_weight(void* original_b_ptr,
                       dnnl::memory::desc b_target_mem_desc);
   // Registers `memory_ptr` under runtime slot `index`.
   void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr);
   // Returns the storage / descriptor pointers registered for slot `index`.
   std::pair<dnnl::impl::memory_storage_t*, dnnl_memory_desc*>
   get_runtime_memory_ptr(size_t index);
  protected:
   const dnnl_dim_t b_n_size_;
   const dnnl_dim_t b_n_stride_;
   const dnnl_dim_t b_k_size_;
   const dnnl_dim_t b_k_stride_;
   dnnl::memory::data_type b_type_;
   dnnl::memory::data_type c_type_;
   // Execution arguments keyed by DNNL_ARG_* id; passed to matmul.execute().
   std::unordered_map<int, dnnl::memory> memory_cache_;
   // Raw pointers into memory_cache_ entries, indexed by runtime slot.
   std::vector<std::pair<dnnl::impl::memory_storage_t*, dnnl_memory_desc*>>
       runtime_memory_ptrs_;
   // Weight descriptor reused when primitives are created after prepacking.
   dnnl::memory::desc b_target_mem_desc_;
   int64_t primitive_cache_size_;
 };
 // Matmul handler for int8 weights and int8 activations (the base class is
 // constructed with data_type::s8), with configurable quantization
 // strategies for both operands.
 class W8A8MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
  public:
   enum class QuantizationStrategy { PER_TOKEN, PER_TENSOR, PER_OUTPUT_CHANNEL };
   // Construction arguments in addition to the shared base arguments.
   struct Args : public DNNLMatMulPrimitiveHandler::Args {
     bool use_a_zero_point;
     QuantizationStrategy a_quantization_strategy;
     QuantizationStrategy b_quantization_strategy;
     float* b_scales_ptr;  // weight scales, bound once at construction
   };
   // Key of the class-level cache: handlers with equal keys share one
   // per-M-size primitive cache.
   struct ClassMatmulCacheKey {
     dnnl_dim_t b_n_size;
     dnnl_dim_t b_k_size;
     QuantizationStrategy a_qs;
     QuantizationStrategy b_qs;
     bool use_azp;
     dnnl::memory::data_type c_type;
     friend bool operator==(const ClassMatmulCacheKey& l,
                            const ClassMatmulCacheKey& r);
   };
   // Key of the per-M-size cache: one primitive per (M, bias usage, bias type).
   struct MSizeCacheKey {
     dnnl_dim_t a_m_size;
     bool use_bias;
     dnnl::memory::data_type bias_type;
     friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
   };
   using MSizeCache = DNNLPrimitiveCache<MSizeCacheKey, dnnl::matmul>;
   using ClassMatmulCache =
       DNNLPrimitiveCache<ClassMatmulCacheKey, std::shared_ptr<MSizeCache>>;
   // Per-call arguments; inherits the cache key fields so the object can be
   // passed directly as the MSizeCacheKey.
   struct ExecArgs : public MSizeCacheKey {
     const int8_t* a_ptr;
     const float* a_scales_ptr;
     const int32_t* a_zero_points_ptr;
     const void* bias_ptr;
     void* c_ptr;
   };
  public:
   W8A8MatMulPrimitiveHandler(const Args& args);
   QuantizationStrategy get_input_scale_strategy() const { return a_qs_; }
   bool get_input_use_zero_point() const { return use_azp_; }
   void execute(ExecArgs& args);
  private:
   // `first_time` selects the prepacking descriptor (weight layout chosen by
   // oneDNN) instead of the already prepacked weight layout.
   dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
                                                      bool first_time);
   void init_runtime_memory_cache(const Args& args);
   dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
  private:
   const bool use_azp_;
   const QuantizationStrategy a_qs_;
   const QuantizationStrategy b_qs_;
   // Fetched lazily from the class-level cache on first get_matmul_cache().
   std::shared_ptr<MSizeCache> m_size_cache_;
 };
 // Matmul handler for non-quantized inputs; the constructor accepts f32,
 // bf16 and f16 as `ab_type`.
 class MatMulPrimitiveHandler : public DNNLMatMulPrimitiveHandler {
  public:
   // Construction arguments in addition to the shared base arguments.
   struct Args : public DNNLMatMulPrimitiveHandler::Args {
     dnnl::memory::data_type ab_type;  // data type of both A and B
   };
   // Key of the class-level cache: handlers with equal weight dimensions
   // share one per-M-size primitive cache.
   struct ClassMatmulCacheKey {
     dnnl_dim_t b_n_size;
     dnnl_dim_t b_k_size;
     friend bool operator==(const ClassMatmulCacheKey& l,
                            const ClassMatmulCacheKey& r);
   };
   // Key of the per-M-size cache: one primitive per
   // (M, M stride, bias usage, bias type).
   struct MSizeCacheKey {
     dnnl_dim_t a_m_size;
     dnnl_dim_t a_m_stride;
     bool use_bias;
     dnnl::memory::data_type bias_type;
     friend bool operator==(const MSizeCacheKey& l, const MSizeCacheKey& r);
   };
   using MSizeCache = DNNLPrimitiveCache<MSizeCacheKey, dnnl::matmul>;
   using ClassMatmulCache =
       DNNLPrimitiveCache<ClassMatmulCacheKey, std::shared_ptr<MSizeCache>>;
   // Per-call arguments; inherits the cache key fields so the object can be
   // passed directly as the MSizeCacheKey.
   struct ExecArgs : public MSizeCacheKey {
     const void* a_ptr;
     const void* bias_ptr;
     void* c_ptr;
   };
  public:
   MatMulPrimitiveHandler(const Args& args);
   void execute(ExecArgs& args);
  private:
   // `first_time` selects the prepacking descriptor (weight layout chosen by
   // oneDNN) instead of the already prepacked weight layout.
   dnnl::matmul::primitive_desc create_primitive_desc(const MSizeCacheKey& key,
                                                      bool first_time);
   void init_runtime_memory_cache(const Args& args);
   dnnl::matmul get_matmul_cache(const MSizeCacheKey& key);
  private:
   // Fetched lazily from the class-level cache on first get_matmul_cache().
   std::shared_ptr<MSizeCache> m_size_cache_;
 };
 #endif
--- a/csrc/cpu/dnnl_helper.hpp
+++ b/csrc/cpu/dnnl_helper.hpp
@ -0,0 +1,206 @@
 #ifndef DNNL_HELPER_HPP
 #define DNNL_HELPER_HPP
 #include <c10/util/BFloat16.h>
 #include <c10/util/Half.h>
 #include "oneapi/dnnl/dnnl.hpp"
 // NOTE(review): this is an anonymous namespace inside a header, so every
 // translation unit that includes the header gets its own copy of these
 // entities.
 namespace {
 // Compile-time mapping from a C++ element type to the matching oneDNN data
 // type. The primary template yields data_type::undef for any type without
 // a specialization.
 template <typename T>
 struct DNNLType {
   static constexpr dnnl::memory::data_type type =
       dnnl::memory::data_type::undef;
 };
 // int8_t -> s8
 template <>
 struct DNNLType<int8_t> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
 };
 // int32_t -> s32
 template <>
 struct DNNLType<int32_t> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
 };
 // float -> f32
 template <>
 struct DNNLType<float> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
 };
 // c10::BFloat16 -> bf16
 template <>
 struct DNNLType<c10::BFloat16> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
 };
 // c10::Half -> f16
 template <>
 struct DNNLType<c10::Half> {
   static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
 };
 // Returns the oneDNN data type registered for T after std::decay_t strips
 // references and cv-qualifiers.
 template <typename T>
 constexpr inline dnnl::memory::data_type get_dnnl_type() {
   return DNNLType<std::decay_t<T>>::type;
 }
 };  // namespace
 // Stateless helper that builds and runs a oneDNN int8 matmul on every call.
 // InputNoScale = true omits the source (activation) scales from both the
 // primitive attributes and the execution arguments.
 template <bool InputNoScale>
 class DNNLPrimitiveHelper {
  public:
   // I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
   // A: [M, K], row-major
   // B: [K, N], column-major
   // C: [M, N], row-major
   // bias: [N], row-major, optional
   // a_scales: [MS]
   // b_scales: [NS]
   // Note: Due to the limitation of oneDNN
   // (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
   // not supported.
   //
   // `bias` may be null, in which case no bias argument is passed.
   // MS must be 1 (per-tensor) unless InputNoScale is set; NS == 1 selects
   // per-tensor weight scales, any other value selects per-channel scales.
   // The call waits on the default stream before returning.
   template <typename OutputT, typename BiasT>
   static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
                             const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
                             dnnl_dim_t K, const float* a_scales,
                             const float* b_scales, dnnl_dim_t MS,
                             dnnl_dim_t NS) {
     auto&& OutputType = get_dnnl_type<OutputT>();
     auto&& BiasType = get_dnnl_type<BiasT>();
     // Descriptors of the caller's buffers: A row-major, B column-major
     // (strides {1, K}), C row-major.
     dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
     dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
     dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
     dnnl::primitive_attr attr;
     if constexpr (!InputNoScale) {
       if (MS == 1) {
         // per-tensor
         attr.set_scales_mask(DNNL_ARG_SRC, 0);
       } else {
         // per-token
         TORCH_CHECK(false, "per-token quantization is unsupported.");
       }
     }
     if (NS == 1) {
       // per-tensor
       attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
     } else {
       // per-channel
       attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
     }
     dnnl::matmul::primitive_desc matmul_pd;
 // Create memory descriptors with format_tag::any for the primitive. This
 // enables the matmul primitive to choose memory layouts for an
 // optimized primitive implementation, and these layouts may differ from the
 // ones provided by the user.
 #ifdef __aarch64__
     auto mat_src_md = dnnl::memory::desc({M, K}, dnnl::memory::data_type::s8,
                                          dnnl::memory::format_tag::any);
     auto mat_weights_md = dnnl::memory::desc(
         {K, N}, dnnl::memory::data_type::s8, dnnl::memory::format_tag::any);
     auto mat_dst_md =
         dnnl::memory::desc({M, N}, OutputType, dnnl::memory::format_tag::any);
     if (bias) {
       dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
       matmul_pd = dnnl::matmul::primitive_desc(default_engine(), mat_src_md,
                                                mat_weights_md, bias_md,
                                                mat_dst_md, attr);
     } else {
       matmul_pd = dnnl::matmul::primitive_desc(
           default_engine(), mat_src_md, mat_weights_md, mat_dst_md, attr);
     }
 #else
     // Non-aarch64: the primitive is created directly on the user layouts.
     if (bias) {
       dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
       matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
                                                bias_md, c_md, attr);
     } else {
       matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
                                                c_md, attr);
     }
 #endif
     dnnl::matmul matmul(matmul_pd);
     auto& engine = default_engine();
     // Wrap the caller's buffers without copying.
     dnnl::memory a_m(a_md, engine, (void*)a);
     dnnl::memory b_m(b_md, engine, (void*)b);
     dnnl::memory c_m(c_md, engine, (void*)c);
     // NOTE(review): a_scales_m is constructed even when InputNoScale is set;
     // it is only passed to execute() in the !InputNoScale branches.
     dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
                             (void*)a_scales);
     dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
                             (void*)b_scales);
     auto& stream = default_stream();
     auto mat_src_mem = a_m;
     auto mat_weights_mem = b_m;
     auto mat_dst_mem = c_m;
 #ifdef __aarch64__
     // Reorder the weights when the primitive picked a different layout.
     // NOTE(review): src and dst were also created with format_tag::any but
     // are not reordered here -- presumably the primitive keeps the plain
     // layouts for them; confirm.
     if (matmul_pd.weights_desc() != b_m.get_desc()) {
       mat_weights_mem = dnnl::memory(matmul_pd.weights_desc(), engine);
       dnnl::reorder(b_m, mat_weights_mem).execute(stream, b_m, mat_weights_mem);
     }
 #endif
     // The four execute() calls below differ only in whether the bias and the
     // source scales are part of the argument map.
     if constexpr (InputNoScale) {
       if (bias) {
         dnnl::memory::desc bias_md({N}, BiasType, {1});
         dnnl::memory bias_m(bias_md, engine, (void*)bias);
         matmul.execute(
             stream, {
                         {DNNL_ARG_SRC, mat_src_mem},
                         {DNNL_ARG_WEIGHTS, mat_weights_mem},
                         {DNNL_ARG_BIAS, bias_m},
                         {DNNL_ARG_DST, mat_dst_mem},
                         {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                     });
       } else {
         matmul.execute(
             stream, {
                         {DNNL_ARG_SRC, mat_src_mem},
                         {DNNL_ARG_WEIGHTS, mat_weights_mem},
                         {DNNL_ARG_DST, mat_dst_mem},
                         {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                     });
       }
     } else {
       if (bias) {
         dnnl::memory::desc bias_md({N}, BiasType, {1});
         dnnl::memory bias_m(bias_md, engine, (void*)bias);
         matmul.execute(
             stream, {
                         {DNNL_ARG_SRC, mat_src_mem},
                         {DNNL_ARG_WEIGHTS, mat_weights_mem},
                         {DNNL_ARG_BIAS, bias_m},
                         {DNNL_ARG_DST, mat_dst_mem},
                         {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                         {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                     });
       } else {
         matmul.execute(
             stream, {
                         {DNNL_ARG_SRC, mat_src_mem},
                         {DNNL_ARG_WEIGHTS, mat_weights_mem},
                         {DNNL_ARG_DST, mat_dst_mem},
                         {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
                         {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
                     });
       }
     }
     stream.wait();
   }
  private:
   // CPU engine (index 0), created on first use.
   static dnnl::engine& default_engine() {
     static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
     return engine;
   }
   // Stream on the default engine, created on first use.
   static dnnl::stream& default_stream() {
     static dnnl::stream stream(default_engine());
     return stream;
   }
 };
 #endif
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@ -1,549 +0,0 @@
 #include "cpu_types.hpp"
 #include "dnnl_helper.h"
 namespace {
 // Selects the vector types used by the quantization kernels for a given
 // scalar type: `load_vec_type` is used to load/store scalar_t data and
 // `cvt_vec_type` is the type the arithmetic is done in. The primary
 // template maps both to void for scalar types without a specialization.
 template <typename scalar_t>
 struct KernelVecType {
   using load_vec_type = void;
   using cvt_vec_type = void;
 };
 template <>
 struct KernelVecType<float> {
   using load_vec_type = vec_op::FP32Vec16;
   using cvt_vec_type = vec_op::FP32Vec16;
 };
 // The BFloat16 specialization is compiled on non-aarch64 targets, or on
 // aarch64 when ARM_BF16_SUPPORT is defined.
 #if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
 template <>
 struct KernelVecType<c10::BFloat16> {
   using load_vec_type = vec_op::BF16Vec16;
   using cvt_vec_type = vec_op::FP32Vec16;
 };
 #endif
 template <>
 struct KernelVecType<c10::Half> {
 #if defined(__powerpc64__) || defined(__s390x__)
   // Power architecture-specific vector type
   using load_vec_type = vec_op::FP32Vec16;
 #else
   // Fallback for other architectures
   using load_vec_type = vec_op::FP16Vec16;
 #endif
   using cvt_vec_type = vec_op::FP32Vec16;
 };
 // Quantizes `input` to int8 with one caller-provided scale (and, when AZP
 // is true, one zero point) shared by all tokens.
 //   input        - num_tokens rows; row i starts at input + i * input_stride
 //   output       - num_tokens rows of hidden_size int8 values, densely packed
 //   scale        - pointer to the single scale; values are multiplied by
 //                  1 / *scale
 //   azp          - pointer to the single zero point; read only when AZP
 // Each value is scaled, optionally offset by the zero point, clamped to the
 // int8 range and converted through vec_op::INT8Vec16. Tokens are processed
 // in parallel with OpenMP.
 template <bool AZP, typename scalar_t>
 void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    const float* scale, const int32_t* azp,
                                    const int64_t num_tokens,
                                    const int64_t input_stride,
                                    const int64_t hidden_size) {
   using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
   using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
   constexpr int64_t vec_elem_num = load_vec_t::VEC_ELEM_NUM;
   constexpr float i8_min =
       static_cast<float>(std::numeric_limits<int8_t>::min());
   constexpr float i8_max =
       static_cast<float>(std::numeric_limits<int8_t>::max());
   // Loop-invariant broadcast vectors.
   const cvt_vec_t inv_scale(1.0 / *scale);
   const cvt_vec_t i8_min_vec(i8_min);
   const cvt_vec_t i8_max_vec(i8_max);
   cvt_vec_t zp_vec;
   if constexpr (AZP) {
     zp_vec = cvt_vec_t(static_cast<float>(*azp));
   }
 #pragma omp parallel for
   for (int64_t i = 0; i < num_tokens; ++i) {
     int64_t j = 0;
     const scalar_t* input_ptr = input + i * input_stride;
     int8_t* output_ptr = output + i * hidden_size;
     // Full vectors. The strict `<` always leaves the last 1..vec_elem_num
     // elements of the row for the tail code below.
     for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
       load_vec_t elems(input_ptr + j);
       cvt_vec_t elems_fp32(elems);
       elems_fp32 = elems_fp32 * inv_scale;
       if constexpr (AZP) {
         elems_fp32 = elems_fp32 + zp_vec;
       }
       elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
       vec_op::INT8Vec16 elems_int8(elems_fp32);
       elems_int8.save(output_ptr + j);
     }
     // Tail: only hidden_size - j results are stored.
     // NOTE(review): the load below is not given an element count, so it
     // presumably reads a full vector starting at input_ptr + j -- confirm
     // that reading past the end of the row is acceptable.
     load_vec_t elems(input_ptr + j);
     cvt_vec_t elems_fp32(elems);
     elems_fp32 = elems_fp32 * inv_scale;
     if constexpr (AZP) {
       elems_fp32 = elems_fp32 + zp_vec;
     }
     elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
     vec_op::INT8Vec16 elems_int8(elems_fp32);
     elems_int8.save(output_ptr + j, hidden_size - j);
   }
 }
 // Quantizes `input` to int8 with a scale (and, when AZP is true, a zero
 // point) computed separately for every token.
 //   input        - num_tokens rows; row i starts at input + i * input_stride
 //   output       - num_tokens rows of hidden_size int8 values, densely packed
 //   scale        - output, one scale per token (scale[i])
 //   azp          - output, one zero point per token (azp[i]); written only
 //                  when AZP
 // Per token the function makes two passes over the row: the first finds the
 // value range, the second scales, optionally offsets, clamps to the int8
 // range and stores. Tokens are processed in parallel with OpenMP.
 template <bool AZP, typename scalar_t>
 void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                     float* scale, int32_t* azp,
                                     const int64_t num_tokens,
                                     const int64_t input_stride,
                                     const int64_t hidden_size) {
   using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
   using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
   constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
   constexpr float i8_min =
       static_cast<float>(std::numeric_limits<int8_t>::min());
   constexpr float i8_max =
       static_cast<float>(std::numeric_limits<int8_t>::max());
   const cvt_vec_t i8_min_vec(i8_min);
   const cvt_vec_t i8_max_vec(i8_max);
 #pragma omp parallel for
   for (int64_t i = 0; i < num_tokens; ++i) {
     // Pass 1: element-wise running max (and min when AZP) over the row.
     // Without AZP the max is taken over absolute values.
     cvt_vec_t max_value(std::numeric_limits<float>::lowest());
     cvt_vec_t min_value(std::numeric_limits<float>::max());
     {
       int64_t j = 0;
       const scalar_t* input_ptr = input + i * input_stride;
       // Full vectors; the strict `<` leaves the last 1..vec_elem_num
       // elements for the tail handling below.
       for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
         load_vec_t elems(input_ptr + j);
         cvt_vec_t elems_fp32(elems);
         if constexpr (AZP) {
           max_value = max_value.max(elems_fp32);
           min_value = min_value.min(elems_fp32);
         } else {
           max_value = max_value.max(elems_fp32.abs());
         }
       }
       load_vec_t elems(input_ptr + j);
       cvt_vec_t elems_fp32(elems);
       if (j + vec_elem_num == hidden_size) {
         // The tail is exactly one full vector.
         if constexpr (AZP) {
           max_value = max_value.max(elems_fp32);
           min_value = min_value.min(elems_fp32);
         } else {
           max_value = max_value.max(elems_fp32.abs());
         }
       } else {
         // Partial tail: only the first hidden_size - j lanes take part.
         if constexpr (AZP) {
           max_value = max_value.max(elems_fp32, hidden_size - j);
           min_value = min_value.min(elems_fp32, hidden_size - j);
         } else {
           max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
         }
       }
     }
     // Derive the per-token scale (and zero point) from the reduced range.
     // NOTE(review): scale_val is 0 when max == min (AZP) or when the row is
     // all zeros (non-AZP); the divisions below are not guarded -- confirm
     // callers never pass such rows.
     float scale_val;
     float azp_val = 0.0f;
     if constexpr (AZP) {
       float max_scalar = max_value.reduce_max();
       float min_scalar = min_value.reduce_min();
       scale_val = (max_scalar - min_scalar) / 255.0f;
       azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
       azp[i] = azp_val;
       scale[i] = scale_val;
     } else {
       scale_val = max_value.reduce_max() / 127.0f;
       scale[i] = scale_val;
     }
     const cvt_vec_t inv_scale(1.0 / scale_val);
     const cvt_vec_t azp_vec(azp_val);
     // Pass 2: quantize the row with the values computed above.
     {
       int64_t j = 0;
       const scalar_t* input_ptr = input + i * input_stride;
       int8_t* output_ptr = output + i * hidden_size;
       for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
         load_vec_t elems(input_ptr + j);
         cvt_vec_t elems_fp32(elems);
         elems_fp32 = (elems_fp32 * inv_scale);
         if constexpr (AZP) {
           elems_fp32 = elems_fp32 + azp_vec;
         }
         elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
         vec_op::INT8Vec16 elems_int8(elems_fp32);
         elems_int8.save(output_ptr + j);
       }
       // Tail: only hidden_size - j results are stored.
       load_vec_t elems(input_ptr + j);
       cvt_vec_t elems_fp32(elems);
       elems_fp32 = (elems_fp32 * inv_scale);
       if constexpr (AZP) {
         elems_fp32 = elems_fp32 + azp_vec;
       }
       elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
       vec_op::INT8Vec16 elems_int8(elems_fp32);
       elems_int8.save(output_ptr + j, hidden_size - j);
     }
   }
 }
 // Post-GEMM epilogue for dynamic per-token activation quantization.
 //
 // For every token row i and output column j this computes, in fp32,
 //   input[i][j] * a_scale[i]
 //     - (AZP  ? azp_adj[j] * (a_scale[i] * azp[i]) : 0)
 //     + (Bias ? bias[j] : 0)
 // and stores the result to output[i][j] converted to scalar_t.
 //
 // Template parameters:
 //   AZP  - apply the zero-point correction; azp and azp_adj are read only
 //          when this is true.
 //   Bias - add the bias; bias is read only when this is true.
 // Parameters:
 //   input       - fp32 source, num_tokens rows of hidden_size elements.
 //   output      - destination, same row layout as input.
 //   a_scale     - indexed by token row.
 //   azp         - indexed by token row.
 //   azp_adj     - indexed by output column.
 //   bias        - indexed by output column.
 //   num_tokens  - number of rows.
 //   hidden_size - number of columns per row.
 //
 // Work is split across OpenMP threads by token when there are more tokens
 // than threads, and by column range otherwise (presumably so that small
 // batches still use every thread).
 //
 // NOTE(review): the final vector of each row/range is loaded at full width
 // and only the remaining elements are stored; confirm that reading past the
 // logical end of input/azp_adj/bias is acceptable for the callers' buffers.
 template <bool AZP, bool Bias, typename scalar_t>
 void dynamic_quant_epilogue(const float* input, scalar_t* output,
                             const float* a_scale, const int32_t* azp,
                             const float* azp_adj, const scalar_t* bias,
                             const int64_t num_tokens,
                             const int64_t hidden_size) {
  CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  const int64_t thread_num = omp_get_max_threads();
  if (num_tokens > thread_num) {
    // Enough rows to give every thread whole tokens.
 #pragma omp parallel for
    for (int64_t i = 0; i < num_tokens; ++i) {
      const float* input_ptr = input + i * hidden_size;
      scalar_t* output_ptr = output + i * hidden_size;
      int64_t j = 0;
      cvt_vec_t token_scale_vec(a_scale[i]);
      cvt_vec_t token_zp_scale_vec;
      if constexpr (AZP) {
        float zp_scale_val = a_scale[i] * static_cast<float>(azp[i]);
        token_zp_scale_vec = cvt_vec_t(zp_scale_val);
      }
      // Full vectors. The step must be vec_elem_num: advancing by one element
      // would recompute and re-store every vector vec_elem_num times.
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        cvt_vec_t elems_fp32(input_ptr + j);
        elems_fp32 = elems_fp32 * token_scale_vec;
        if constexpr (AZP) {
          cvt_vec_t azp_adj_fp32(azp_adj + j);
          elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
        }
        if constexpr (Bias) {
          load_vec_t bias_vec(bias + j);
          cvt_vec_t bias_vec_fp32(bias_vec);
          elems_fp32 = elems_fp32 + bias_vec_fp32;
        }
        load_vec_t elems_out(elems_fp32);
        elems_out.save(output_ptr + j);
      }
      // Last vector of the row: only hidden_size - j elements are stored.
      cvt_vec_t elems_fp32(input_ptr + j);
      elems_fp32 = elems_fp32 * token_scale_vec;
      if constexpr (AZP) {
        cvt_vec_t azp_adj_fp32(azp_adj + j);
        elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
      }
      if constexpr (Bias) {
        load_vec_t bias_vec(bias + j);
        cvt_vec_t bias_vec_fp32(bias_vec);
        elems_fp32 = elems_fp32 + bias_vec_fp32;
      }
      load_vec_t elems_out(elems_fp32);
      elems_out.save(output_ptr + j, hidden_size - j);
    }
  } else {
    // Few rows: give each thread a contiguous, vector-aligned column range
    // [start, end) and let it walk every token over that range.
    const int64_t vec_iteration =
        (hidden_size + vec_elem_num - 1) / vec_elem_num;
    const int64_t vec_iteration_per_thread =
        (vec_iteration + thread_num - 1) / thread_num;
    const int64_t elem_num_per_thread = vec_iteration_per_thread * vec_elem_num;
 #pragma omp parallel for schedule(static, 1)
    for (int64_t i = 0; i < thread_num; ++i) {
      const int64_t start = elem_num_per_thread * i;
      const int64_t end = std::min(hidden_size, elem_num_per_thread + start);
      for (int64_t j = 0; j < num_tokens; ++j) {
        cvt_vec_t token_scale_vec(a_scale[j]);
        cvt_vec_t token_zp_scale_vec;
        if constexpr (AZP) {
          float zp_scale_val = a_scale[j] * static_cast<float>(azp[j]);
          token_zp_scale_vec = cvt_vec_t(zp_scale_val);
        }
        int64_t k = start;
        const float* input_ptr = input + j * hidden_size;
        scalar_t* output_ptr = output + j * hidden_size;
        for (; k < end - vec_elem_num; k += vec_elem_num) {
          cvt_vec_t elems_fp32(input_ptr + k);
          elems_fp32 = elems_fp32 * token_scale_vec;
          if constexpr (AZP) {
            cvt_vec_t azp_adj_fp32(azp_adj + k);
            elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
          }
          if constexpr (Bias) {
            load_vec_t bias_vec(bias + k);
            cvt_vec_t bias_vec_fp32(bias_vec);
            elems_fp32 = elems_fp32 + bias_vec_fp32;
          }
          load_vec_t elems_out(elems_fp32);
          elems_out.save(output_ptr + k);
        }
        // A thread whose range is empty (start >= end) skips the tail too.
        if (k < end) {
          cvt_vec_t elems_fp32(input_ptr + k);
          elems_fp32 = elems_fp32 * token_scale_vec;
          if constexpr (AZP) {
            cvt_vec_t azp_adj_fp32(azp_adj + k);
            elems_fp32 = elems_fp32 - azp_adj_fp32 * token_zp_scale_vec;
          }
          if constexpr (Bias) {
            load_vec_t bias_vec(bias + k);
            cvt_vec_t bias_vec_fp32(bias_vec);
            elems_fp32 = elems_fp32 + bias_vec_fp32;
          }
          load_vec_t elems_out(elems_fp32);
          elems_out.save(output_ptr + k, end - k);
        }
      }
    }
  }
 }
 }  // namespace
 // Builds a W8A8MatMulPrimitiveHandler for the int8 weight matrix b and
 // returns the heap-allocated handler as an opaque integer.
 //
 // b must be 2-D with unit stride along dim 0; b_scales must be contiguous
 // and hold either one element (per-tensor) or b.size(1) elements
 // (per-output-channel). With dynamic_act_quant the handler is configured for
 // per-token activations, no activation zero point and an fp32 output type;
 // otherwise for per-tensor activations, use_azp, and output_type.
 //
 // NOTE(review): nothing visible here frees the returned handler; confirm
 // who owns and releases it.
 int64_t create_onednn_scaled_mm_handler(
    const torch::Tensor& b,         // [IC, OC], column-major
    const torch::Tensor& b_scales,  // [1] or [OC]
    at::ScalarType output_type, bool dynamic_act_quant, bool use_azp,
    int64_t primitive_cache_size) {
  using Handler = W8A8MatMulPrimitiveHandler;
  using Strategy = Handler::QuantizationStrategy;
  TORCH_CHECK(b.dim() == 2);
  TORCH_CHECK(b.stride(0) == 1);  // Column-major
  TORCH_CHECK(b_scales.is_contiguous());
  Handler::Args args;
  args.primitive_cache_size = primitive_cache_size;
  // Weight scales: a single value means per-tensor, otherwise one per column.
  const bool single_weight_scale = (b_scales.numel() == 1);
  if (!single_weight_scale) {
    TORCH_CHECK_EQ(b_scales.numel(), b.size(1));
  }
  args.b_quantization_strategy = single_weight_scale
                                     ? Strategy::PER_TENSOR
                                     : Strategy::PER_OUTPUT_CHANNEL;
  args.b_scales_ptr = b_scales.data_ptr<float>();
  // Weight geometry.
  args.b_k_size = b.size(0);
  args.b_k_stride = b.stride(0);
  args.b_n_size = b.size(1);
  args.b_n_stride = b.stride(1);
  args.b_ptr = b.data_ptr<int8_t>();
  // Activations: dynamic per-token (bias, A scales and A zero points are
  // applied outside the primitive) or static per-tensor.
  args.a_quantization_strategy =
      dynamic_act_quant ? Strategy::PER_TOKEN : Strategy::PER_TENSOR;
  args.use_a_zero_point = !dynamic_act_quant && use_azp;
  VLLM_DISPATCH_FLOATING_TYPES(output_type, "create_onednn_scaled_mm_handler",
                               [&] {
                                 if (!dynamic_act_quant) {
                                   args.c_type = get_dnnl_type<scalar_t>();
                                 } else {
                                   args.c_type = get_dnnl_type<float>();
                                 }
                               });
  Handler* created = new Handler(args);
  return reinterpret_cast<int64_t>(created);
 }
 // Runs an int8 matmul through the W8A8MatMulPrimitiveHandler referenced by
 // handler and writes the result to c.
 //
 // Two paths, selected by the handler's input scale strategy:
 //   PER_TENSOR - scales, zero points and bias are handed to the primitive,
 //                which writes c directly.
 //   PER_TOKEN  - the primitive writes an fp32 temporary; a_scales, azp,
 //                azp_adj and bias are then applied by dynamic_quant_epilogue.
 // Any other strategy raises via TORCH_CHECK.
 //
 // In the PER_TOKEN path azp_adj must be provided whenever azp is provided.
 //
 // NOTE(review): dynamic_quant_epilogue indexes azp_adj by output column, so
 // the "[M] or [1]" shape annotation on azp_adj below looks stale; confirm.
 void onednn_scaled_mm(
    torch::Tensor& c,                             // [M, OC], row-major
    const torch::Tensor& a,                       // [M, IC], row-major
    const torch::Tensor& a_scales,                // [M] or [1]
    const std::optional<torch::Tensor>& azp,      // [M] or [1]
    const std::optional<torch::Tensor>& azp_adj,  // [M] or [1]
    const std::optional<torch::Tensor>& bias,     // [N]
    int64_t handler) {
  CPU_KERNEL_GUARD_IN(onednn_scaled_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.is_contiguous());
  TORCH_CHECK(c.is_contiguous());
  W8A8MatMulPrimitiveHandler* ptr =
      reinterpret_cast<W8A8MatMulPrimitiveHandler*>(handler);
  const int32_t* azp_ptr = nullptr;
  if (azp.has_value()) {
    azp_ptr = azp->data_ptr<int32_t>();
  }
  if (ptr->get_input_scale_strategy() ==
      W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) {
    TORCH_CHECK_EQ(a_scales.numel(), 1);
  }
  // Defaults: no bias, no activation scales/zero points inside the primitive.
  W8A8MatMulPrimitiveHandler::ExecArgs exec_args;
  exec_args.a_ptr = a.data_ptr<int8_t>();
  exec_args.a_m_size = a.size(0);
  exec_args.bias_ptr = nullptr;
  exec_args.bias_type = get_dnnl_type<void>();
  exec_args.use_bias = false;
  exec_args.a_scales_ptr = nullptr;
  exec_args.a_zero_points_ptr = nullptr;
  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "onednn_scaled_mm", [&] {
    if (ptr->get_input_scale_strategy() ==
        W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TENSOR) {
      if (bias.has_value()) {
        exec_args.bias_ptr = bias->data_ptr<scalar_t>();
        exec_args.bias_type = get_dnnl_type<scalar_t>();
        exec_args.use_bias = true;
      }
      exec_args.a_scales_ptr = a_scales.data_ptr<float>();
      exec_args.a_zero_points_ptr = azp_ptr;
      exec_args.c_ptr = c.data_ptr<scalar_t>();
      ptr->execute(exec_args);
    } else if (ptr->get_input_scale_strategy() ==
               W8A8MatMulPrimitiveHandler::QuantizationStrategy::PER_TOKEN) {
      // The epilogue dereferences azp_adj whenever azp is set; reject the
      // inconsistent combination instead of dereferencing an empty optional.
      TORCH_CHECK(!azp.has_value() || azp_adj.has_value(),
                  "azp_adj is required when azp is provided.");
      torch::Tensor tmp_fp32_out =
          torch::empty_like(c, ::at::ScalarType::Float);
      exec_args.c_ptr = tmp_fp32_out.data_ptr<float>();
      ptr->execute(exec_args);
      if (bias.has_value()) {
        if (azp.has_value()) {
          dynamic_quant_epilogue<true, true>(
              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
              a_scales.data_ptr<float>(), azp_ptr, azp_adj->data_ptr<float>(),
              bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
        } else {
          dynamic_quant_epilogue<false, true>(
              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
              a_scales.data_ptr<float>(), azp_ptr, nullptr,
              bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
        }
      } else {
        if (azp.has_value()) {
          dynamic_quant_epilogue<true, false>(
              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
              a_scales.data_ptr<float>(), azp_ptr, azp_adj->data_ptr<float>(),
              static_cast<scalar_t*>(nullptr), c.size(0), c.size(1));
        } else {
          dynamic_quant_epilogue<false, false>(
              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
              a_scales.data_ptr<float>(), azp_ptr, nullptr,
              static_cast<scalar_t*>(nullptr), c.size(0), c.size(1));
        }
      }
    } else {
      TORCH_CHECK(false, "invalid act quant type.");
    }
  });
 }
 // Static per-tensor int8 quantization entry point.
 // Validates the tensors, then dispatches on the input dtype to
 // static_scaled_int8_quant_impl. The AZP template flag is chosen by whether
 // an azp tensor was supplied; scale (and azp, if given) must hold exactly one
 // element.
 void static_scaled_int8_quant(
    torch::Tensor& out,          // [batch, hidden_size]
    const torch::Tensor& input,  // [batch, hidden_size]
    const torch::Tensor& scale, std::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK_EQ(input.dim(), 2);
  TORCH_CHECK_EQ(input.stride(1), 1);
  TORCH_CHECK(scale.numel() == 1);
  TORCH_CHECK(!azp.has_value() || azp->numel() == 1);
  // Rows may be strided; elements within a row are contiguous (checked above).
  const int64_t num_tokens = input.size(0);
  const int64_t hidden_size = input.size(1);
  const int64_t stride = input.stride(0);
  const bool has_azp = azp.has_value();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
        const scalar_t* src = input.data_ptr<scalar_t>();
        int8_t* dst = out.data_ptr<int8_t>();
        const float* scale_ptr = scale.data_ptr<float>();
        if (!has_azp) {
          static_scaled_int8_quant_impl<false>(src, dst, scale_ptr, nullptr,
                                               num_tokens, stride, hidden_size);
        } else {
          static_scaled_int8_quant_impl<true>(src, dst, scale_ptr,
                                              azp->data_ptr<int32_t>(),
                                              num_tokens, stride, hidden_size);
        }
      });
 }
 // Dynamic per-token int8 quantization entry point.
 // Validates the tensors, then dispatches on the input dtype to
 // dynamic_scaled_int8_quant_impl, which writes the quantized values to out
 // and the per-token scales to scale (and zero points to azp when supplied).
 void dynamic_scaled_int8_quant(
    torch::Tensor& out,          // [batch, hidden_size]
    const torch::Tensor& input,  // [batch, hidden_size]
    torch::Tensor& scale,        // [batch, 1]
    std::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK_EQ(input.dim(), 2);
  TORCH_CHECK_EQ(input.stride(1), 1);
  // Rows may be strided; elements within a row are contiguous (checked above).
  const int64_t num_tokens = input.size(0);
  const int64_t hidden_size = input.size(1);
  const int64_t stride = input.stride(0);
  const bool has_azp = azp.has_value();
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
        const scalar_t* src = input.data_ptr<scalar_t>();
        int8_t* dst = out.data_ptr<int8_t>();
        float* scale_ptr = scale.data_ptr<float>();
        if (!has_azp) {
          dynamic_scaled_int8_quant_impl<false>(
              src, dst, scale_ptr, nullptr, num_tokens, stride, hidden_size);
        } else {
          dynamic_scaled_int8_quant_impl<true>(
              src, dst, scale_ptr, azp->data_ptr<int32_t>(), num_tokens,
              stride, hidden_size);
        }
      });
 }
 // Builds a MatMulPrimitiveHandler for the 2-D floating-point weight matrix b
 // and returns the heap-allocated handler as an opaque integer. The input and
 // output data types are both taken from b's dtype.
 //
 // NOTE(review): nothing visible here frees the returned handler; confirm
 // who owns and releases it.
 int64_t create_onednn_mm_handler(const torch::Tensor& b,
                                  int64_t primitive_cache_size) {
  TORCH_CHECK(b.dim() == 2);
  using Handler = MatMulPrimitiveHandler;
  Handler::Args args;
  args.primitive_cache_size = primitive_cache_size;
  // Weight geometry: K is dim 0, N is dim 1.
  args.b_k_size = b.size(0);
  args.b_n_size = b.size(1);
  args.b_k_stride = b.stride(0);
  args.b_n_stride = b.stride(1);
  args.b_ptr = b.data_ptr();
  VLLM_DISPATCH_FLOATING_TYPES(b.scalar_type(), "create_onednn_mm_handler",
                               [&] {
                                 const auto dtype = get_dnnl_type<scalar_t>();
                                 args.c_type = dtype;
                                 args.ab_type = dtype;
                               });
  Handler* created = new Handler(args);
  return reinterpret_cast<int64_t>(created);
 }
 // Executes a floating-point matmul (with optional bias) through the
 // MatMulPrimitiveHandler referenced by handler and writes the result to c.
 // a must be 2-D with unit stride along its last dim; c must be contiguous.
 // The element type used for a, c and bias is a's dtype.
 void onednn_mm(torch::Tensor& c,        // [M, OC], row-major
                const torch::Tensor& a,  // [M, IC], row-major
                const std::optional<torch::Tensor>& bias, int64_t handler) {
  CPU_KERNEL_GUARD_IN(onednn_mm)
  TORCH_CHECK(a.dim() == 2);
  TORCH_CHECK(a.stride(-1) == 1);
  TORCH_CHECK(c.is_contiguous());
  auto* mm_handler = reinterpret_cast<MatMulPrimitiveHandler*>(handler);
  MatMulPrimitiveHandler::ExecArgs exec_args;
  exec_args.a_m_stride = a.stride(0);
  exec_args.a_m_size = a.size(0);
  VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] {
    // Start from the "no bias" configuration and override it when needed.
    exec_args.use_bias = false;
    exec_args.bias_type = get_dnnl_type<void>();
    exec_args.bias_ptr = nullptr;
    if (bias.has_value()) {
      exec_args.bias_ptr = bias->data_ptr<scalar_t>();
      exec_args.bias_type = get_dnnl_type<scalar_t>();
      exec_args.use_bias = true;
    }
    exec_args.a_ptr = a.data_ptr<scalar_t>();
    exec_args.c_ptr = c.data_ptr<scalar_t>();
    mm_handler->execute(exec_args);
  });
 }
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@ -0,0 +1,951 @@
 #include "cpu_types.hpp"
 #include "dnnl_helper.hpp"
 namespace {
 // Maps a scalar element type to the vector types the kernels in this file
 // use. The primary template is the fallback for types without a
 // specialization: every alias is void.
 template <typename scalar_t>
 struct KernelVecType {
  // Type values are converted to for arithmetic.
  using cvt_vec_type = void;
  // Type used to load the int32 zero-point adjustment values.
  using azp_adj_load_vec_type = void;
  // Type used to load and store scalar_t values.
  using load_vec_type = void;
 };
 // float: loaded and computed as FP32Vec16; zero-point adjustments are loaded
 // as INT32Vec16.
 template <>
 struct KernelVecType<float> {
  using cvt_vec_type = vec_op::FP32Vec16;
  using azp_adj_load_vec_type = vec_op::INT32Vec16;
  using load_vec_type = vec_op::FP32Vec16;
 };
 #if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
 // bfloat16: loaded as BF16Vec16 and computed as FP32Vec16. Compiled out on
 // aarch64 builds that do not define ARM_BF16_SUPPORT.
 template <>
 struct KernelVecType<c10::BFloat16> {
  using cvt_vec_type = vec_op::FP32Vec16;
  using azp_adj_load_vec_type = vec_op::INT32Vec16;
  using load_vec_type = vec_op::BF16Vec16;
 };
 #endif
 // half: computed as FP32Vec16. The load type depends on the target
 // architecture (see below).
 template <>
 struct KernelVecType<c10::Half> {
  using cvt_vec_type = vec_op::FP32Vec16;
  using azp_adj_load_vec_type = vec_op::INT32Vec16;
 #if defined(__powerpc64__) || defined(__s390x__)
  // Power / s390x: load through the FP32 vector type.
  using load_vec_type = vec_op::FP32Vec16;
 #else
  // Every other architecture: load through the FP16 vector type.
  using load_vec_type = vec_op::FP16Vec16;
 #endif
 };
 #if defined(__AVX512F__) || defined(__aarch64__)
 // Quantizes a row-major [num_tokens, hidden_size] matrix to int8 using one
 // scale (and, when AZP is true, one zero point) shared by every element:
 //   clamp(input * (1 / *scale) + (AZP ? *azp : 0), int8 min, int8 max)
 // converted to int8 by vec_op::INT8Vec16.
 //
 // azp is dereferenced only when AZP is true. Rows are processed in parallel
 // with OpenMP.
 //
 // NOTE(review): the last vector of each row is loaded at full width and only
 // hidden_size - j elements are stored; confirm that reading past the end of
 // the final row is acceptable for the callers' buffers.
 template <bool AZP, typename scalar_t>
 void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    const float* scale, const int32_t* azp,
                                    const int num_tokens,
                                    const int hidden_size) {
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  constexpr float i8_min =
      static_cast<float>(std::numeric_limits<int8_t>::min());
  constexpr float i8_max =
      static_cast<float>(std::numeric_limits<int8_t>::max());
  const cvt_vec_t inv_scale(1.0 / *scale);
  const cvt_vec_t i8_min_vec(i8_min);
  const cvt_vec_t i8_max_vec(i8_max);
  cvt_vec_t zp_vec;
  if constexpr (AZP) {
    zp_vec = cvt_vec_t(static_cast<float>(*azp));
  }
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Widen before multiplying: i * hidden_size evaluated in int overflows
    // once the matrix holds more than INT_MAX elements.
    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
    const scalar_t* input_ptr = input + row_offset;
    int8_t* output_ptr = output + row_offset;
    int j = 0;
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      load_vec_t elems(input_ptr + j);
      cvt_vec_t elems_fp32(elems);
      elems_fp32 = elems_fp32 * inv_scale;
      if constexpr (AZP) {
        elems_fp32 = elems_fp32 + zp_vec;
      }
      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
      vec_op::INT8Vec16 elems_int8(elems_fp32);
      elems_int8.save(output_ptr + j);
    }
    // Last vector of the row: store only the remaining elements.
    load_vec_t elems(input_ptr + j);
    cvt_vec_t elems_fp32(elems);
    elems_fp32 = elems_fp32 * inv_scale;
    if constexpr (AZP) {
      elems_fp32 = elems_fp32 + zp_vec;
    }
    elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
    vec_op::INT8Vec16 elems_int8(elems_fp32);
    elems_int8.save(output_ptr + j, hidden_size - j);
  }
 }
 // Quantizes a row-major [num_tokens, hidden_size] matrix to int8 with a
 // scale (and, when AZP is true, a zero point) computed per row.
 //
 // Pass 1 reduces each row: min and max when AZP is true, max of absolute
 // values otherwise. From that:
 //   AZP:  scale[i] = (max - min) / 255,
 //         azp[i]   = nearbyint(-128 - min / scale[i])
 //   else: scale[i] = max_abs / 127
 // Pass 2 writes clamp(input / scale[i] + (AZP ? azp[i] : 0), int8 min,
 // int8 max) converted to int8 by vec_op::INT8Vec16.
 //
 // azp is written only when AZP is true. Rows are processed in parallel with
 // OpenMP.
 //
 // NOTE(review): a row whose scale evaluates to 0 (e.g. all zeros) makes
 // 1.0 / scale_val infinite; confirm callers never pass such rows or that the
 // resulting values are acceptable.
 // NOTE(review): the last vector of each row is loaded at full width; confirm
 // that reading past the end of the final row is acceptable.
 template <bool AZP, typename scalar_t>
 void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                     float* scale, int32_t* azp,
                                     const int num_tokens,
                                     const int hidden_size) {
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  constexpr float i8_min =
      static_cast<float>(std::numeric_limits<int8_t>::min());
  constexpr float i8_max =
      static_cast<float>(std::numeric_limits<int8_t>::max());
  const cvt_vec_t i8_min_vec(i8_min);
  const cvt_vec_t i8_max_vec(i8_max);
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Widen before multiplying: i * hidden_size evaluated in int overflows
    // once the matrix holds more than INT_MAX elements.
    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
    const scalar_t* input_ptr = input + row_offset;
    int8_t* output_ptr = output + row_offset;
    cvt_vec_t max_value(std::numeric_limits<float>::lowest());
    cvt_vec_t min_value(std::numeric_limits<float>::max());
    {
      // Pass 1: row-wise reduction.
      int j = 0;
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input_ptr + j);
        cvt_vec_t elems_fp32(elems);
        if constexpr (AZP) {
          max_value = max_value.max(elems_fp32);
          min_value = min_value.min(elems_fp32);
        } else {
          max_value = max_value.max(elems_fp32.abs());
        }
      }
      load_vec_t elems(input_ptr + j);
      cvt_vec_t elems_fp32(elems);
      if (j + vec_elem_num == hidden_size) {
        // The last vector is complete: use the unrestricted reduction.
        if constexpr (AZP) {
          max_value = max_value.max(elems_fp32);
          min_value = min_value.min(elems_fp32);
        } else {
          max_value = max_value.max(elems_fp32.abs());
        }
      } else {
        // Partial last vector: only the first hidden_size - j elements are
        // passed as the element count to max/min.
        if constexpr (AZP) {
          max_value = max_value.max(elems_fp32, hidden_size - j);
          min_value = min_value.min(elems_fp32, hidden_size - j);
        } else {
          max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
        }
      }
    }
    float scale_val = 0.0f;
    // Initialized so that azp_vec below never reads an indeterminate value
    // when AZP is false (azp_vec is unused in that case).
    float azp_val = 0.0f;
    if constexpr (AZP) {
      float max_scalar = max_value.reduce_max();
      float min_scalar = min_value.reduce_min();
      scale_val = (max_scalar - min_scalar) / 255.0f;
      azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
      azp[i] = static_cast<int32_t>(azp_val);
      scale[i] = scale_val;
    } else {
      scale_val = max_value.reduce_max() / 127.0f;
      scale[i] = scale_val;
    }
    const cvt_vec_t inv_scale(1.0 / scale_val);
    const cvt_vec_t azp_vec(azp_val);
    {
      // Pass 2: quantize the row with the values computed above.
      int j = 0;
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input_ptr + j);
        cvt_vec_t elems_fp32(elems);
        elems_fp32 = (elems_fp32 * inv_scale);
        if constexpr (AZP) {
          elems_fp32 = elems_fp32 + azp_vec;
        }
        elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
        vec_op::INT8Vec16 elems_int8(elems_fp32);
        elems_int8.save(output_ptr + j);
      }
      // Last vector of the row: store only the remaining elements.
      load_vec_t elems(input_ptr + j);
      cvt_vec_t elems_fp32(elems);
      elems_fp32 = (elems_fp32 * inv_scale);
      if constexpr (AZP) {
        elems_fp32 = elems_fp32 + azp_vec;
      }
      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
      vec_op::INT8Vec16 elems_int8(elems_fp32);
      elems_int8.save(output_ptr + j, hidden_size - j);
    }
  }
 }
 // Post-GEMM epilogue for static (per-tensor) activation quantization with a
 // zero point. For every row i and column j it computes, in fp32,
 //   input[i][j] - (a_scale * b_scale_j) * azp_with_adj[j]
 // where b_scale_j is b_scale[j] when PerChannel is true and *b_scale
 // otherwise, and stores the result to output[i][j] converted to scalar_t.
 //
 // azp_with_adj is indexed by column and read as int32. Rows are processed in
 // parallel with OpenMP.
 //
 // NOTE(review): the last vector of each row is loaded at full width from
 // input, azp_with_adj and (PerChannel) b_scale; confirm that reading past
 // their logical ends is acceptable for the callers' buffers.
 template <bool PerChannel, typename scalar_t>
 void static_quant_epilogue(const float* input, scalar_t* output,
                            const float a_scale, const float* b_scale,
                            const int32_t* azp_with_adj, const int num_tokens,
                            const int hidden_size) {
  // Guard label now matches this function's name (it previously carried the
  // unrelated label dynamic_output_scale_impl).
  CPU_KERNEL_GUARD_IN(static_quant_epilogue)
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using azp_adj_load_vec_t =
      typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Widen before multiplying: i * hidden_size evaluated in int overflows
    // once the matrix holds more than INT_MAX elements.
    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
    const float* input_ptr = input + row_offset;
    scalar_t* output_ptr = output + row_offset;
    cvt_vec_t a_scale_vec(a_scale);
    cvt_vec_t b_scale_vec(*b_scale);
    cvt_vec_t scale_vec = a_scale_vec * b_scale_vec;
    int j = 0;
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      cvt_vec_t elems_fp32(input_ptr + j);
      azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
      cvt_vec_t azp_adj_fp32(azp_adj_vec);
      if constexpr (PerChannel) {
        // Per-channel weights: refresh the combined scale for these columns.
        b_scale_vec = cvt_vec_t(b_scale + j);
        scale_vec = b_scale_vec * a_scale_vec;
      }
      elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
      load_vec_t elems_out(elems_fp32);
      elems_out.save(output_ptr + j);
    }
    // Last vector of the row: store only the remaining elements.
    cvt_vec_t elems_fp32(input_ptr + j);
    azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
    cvt_vec_t azp_adj_fp32(azp_adj_vec);
    if constexpr (PerChannel) {
      b_scale_vec = cvt_vec_t(b_scale + j);
      scale_vec = b_scale_vec * a_scale_vec;
    }
    elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
    load_vec_t elems_out(elems_fp32);
    elems_out.save(output_ptr + j, hidden_size - j);
  }
 }
 // Post-GEMM epilogue for dynamic per-token activation quantization.
 // For every row i and column j it computes, in fp32,
 //   input[i][j] * a_scale[i]
 //     - (AZP  ? azp_adj[j] * a_scale[i] * azp[i] * b_scale_j : 0)
 //     + (Bias ? bias[j] : 0)
 // where b_scale_j is b_scale[j] when PerChannel is true and *b_scale
 // otherwise, and stores the result to output[i][j] converted to scalar_t.
 //
 // azp, azp_adj and b_scale are read only when AZP is true; bias is read only
 // when Bias is true. a_scale and azp are indexed by row; azp_adj, bias and
 // (PerChannel) b_scale by column. Rows are processed in parallel with OpenMP.
 //
 // NOTE(review): the last vector of each row is loaded at full width; confirm
 // that reading past the logical end of the column-indexed buffers is
 // acceptable for the callers.
 template <bool AZP, bool PerChannel, bool Bias, typename scalar_t>
 void dynamic_quant_epilogue(const float* input, scalar_t* output,
                             const float* a_scale, const float* b_scale,
                             const int32_t* azp, const int32_t* azp_adj,
                             const scalar_t* bias, const int num_tokens,
                             const int hidden_size) {
  CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using azp_adj_load_vec_t =
      typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Widen before multiplying: i * hidden_size evaluated in int overflows
    // once the matrix holds more than INT_MAX elements.
    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
    const float* input_ptr = input + row_offset;
    scalar_t* output_ptr = output + row_offset;
    int j = 0;
    cvt_vec_t token_scale_vec(a_scale[i]);
    cvt_vec_t token_zp_scale_vec;
    if constexpr (AZP) {
      float zp_scale_val = a_scale[i] * static_cast<float>(azp[i]);
      if constexpr (!PerChannel) {
        // Per-tensor weights: fold the single weight scale in once per row.
        zp_scale_val *= *b_scale;
      }
      token_zp_scale_vec = cvt_vec_t(zp_scale_val);
    }
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      cvt_vec_t elems_fp32(input_ptr + j);
      elems_fp32 = elems_fp32 * token_scale_vec;
      if constexpr (AZP) {
        azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
        cvt_vec_t azp_adj_fp32(azp_adj_vec);
        azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
        if constexpr (PerChannel) {
          cvt_vec_t b_scale_vec(b_scale + j);
          azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
        }
        elems_fp32 = elems_fp32 - azp_adj_fp32;
      }
      if constexpr (Bias) {
        load_vec_t bias_vec(bias + j);
        cvt_vec_t bias_vec_fp32(bias_vec);
        elems_fp32 = elems_fp32 + bias_vec_fp32;
      }
      load_vec_t elems_out(elems_fp32);
      elems_out.save(output_ptr + j);
    }
    // Last vector of the row: store only the remaining elements.
    cvt_vec_t elems_fp32(input_ptr + j);
    elems_fp32 = elems_fp32 * token_scale_vec;
    if constexpr (AZP) {
      azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
      cvt_vec_t azp_adj_fp32(azp_adj_vec);
      azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
      if constexpr (PerChannel) {
        cvt_vec_t b_scale_vec(b_scale + j);
        azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
      }
      elems_fp32 = elems_fp32 - azp_adj_fp32;
    }
    if constexpr (Bias) {
      load_vec_t bias_vec(bias + j);
      cvt_vec_t bias_vec_fp32(bias_vec);
      elems_fp32 = elems_fp32 + bias_vec_fp32;
    }
    load_vec_t elems_out(elems_fp32);
    elems_out.save(output_ptr + j, hidden_size - j);
  }
 }
 #elif defined(__powerpc64__)
 // powerpc64 build of the static int8 quantization kernel.
 // Quantizes a row-major [num_tokens, hidden_size] matrix to int8 using one
 // scale (and, when AZP is true, one zero point) shared by every element:
 //   clamp(input * (1 / *scale) + (AZP ? *azp : 0), int8 min, int8 max)
 // converted to int8 by vec_op::INT8Vec16.
 //
 // azp is dereferenced only when AZP is true. Rows are processed in parallel
 // with OpenMP.
 //
 // NOTE(review): the last vector of each row is loaded at full width and only
 // hidden_size - j elements are stored; confirm that reading past the end of
 // the final row is acceptable for the callers' buffers.
 template <bool AZP, typename scalar_t>
 void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    const float* scale, const int32_t* azp,
                                    const int num_tokens,
                                    const int hidden_size) {
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  constexpr float i8_min =
      static_cast<float>(std::numeric_limits<int8_t>::min());
  constexpr float i8_max =
      static_cast<float>(std::numeric_limits<int8_t>::max());
  const cvt_vec_t inv_scale(1.0 / *scale);
  const cvt_vec_t i8_min_vec(i8_min);
  const cvt_vec_t i8_max_vec(i8_max);
  cvt_vec_t zp_vec;
  if constexpr (AZP) {
    zp_vec = cvt_vec_t(static_cast<float>(*azp));
  }
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Widen before multiplying: i * hidden_size evaluated in int overflows
    // once the matrix holds more than INT_MAX elements.
    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
    const scalar_t* input_ptr = input + row_offset;
    int8_t* output_ptr = output + row_offset;
    int j = 0;
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      load_vec_t elems(input_ptr + j);
      cvt_vec_t elems_fp32(elems);
      elems_fp32 = elems_fp32 * inv_scale;
      if constexpr (AZP) {
        elems_fp32 = elems_fp32 + zp_vec;
      }
      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
      vec_op::INT8Vec16 elems_int8(elems_fp32);
      elems_int8.save(output_ptr + j);
    }
    // Last vector of the row: store only the remaining elements.
    load_vec_t elems(input_ptr + j);
    cvt_vec_t elems_fp32(elems);
    elems_fp32 = elems_fp32 * inv_scale;
    if constexpr (AZP) {
      elems_fp32 = elems_fp32 + zp_vec;
    }
    elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
    vec_op::INT8Vec16 elems_int8(elems_fp32);
    elems_int8.save(output_ptr + j, hidden_size - j);
  }
 }
 // powerpc64 build of the dynamic per-token int8 quantization kernel.
 // Quantizes a row-major [num_tokens, hidden_size] matrix to int8 with a
 // scale (and, when AZP is true, a zero point) computed per row.
 //
 // Pass 1 reduces each row: min and max when AZP is true, max of absolute
 // values otherwise. From that:
 //   AZP:  scale[i] = (max - min) / 255,
 //         azp[i]   = nearbyint(-128 - min / scale[i])
 //   else: scale[i] = max_abs / 127
 // Pass 2 writes clamp(input / scale[i] + (AZP ? azp[i] : 0), int8 min,
 // int8 max) converted to int8 by vec_op::INT8Vec16.
 //
 // azp is written only when AZP is true. Rows are processed in parallel with
 // OpenMP.
 //
 // NOTE(review): a row whose scale evaluates to 0 (e.g. all zeros) makes
 // 1.0 / scale_val infinite; confirm callers never pass such rows or that the
 // resulting values are acceptable.
 // NOTE(review): the last vector of each row is loaded at full width; confirm
 // that reading past the end of the final row is acceptable.
 template <bool AZP, typename scalar_t>
 void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                     float* scale, int32_t* azp,
                                     const int num_tokens,
                                     const int hidden_size) {
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  constexpr float i8_min =
      static_cast<float>(std::numeric_limits<int8_t>::min());
  constexpr float i8_max =
      static_cast<float>(std::numeric_limits<int8_t>::max());
  const cvt_vec_t i8_min_vec(i8_min);
  const cvt_vec_t i8_max_vec(i8_max);
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Widen before multiplying: i * hidden_size evaluated in int overflows
    // once the matrix holds more than INT_MAX elements.
    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
    const scalar_t* input_ptr = input + row_offset;
    int8_t* output_ptr = output + row_offset;
    cvt_vec_t max_value(std::numeric_limits<float>::lowest());
    cvt_vec_t min_value(std::numeric_limits<float>::max());
    {
      // Pass 1: row-wise reduction.
      int j = 0;
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input_ptr + j);
        cvt_vec_t elems_fp32(elems);
        if constexpr (AZP) {
          max_value = max_value.max(elems_fp32);
          min_value = min_value.min(elems_fp32);
        } else {
          max_value = max_value.max(elems_fp32.abs());
        }
      }
      load_vec_t elems(input_ptr + j);
      cvt_vec_t elems_fp32(elems);
      if (j + vec_elem_num == hidden_size) {
        // The last vector is complete: use the unrestricted reduction.
        if constexpr (AZP) {
          max_value = max_value.max(elems_fp32);
          min_value = min_value.min(elems_fp32);
        } else {
          max_value = max_value.max(elems_fp32.abs());
        }
      } else {
        // Partial last vector: only the first hidden_size - j elements are
        // passed as the element count to max/min.
        if constexpr (AZP) {
          max_value = max_value.max(elems_fp32, hidden_size - j);
          min_value = min_value.min(elems_fp32, hidden_size - j);
        } else {
          max_value = max_value.max(elems_fp32.abs(), hidden_size - j);
        }
      }
    }
    float scale_val = 0.0f;
    // Initialized so that azp_vec below never reads an indeterminate value
    // when AZP is false (azp_vec is unused in that case).
    float azp_val = 0.0f;
    if constexpr (AZP) {
      float max_scalar = max_value.reduce_max();
      float min_scalar = min_value.reduce_min();
      scale_val = (max_scalar - min_scalar) / 255.0f;
      azp_val = std::nearbyint(-128.0f - min_scalar / scale_val);
      azp[i] = static_cast<int32_t>(azp_val);
      scale[i] = scale_val;
    } else {
      scale_val = max_value.reduce_max() / 127.0f;
      scale[i] = scale_val;
    }
    const cvt_vec_t inv_scale(1.0 / scale_val);
    const cvt_vec_t azp_vec(azp_val);
    {
      // Pass 2: quantize the row with the values computed above.
      int j = 0;
      for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
        load_vec_t elems(input_ptr + j);
        cvt_vec_t elems_fp32(elems);
        elems_fp32 = (elems_fp32 * inv_scale);
        if constexpr (AZP) {
          elems_fp32 = elems_fp32 + azp_vec;
        }
        elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
        vec_op::INT8Vec16 elems_int8(elems_fp32);
        elems_int8.save(output_ptr + j);
      }
      // Last vector of the row: store only the remaining elements.
      load_vec_t elems(input_ptr + j);
      cvt_vec_t elems_fp32(elems);
      elems_fp32 = (elems_fp32 * inv_scale);
      if constexpr (AZP) {
        elems_fp32 = elems_fp32 + azp_vec;
      }
      elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec);
      vec_op::INT8Vec16 elems_int8(elems_fp32);
      elems_int8.save(output_ptr + j, hidden_size - j);
    }
  }
 }
 // powerpc64 build of the static-quantization epilogue.
 // For every row i and column j it computes, in fp32,
 //   input[i][j] - (a_scale * b_scale_j) * azp_with_adj[j]
 // where b_scale_j is b_scale[j] when PerChannel is true and *b_scale
 // otherwise, and stores the result to output[i][j] converted to scalar_t.
 //
 // azp_with_adj is indexed by column and read as int32. Rows are processed in
 // parallel with OpenMP.
 //
 // NOTE(review): the last vector of each row is loaded at full width from
 // input, azp_with_adj and (PerChannel) b_scale; confirm that reading past
 // their logical ends is acceptable for the callers' buffers.
 template <bool PerChannel, typename scalar_t>
 void static_quant_epilogue(const float* input, scalar_t* output,
                            const float a_scale, const float* b_scale,
                            const int32_t* azp_with_adj, const int num_tokens,
                            const int hidden_size) {
  // Guard label now matches this function's name (it previously carried the
  // unrelated label dynamic_output_scale_impl).
  CPU_KERNEL_GUARD_IN(static_quant_epilogue)
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using azp_adj_load_vec_t =
      typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Widen before multiplying: i * hidden_size evaluated in int overflows
    // once the matrix holds more than INT_MAX elements.
    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
    const float* input_ptr = input + row_offset;
    scalar_t* output_ptr = output + row_offset;
    cvt_vec_t a_scale_vec(a_scale);
    cvt_vec_t b_scale_vec(*b_scale);
    cvt_vec_t scale_vec = a_scale_vec * b_scale_vec;
    int j = 0;
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      cvt_vec_t elems_fp32(input_ptr + j);
      azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
      cvt_vec_t azp_adj_fp32(azp_adj_vec);
      if constexpr (PerChannel) {
        // Per-channel weights: refresh the combined scale for these columns.
        b_scale_vec = cvt_vec_t(b_scale + j);
        scale_vec = b_scale_vec * a_scale_vec;
      }
      elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
      load_vec_t elems_out(elems_fp32);
      elems_out.save(output_ptr + j);
    }
    // Last vector of the row: store only the remaining elements.
    cvt_vec_t elems_fp32(input_ptr + j);
    azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j);
    cvt_vec_t azp_adj_fp32(azp_adj_vec);
    if constexpr (PerChannel) {
      b_scale_vec = cvt_vec_t(b_scale + j);
      scale_vec = b_scale_vec * a_scale_vec;
    }
    elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32;
    load_vec_t elems_out(elems_fp32);
    elems_out.save(output_ptr + j, hidden_size - j);
  }
 }
 // powerpc64 build of the dynamic per-token quantization epilogue.
 // For every row i and column j it computes, in fp32,
 //   input[i][j] * a_scale[i]
 //     - (AZP  ? azp_adj[j] * a_scale[i] * azp[i] * b_scale_j : 0)
 //     + (Bias ? bias[j] : 0)
 // where b_scale_j is b_scale[j] when PerChannel is true and *b_scale
 // otherwise, and stores the result to output[i][j] converted to scalar_t.
 //
 // azp, azp_adj and b_scale are read only when AZP is true; bias is read only
 // when Bias is true. a_scale and azp are indexed by row; azp_adj, bias and
 // (PerChannel) b_scale by column. Rows are processed in parallel with OpenMP.
 //
 // NOTE(review): the last vector of each row is loaded at full width; confirm
 // that reading past the logical end of the column-indexed buffers is
 // acceptable for the callers.
 template <bool AZP, bool PerChannel, bool Bias, typename scalar_t>
 void dynamic_quant_epilogue(const float* input, scalar_t* output,
                             const float* a_scale, const float* b_scale,
                             const int32_t* azp, const int32_t* azp_adj,
                             const scalar_t* bias, const int num_tokens,
                             const int hidden_size) {
  CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue)
  using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
  using azp_adj_load_vec_t =
      typename KernelVecType<scalar_t>::azp_adj_load_vec_type;
  using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
  constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
  #pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    // Widen before multiplying: i * hidden_size evaluated in int overflows
    // once the matrix holds more than INT_MAX elements.
    const int64_t row_offset = static_cast<int64_t>(i) * hidden_size;
    const float* input_ptr = input + row_offset;
    scalar_t* output_ptr = output + row_offset;
    int j = 0;
    cvt_vec_t token_scale_vec(a_scale[i]);
    cvt_vec_t token_zp_scale_vec;
    if constexpr (AZP) {
      float zp_scale_val = a_scale[i] * static_cast<float>(azp[i]);
      if constexpr (!PerChannel) {
        // Per-tensor weights: fold the single weight scale in once per row.
        zp_scale_val *= *b_scale;
      }
      token_zp_scale_vec = cvt_vec_t(zp_scale_val);
    }
    for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
      cvt_vec_t elems_fp32(input_ptr + j);
      elems_fp32 = elems_fp32 * token_scale_vec;
      if constexpr (AZP) {
        azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
        cvt_vec_t azp_adj_fp32(azp_adj_vec);
        azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
        if constexpr (PerChannel) {
          cvt_vec_t b_scale_vec(b_scale + j);
          azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
        }
        elems_fp32 = elems_fp32 - azp_adj_fp32;
      }
      if constexpr (Bias) {
        load_vec_t bias_vec(bias + j);
        cvt_vec_t bias_vec_fp32(bias_vec);
        elems_fp32 = elems_fp32 + bias_vec_fp32;
      }
      load_vec_t elems_out(elems_fp32);
      elems_out.save(output_ptr + j);
    }
    // Last vector of the row: store only the remaining elements.
    cvt_vec_t elems_fp32(input_ptr + j);
    elems_fp32 = elems_fp32 * token_scale_vec;
    if constexpr (AZP) {
      azp_adj_load_vec_t azp_adj_vec(azp_adj + j);
      cvt_vec_t azp_adj_fp32(azp_adj_vec);
      azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec;
      if constexpr (PerChannel) {
        cvt_vec_t b_scale_vec(b_scale + j);
        azp_adj_fp32 = azp_adj_fp32 * b_scale_vec;
      }
      elems_fp32 = elems_fp32 - azp_adj_fp32;
    }
    if constexpr (Bias) {
      load_vec_t bias_vec(bias + j);
      cvt_vec_t bias_vec_fp32(bias_vec);
      elems_fp32 = elems_fp32 + bias_vec_fp32;
    }
    load_vec_t elems_out(elems_fp32);
    elems_out.save(output_ptr + j, hidden_size - j);
  }
 }
 #else
 // Stub compiled in the #else branch of this file's SIMD guard: it always
 // raises. The template parameter list starts with the bool AZP flag so that
 // the static_scaled_int8_quant_impl<true>/<false> instantiations made by
 // static_scaled_int8_quant in this file resolve to it.
 template <bool AZP, typename scalar_t>
 void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                   const float* scale, const int32_t* azp,
                                   const int num_tokens,
                                   const int hidden_size) {
  TORCH_CHECK(false,
              "static_scaled_int8_quant_impl requires AVX512/powerpc64/AArch64 "
              "support.");
 }
 // Stub compiled in the #else branch of this file's SIMD guard: it always
 // raises. The template parameter list starts with the bool AZP flag so that
 // the dynamic_scaled_int8_quant_impl<true>/<false> instantiations made by
 // dynamic_scaled_int8_quant in this file resolve to it.
 template <bool AZP, typename scalar_t>
 void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
                                    float* scale, int32_t* azp,
                                    const int num_tokens,
                                    const int hidden_size) {
  TORCH_CHECK(false,
              "dynamic_scaled_int8_quant_impl requires "
              "AVX512/powerpc64/AArch64 support.");
 }
 // Stub compiled in the #else branch of this file's SIMD guard. It accepts the
 // same arguments the callers pass to the vectorized epilogue, ignores them
 // all and unconditionally raises.
 template <bool PerChannel, typename scalar_t>
 void static_quant_epilogue(const float* input, scalar_t* output,
                           const float a_scale, const float* b_scale,
                           const int32_t* azp_with_adj, const int num_tokens,
                           const int hidden_size) {
  TORCH_CHECK(false,
              "static_quant_epilogue requires "
              "AVX512/powerpc64/AArch64 support.");
 }
 // Stub compiled in the #else branch of this file's SIMD guard: it always
 // raises. The template parameters mirror the vectorized kernel
 // (AZP, PerChannel, Bias, scalar_t) so that calls such as
 // dynamic_quant_epilogue<false, true, true>(...) made by the GEMM wrappers
 // in this file resolve to it.
 template <bool AZP, bool PerChannel, bool Bias, typename scalar_t>
 void dynamic_quant_epilogue(const float* input, scalar_t* output,
                            const float* a_scale, const float* b_scale,
                            const int32_t* azp, const int32_t* azp_adj,
                            const scalar_t* bias, const int num_tokens,
                            const int hidden_size) {
  TORCH_CHECK(
      false,
      "dynamic_quant_epilogue requires AVX512/powerpc64/AArch64 support.");
 }
 #endif
 }  // namespace
 // Symmetric W8A8 GEMM: writes C = s_a * s_b * (A @ B) (+ bias) into `c`.
 // a_scales holds either one value or one value per row of `a`; b_scales
 // holds either one value or one value per column of `b`.
 void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
                    const torch::Tensor& a,         // [M, IC], row-major
                    const torch::Tensor& b,         // [IC, OC], column-major
                    const torch::Tensor& a_scales,  // [1] or [M]
                    const torch::Tensor& b_scales,  // [1] or [OC]
                    const std::optional<torch::Tensor>& bias  // [OC]
 ) {
  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
  // Operand dtypes, ranks and GEMM shape agreement.
  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
              "int8_scaled_mm only supports INT8 inputs.");
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
              b.size(1) == c.size(1));
  // Scales are either a single value or one per row / column.
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
  // Memory layout: a and c row-major, b column-major, leading strides of c
  // and b multiples of 16.
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
              b.stride(1) % 16 == 0);  // 16 Byte Alignment
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
  if (bias.has_value()) {
    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
                bias->dim() == 1);
  }
  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] {
    const int64_t m = a.size(0);
    const int64_t n = b.size(1);
    const int64_t k = a.size(1);

    if (a_scales.numel() == 1) {
      // Single activation scale: oneDNN applies both scales (and the bias)
      // itself and writes straight into c.
      if (bias.has_value()) {
        // C = s_a * s_b * (A@B) + bias
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
            bias->data_ptr<scalar_t>(), m, n, k, a_scales.data_ptr<float>(),
            b_scales.data_ptr<float>(), a_scales.numel(), b_scales.numel());
      } else {
        // C = s_a * s_b * (A@B)
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit<scalar_t, void>(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
            nullptr, m, n, k, a_scales.data_ptr<float>(),
            b_scales.data_ptr<float>(), a_scales.numel(), b_scales.numel());
      }
      return;
    }

    // Per-token activation scales. oneDNN's GEMM code generation only covers
    // per-tensor or per-output-channel weight quantization, so the row scale
    // cannot be fused into the GEMM. Instead C_inter = s_b * (A@B) goes to an
    // fp32 scratch tensor and a separate epilogue finishes
    // C = s_a * C_inter (+ bias). Fusing this step via the oneDNN JIT would
    // keep the intermediate in registers/L1, but that is not available yet.
    torch::Tensor scratch_fp32 = torch::empty_like(c, ::at::ScalarType::Float);
    DNNLPrimitiveHelper<true>::gemm_s8s8_jit<float, void>(
        a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
        scratch_fp32.data_ptr<float>(), nullptr, m, n, k, nullptr,
        b_scales.data_ptr<float>(), 0, b_scales.numel());
    if (bias.has_value()) {
      // C = s_a * C_inter + bias
      dynamic_quant_epilogue<false, true, true>(
          scratch_fp32.data_ptr<float>(), c.data_ptr<scalar_t>(),
          a_scales.data_ptr<float>(), nullptr, nullptr, nullptr,
          bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
    } else {
      // C = s_a * C_inter
      dynamic_quant_epilogue<false, true, false, scalar_t>(
          scratch_fp32.data_ptr<float>(), c.data_ptr<scalar_t>(),
          a_scales.data_ptr<float>(), nullptr, nullptr, nullptr, nullptr,
          c.size(0), c.size(1));
    }
  });
 }
 // Asymmetric W8A8 GEMM with activation zero-point correction.
 //
 // Per-token a_scales (numel != 1): oneDNN computes C_inter = s_b * (A@B) into
 // an fp32 scratch tensor, then dynamic_quant_epilogue applies the row scale,
 // subtracts s_a * s_b * azp * azp_adj and adds the optional bias. `azp` must
 // be provided on this path.
 //
 // Single a_scale (numel == 1): oneDNN computes s_a * s_b * (A@B) (+ bias)
 // into the scratch tensor and static_quant_epilogue subtracts
 // s_a * s_b * azp_adj. `azp` is not read on this path.
 void int8_scaled_mm_azp(torch::Tensor& c,        // [M, OC], row-major
                        const torch::Tensor& a,  // [M, IC], row-major
                        const torch::Tensor& b,  // [IC, OC], column-major
                        const torch::Tensor& a_scales,            // [1] or [M]
                        const torch::Tensor& b_scales,            // [1] or [OC]
                        const torch::Tensor& azp_adj,             // [OC]
                        const std::optional<torch::Tensor>& azp,  // [1] or [M]
                        const std::optional<torch::Tensor>& bias  // [OC]
 ) {
  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp)
  // Checks for conformality
  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
              "int8_scaled_mm_azp only supports INT8 inputs.");
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
              b.size(1) == c.size(1));
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
  // Check for strides and alignment
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
              b.stride(1) % 16 == 0);  // 16 Byte Alignment
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
  if (bias) {
    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous());
  }
  if (azp) {
    TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous());
  }
  // The per-token epilogue below dereferences azp unconditionally, so reject
  // a missing azp here instead of dereferencing an empty optional.
  TORCH_CHECK(a_scales.numel() == 1 || azp.has_value(),
              "int8_scaled_mm_azp requires azp when a_scales is per-token.");
  TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous());
  // azp & bias types
  TORCH_CHECK(azp_adj.dtype() == torch::kInt32);
  TORCH_CHECK(!azp || azp->dtype() == torch::kInt32);
  TORCH_CHECK(!bias || bias->dtype() == c.dtype(),
              "currently bias dtype must match output dtype ", c.dtype());
  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] {
    // fp32 scratch for the GEMM result; both paths finish in an epilogue.
    torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float);
    if (a_scales.numel() != 1) {
      // per-token
      // Note: oneDNN doesn't support per-token activation quantization
      // Compute C_inter=s_b * (A@B)
      DNNLPrimitiveHelper<true>::gemm_s8s8_jit<float, void>(
          a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
          tmp_fp32_out.data_ptr<float>(), nullptr, a.size(0), b.size(1),
          a.size(1), nullptr, b_scales.data_ptr<float>(), 0, b_scales.numel());
      if (bias.has_value()) {
        // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias
        if (b_scales.numel() != 1) {
          // Per-Channel
          dynamic_quant_epilogue<true, true, true>(
              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
              a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
              azp->data_ptr<int32_t>(), azp_adj.data_ptr<int32_t>(),
              bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
        } else {
          // Per-Tensor
          dynamic_quant_epilogue<true, false, true>(
              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
              a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
              azp->data_ptr<int32_t>(), azp_adj.data_ptr<int32_t>(),
              bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
        }
      } else {
        // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj
        if (b_scales.numel() != 1) {
          // Per-Channel
          dynamic_quant_epilogue<true, true, false, scalar_t>(
              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
              a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
              azp->data_ptr<int32_t>(), azp_adj.data_ptr<int32_t>(), nullptr,
              c.size(0), c.size(1));
        } else {
          // Per-Tensor
          dynamic_quant_epilogue<true, false, false, scalar_t>(
              tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
              a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
              azp->data_ptr<int32_t>(), azp_adj.data_ptr<int32_t>(), nullptr,
              c.size(0), c.size(1));
        }
      }
    } else {
      // per-tensor
      if (bias.has_value()) {
        // Compute C_inter=s_a * s_b * (A@B) + bias
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
            tmp_fp32_out.data_ptr<float>(), bias->data_ptr<scalar_t>(),
            a.size(0), b.size(1), a.size(1), a_scales.data_ptr<float>(),
            b_scales.data_ptr<float>(), a_scales.numel(), b_scales.numel());
      } else {
        // Compute C_inter=s_a * s_b * (A@B)
        DNNLPrimitiveHelper<false>::gemm_s8s8_jit<float, void>(
            a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
            tmp_fp32_out.data_ptr<float>(), nullptr, a.size(0), b.size(1),
            a.size(1), a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
            a_scales.numel(), b_scales.numel());
      }
      // Compute C=C_inter - s_a * s_b * azp_adj
      if (b_scales.numel() != 1) {
        // Per-Channel
        static_quant_epilogue<true>(
            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
            *a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
            azp_adj.data_ptr<int32_t>(), a.size(0), b.size(1));
      } else {
        // Per-Tensor
        static_quant_epilogue<false>(
            tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
            *a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
            azp_adj.data_ptr<int32_t>(), a.size(0), b.size(1));
      }
    }
  });
 }
 // static-per-tensor quantization.
 // Validates the tensors (contiguous input/output, single-element scale and
 // optional azp), flattens the input to [num_tokens, hidden_size] and
 // dispatches on the input dtype to static_scaled_int8_quant_impl, selecting
 // the <true> instantiation when azp is provided.
 void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                              const torch::Tensor& input,  // [..., hidden_size]
                              const torch::Tensor& scale,
                              std::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(scale.numel() == 1);
  TORCH_CHECK(!azp.has_value() || azp->numel() == 1);
  const int hidden_size = input.size(-1);
  // A zero-width last dimension would make the division below a
  // divide-by-zero; such an input has no rows to quantize.
  const int num_tokens = hidden_size == 0 ? 0 : input.numel() / hidden_size;
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
        if (azp.has_value()) {
          static_scaled_int8_quant_impl<true>(
              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
              scale.data_ptr<float>(), azp->data_ptr<int32_t>(), num_tokens,
              hidden_size);
        } else {
          static_scaled_int8_quant_impl<false>(
              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
              scale.data_ptr<float>(), nullptr, num_tokens, hidden_size);
        }
      });
 }
 // dynamic-per-token quantization.
 // Validates that input/output are contiguous, flattens the input to
 // [num_tokens, hidden_size] and dispatches on the input dtype to
 // dynamic_scaled_int8_quant_impl, selecting the <true> instantiation when
 // azp is provided. `scale` (and `azp`, if given) are passed as mutable
 // buffers to the implementation.
 void dynamic_scaled_int8_quant(
    torch::Tensor& out,          // [..., hidden_size]
    const torch::Tensor& input,  // [..., hidden_size]
    torch::Tensor& scale,        // [..., 1]
    std::optional<torch::Tensor> const& azp) {
  CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.is_contiguous());
  int const hidden_size = input.size(-1);
  // A zero-width last dimension would make the division below a
  // divide-by-zero; such an input has no rows to quantize.
  int const num_tokens = hidden_size == 0 ? 0 : input.numel() / hidden_size;
  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
        if (azp.has_value()) {
          dynamic_scaled_int8_quant_impl<true>(
              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
              scale.data_ptr<float>(), azp->data_ptr<int32_t>(), num_tokens,
              hidden_size);
        } else {
          dynamic_scaled_int8_quant_impl<false>(
              input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
              scale.data_ptr<float>(), nullptr, num_tokens, hidden_size);
        }
      });
 }
 #if defined(__powerpc64__)
 // Symmetric W8A8 GEMM for ppc64le: C = s_a * s_b * (A @ B) (+ bias).
 // oneDNN always computes C_inter = s_b * (A@B) into an fp32 scratch tensor
 // and dynamic_quant_epilogue then applies the activation scale per row (and
 // the optional bias). A single-element a_scales is broadcast to one value
 // per row before the epilogue runs.
 void int8_scaled_mm_ppc64le(torch::Tensor& c,        // [M, OC], row-major
                            const torch::Tensor& a,  // [M, IC], row-major
                            const torch::Tensor& b,  // [IC, OC], column-major
                            const torch::Tensor& a_scales,
                            const torch::Tensor& b_scales,
                            const std::optional<torch::Tensor>& bias  // [OC]
 ) {
  CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
  // Checks for conformality
  TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
              "int8_scaled_mm_ppc64le only supports INT8 inputs.");
  TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
  TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
              b.size(1) == c.size(1));
  // Scales are either a single value or one per row / column.
  TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
  TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
  // Check for strides and alignment
  TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1);  // Row-major
  TORCH_CHECK(b.stride(0) == 1);                      // Column-major
  TORCH_CHECK(c.stride(0) % 16 == 0 &&
              b.stride(1) % 16 == 0);  // 16 Byte Alignment
  TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
  if (bias) {
    TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
                bias->dim() == 1);
  }
  // dynamic_quant_epilogue reads a_scale[i] for every row i, so a per-tensor
  // scale (numel == 1 with M != 1) must be expanded to M entries first;
  // otherwise the epilogue would read past the end of the scale buffer.
  const torch::Tensor a_scales_per_token =
      (a_scales.numel() == a.size(0))
          ? a_scales
          : a_scales.reshape({1}).expand({a.size(0)}).contiguous();
  VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_ppc64le", [&] {
    torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float);
    // Compute C_inter=s_b * (A@B)
    DNNLPrimitiveHelper<true>::gemm_s8s8_jit<float, void>(
        a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
        tmp_fp32_out.data_ptr<float>(), nullptr, a.size(0), b.size(1),
        a.size(1), nullptr, b_scales.data_ptr<float>(), 0, b_scales.numel());
    if (bias.has_value()) {
      // Compute C=s_a * C_inter + bias
      dynamic_quant_epilogue<false, true, true>(
          tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
          a_scales_per_token.data_ptr<float>(), nullptr, nullptr, nullptr,
          bias->data_ptr<scalar_t>(), c.size(0), c.size(1));
    } else {
      // Compute C=s_a * C_inter
      dynamic_quant_epilogue<false, true, false, scalar_t>(
          tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
          a_scales_per_token.data_ptr<float>(), nullptr, nullptr, nullptr,
          nullptr, c.size(0), c.size(1));
    }
  });
 }
 #endif
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@ -6,26 +6,25 @@
 std::string init_cpu_threads_env(const std::string& cpu_ids);
-void release_dnnl_matmul_handler(int64_t handler);
+void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
                    const torch::Tensor& b, const torch::Tensor& a_scales,
                    const torch::Tensor& b_scales,
                    const std::optional<torch::Tensor>& bias);
-int64_t create_onednn_scaled_mm_handler(const torch::Tensor& b,
+void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
-                                        const torch::Tensor& b_scales,
+                        const torch::Tensor& b, const torch::Tensor& a_scales,
-                                        at::ScalarType output_type,
+                        const torch::Tensor& b_scales,
-                                        bool dynamic_act_quant, bool use_azp,
+                        const torch::Tensor& azp_adj,
-                                        int64_t primitive_cache_size);
+                        const std::optional<torch::Tensor>& azp,
                        const std::optional<torch::Tensor>& bias);
-void onednn_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
+#if defined(__powerpc64__)
-                      const torch::Tensor& a_scales,
+void int8_scaled_mm_ppc64le(torch::Tensor& c, const torch::Tensor& a,
-                      const std::optional<torch::Tensor>& azp,
+                            const torch::Tensor& b,
-                      const std::optional<torch::Tensor>& azp_adj,
+                            const torch::Tensor& a_scales,
-                      const std::optional<torch::Tensor>& bias,
+                            const torch::Tensor& b_scales,
-                      int64_t handler);
+                            const std::optional<torch::Tensor>& bias);
-
+#endif
 int64_t create_onednn_mm_handler(const torch::Tensor& b,
                                 int64_t primitive_cache_size);
 void onednn_mm(torch::Tensor& c, const torch::Tensor& a,
               const std::optional<torch::Tensor>& bias, int64_t handler);
 void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
                        torch::Tensor& kv_cache, double scale,
@ -152,37 +151,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
  // Quantization
-#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__)) || \
+#if defined(__AVX512F__) || (defined(__aarch64__) && !defined(__APPLE__))
    defined(__powerpc64__)
  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
  // Helper function to release oneDNN handlers
  ops.def("release_dnnl_matmul_handler(int handler) -> ()",
          &release_dnnl_matmul_handler);
  // Create oneDNN GEMM handler
  ops.def(
      "create_onednn_mm_handler(Tensor b, int "
      "primitive_cache_size) -> int",
      &create_onednn_mm_handler);
  // oneDNN GEMM
  ops.def(
      "onednn_mm(Tensor! c, Tensor a, Tensor? bias, "
      "int handler) -> ()");
  ops.impl("onednn_mm", torch::kCPU, &onednn_mm);
  // Create oneDNN W8A8 handler
  ops.def(
      "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType "
      "output_type, bool dynamic_act_quant, bool use_azp, int "
      "primitive_cache_size) -> int",
      &create_onednn_scaled_mm_handler);
  // oneDNN scaled_mm for W8A8 with static per-tensor activation quantization
  ops.def(
      "onednn_scaled_mm(Tensor! c, Tensor a, Tensor a_scales, Tensor? azp, "
      "Tensor? azp_adj, Tensor? bias, int handler) -> ()");
  ops.impl("onednn_scaled_mm", torch::kCPU, &onednn_scaled_mm);
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
@ -198,6 +168,50 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      {stride_tag});
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);
  // W8A8 GEMM, supporting symmetric per-tensor or per-row/column
  // quantization.
  ops.def(
      "cutlass_scaled_mm(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
      "                  Tensor b_scales, Tensor? bias) -> ()",
      {stride_tag});
  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
  // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
  // quantization.
  ops.def(
      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
      "                  Tensor b_scales, Tensor azp_adj,"
      "                  Tensor? azp, Tensor? bias) -> ()",
      {stride_tag});
  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #elif defined(__powerpc64__)
  // Compute int8 quantized tensor for given scaling factor.
  ops.def(
      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
      "Tensor? azp) -> ()");
  ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
  // Compute int8 quantized tensor and scaling factor
  ops.def(
      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
      "Tensor!? azp) -> ()");
  ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
           &dynamic_scaled_int8_quant);
  // W8A8 GEMM, supporting symmetric quantization.
  ops.def(
      "cutlass_scaled_mm(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
      "                  Tensor b_scales, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm_ppc64le);
  // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
  // quantization.
  ops.def(
      "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
      "                  Tensor b, Tensor a_scales,"
      "                  Tensor b_scales, Tensor azp_adj,"
      "                  Tensor? azp, Tensor? bias) -> ()");
  ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
 #endif
 // SHM CCL
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@ -15,8 +15,6 @@ typedef __hip_bfloat16 nv_bfloat16;
 #include <map>
 #include <unordered_map>
 #include <vector>
 #include <cstdlib>
 #include <cstring>
 namespace vllm {
 #define CUDACHECK(cmd)                                              \
@ -557,47 +555,22 @@ class CustomAllreduce {
    size /= d;
    auto bytes = size * sizeof(typename packed_t<T>::P);
    int blocks = std::min(block_limit, (size + threads - 1) / threads);
    // Check environment variable once
    const char* env_algo = std::getenv("VLLM_CUSTOM_ALLREDUCE_ALGO");
    bool force_1stage = false;
    bool force_2stage = false;
    if (env_algo != nullptr) {
      if (std::strcmp(env_algo, "1stage") == 0 ||
          std::strcmp(env_algo, "oneshot") == 0) {
        force_1stage = true;
      } else if (std::strcmp(env_algo, "2stage") == 0 ||
                 std::strcmp(env_algo, "twoshot") == 0) {
        force_2stage = true;
      } else {
        throw std::runtime_error(
            "Invalid VLLM_CUSTOM_ALLREDUCE_ALGO: " + std::string(env_algo) +
            ". Valid values: 1stage, oneshot, 2stage, twoshot");
      }
    }
 #define KL(ngpus, name)                                                       \
  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                 rank_, size);
-#define REDUCE_CASE(ngpus)                              \
+#define REDUCE_CASE(ngpus)                            \
-  case ngpus: {                                         \
+  case ngpus: {                                       \
-    if (force_1stage) {                                 \
+    if (world_size_ == 2) {                           \
-      KL(ngpus, cross_device_reduce_1stage);            \
+      KL(ngpus, cross_device_reduce_1stage);          \
-    } else if (force_2stage) {                          \
+    } else if (fully_connected_) {                    \
-      KL(ngpus, cross_device_reduce_2stage);            \
+      if ((world_size_ <= 4 && bytes < 512 * 1024) || \
-    } else {                                            \
+          (world_size_ <= 8 && bytes < 256 * 1024)) { \
-      if (world_size_ == 2) {                           \
+        KL(ngpus, cross_device_reduce_1stage);        \
-        KL(ngpus, cross_device_reduce_1stage);          \
+      } else {                                        \
-      } else if (fully_connected_) {                    \
+        KL(ngpus, cross_device_reduce_2stage);        \
-        if ((world_size_ <= 4 && bytes < 512 * 1024) || \
+      }                                               \
-            (world_size_ <= 8 && bytes < 256 * 1024)) { \
+    }                                                 \
-          KL(ngpus, cross_device_reduce_1stage);        \
+    break;                                            \
        } else {                                        \
          KL(ngpus, cross_device_reduce_2stage);        \
        }                                               \
      }                                                 \
    }                                                   \
    break;                                              \
  }
    switch (world_size_) {
--- a/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/collective_builder.hpp
@ -0,0 +1,123 @@
 // Modified from: cutlass/gemm/collective/builders/sm90_gmma_builder.inl
 // clang-format off
 #pragma once
 #include "cutlass/gemm/collective/builders/sm90_gmma_builder.inl"
 #include "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp"
 /////////////////////////////////////////////////////////////////////////////////////////////////
 namespace cutlass::gemm::collective {
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // GMMA_TMA_WS_SS (BlockScaled Builders)
 // Partial specialization of CollectiveBuilder for arch::Sm90 /
 // arch::OpClassTensorOp with the
 // KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>
 // kernel schedule. It is enabled only when detail::is_use_rmem_A<...>() is
 // false, and exposes `CollectiveOp`: a CollectiveMma configured with the
 // MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8 dispatch policy.
 template <
  class ElementA,
  class GmemLayoutATag,
  int AlignmentA,
  class ElementB,
  class GmemLayoutBTag,
  int AlignmentB,
  class ElementAccumulator,
  class TileShape_MNK,
  class ClusterShape_MNK,
  class StageCountType,
  int ScaleGranularityM
 >
 struct CollectiveBuilder<
    arch::Sm90,
    arch::OpClassTensorOp,
    ElementA,
    GmemLayoutATag,
    AlignmentA,
    ElementB,
    GmemLayoutBTag,
    AlignmentB,
    ElementAccumulator,
    TileShape_MNK,
    ClusterShape_MNK,
    StageCountType,
    KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>,
    cute::enable_if_t<
      not detail::is_use_rmem_A<ElementA, GmemLayoutATag, ElementB, GmemLayoutBTag>()>
 > {
  using KernelScheduleType = KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>;
  // Tile and cluster shapes must be compile-time constants.
  static_assert(is_static<TileShape_MNK>::value);
  static_assert(is_static<ClusterShape_MNK>::value);
 #ifndef CUTLASS_SM90_COLLECTIVE_BUILDER_SUPPORTED
  static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
 #endif
  static_assert(detail::is_aligned<ElementA, AlignmentA, ElementB, AlignmentB, detail::tma_alignment_bytes>(),
                "Should meet TMA alignment requirement\n");
  // True only if the schedule is one of the listed pointer-array schedules.
  // NOTE(review): KernelScheduleType is fixed to the block-scaled schedule
  // above, so this presumably always evaluates to false here - confirm.
  static constexpr bool IsArrayOfPointersGemm = (cute::is_any_of_v<KernelScheduleType,
                                                                   KernelPtrArrayTmaWarpSpecializedCooperative,
                                                                   KernelPtrArrayTmaWarpSpecializedPingpong>);
  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
  // Rejects the combination of FP8 inputs with a pointer-array schedule.
  static_assert((!IsFP8Input || !IsArrayOfPointersGemm),
                "KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum is only compatible with FP8 Blocked Scaled version right now.");
  // For fp32 types, map to tf32 MMA value type
  using ElementAMma = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
  using ElementBMma = cute::conditional_t<cute::is_same_v<ElementB, float>, tfloat32_t, ElementB>;
  static constexpr cute::GMMA::Major GmmaMajorA = detail::gmma_ss_tag_to_major_A<ElementAMma, GmemLayoutATag>();
  static constexpr cute::GMMA::Major GmmaMajorB = detail::gmma_ss_tag_to_major_B<ElementBMma, GmemLayoutBTag>();
  // Cooperative schedules use a 2x1x1 MMA atom layout; others use 1x1x1.
  static constexpr bool IsCooperative = cute::is_any_of_v<KernelScheduleType,
                                                          KernelTmaWarpSpecializedCooperative,
                                                          KernelPtrArrayTmaWarpSpecializedCooperative,
                                                          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<ScaleGranularityM>>;
  using AtomLayoutMNK = cute::conditional_t<IsCooperative,
      Layout<Shape<_2,_1,_1>>, Layout<Shape<_1,_1,_1>>>;
  using TiledMma = decltype(cute::make_tiled_mma(cute::GMMA::ss_op_selector<
      ElementAMma, ElementBMma, ElementAccumulator, TileShape_MNK, GmmaMajorA, GmmaMajorB>(), AtomLayoutMNK{}));
  // The copy atom for A is chosen from cluster mode 1 and the one for B from
  // cluster mode 0.
  using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape_MNK{})));
  using GmemTiledCopyB = decltype(detail::sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape_MNK{})));
  // Shared-memory layout atoms: A is selected from tile modes (0, 2), B from
  // tile modes (1, 2).
  using SmemLayoutAtomA = decltype(detail::ss_smem_selector<
      GmmaMajorA, ElementAMma, decltype(cute::get<0>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
  using SmemLayoutAtomB = decltype(detail::ss_smem_selector<
      GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>());
  // Shared memory reserved for TMA descriptors (pointer-array schedules only);
  // it is subtracted from the capacity used to compute the stage count.
  static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0;
  static constexpr int KernelSmemCarveout = static_cast<int>(TensorMapStorage);
  static constexpr int PipelineStages = detail::compute_stage_count_or_override<detail::sm90_smem_capacity_bytes - KernelSmemCarveout,
      ElementAMma, ElementBMma, TileShape_MNK>(StageCountType{});
  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType, ScaleGranularityM>;
  // No shared-memory copy atoms are supplied for either operand.
  using SmemCopyAtomA = void;
  using SmemCopyAtomB = void;
  // The collective mainloop type produced by this builder.
  using CollectiveOp = CollectiveMma<
      DispatchPolicy,
      TileShape_MNK,
      ElementA,
      TagToStrideA_t<GmemLayoutATag>,
      ElementB,
      TagToStrideB_t<GmemLayoutBTag>,
      TiledMma,
      GmemTiledCopyA,
      SmemLayoutAtomA,
      SmemCopyAtomA,
      cute::identity,
      GmemTiledCopyB,
      SmemLayoutAtomB,
      SmemCopyAtomB,
      cute::identity
    >;
 };
 /////////////////////////////////////////////////////////////////////////////////////////////////
 }  // namespace cutlass::gemm::collective
 /////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
@ -0,0 +1,183 @@
 // clang-format off
 // adapted from: https://github.com/soundOfDestiny/cutlass/blob/a4208aa6958864923505cade9c63eb2a6daf16e5/include/cutlass/gemm/collective/fp8_accumulation.hpp
 /***************************************************************************************************
 * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
 #pragma once
 #include "cute/algorithm/clear.hpp"
 #include "cute/tensor.hpp"
 //////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////////FP8 Accumulation///////////////////////////
 //////////////////////////////////////////////////////////////////////////////
 /// This class provides API to promote (add) or scale (multiply_add) the results
 /// from the tensor core accumulators to the main accumulators when the number 
 /// of MMAs reaches the max number of MMA interval specified by user, after that
 /// the tensor core accumulators are zeroed.
 //////////////////////////////////////////////////////////////////////////////
 namespace cutlass::gemm::collective {
 /// Two-level accumulator helper for FP8 GEMM mainloops.
 ///
 /// Holds a reference to the caller's main accumulator (`accum_`) plus a temporary
 /// accumulator of the same type (`accum_temp_`, created with `make_fragment_like`).
 /// `operator()` hands out the temporary, so that is the tensor the MMAs are expected
 /// to write into. After every mainloop iteration the caller invokes either
 /// `promote_if_needed()` (plain add) or `scale_if_needed()` (multiply by a scale
 /// tensor, then add); once the running MMA count equals the promotion interval the
 /// temporary is folded into the main accumulator.
 ///
 /// NOTE(review): nothing in this class clears `accum_temp_`; `prepare_if_needed()`
 /// only reports the flag, so zeroing is presumably left to the caller.
 template <
    class EngineAccum,
    class LayoutAccum>
 struct GmmaFP8AccumulationWithScale {  
  using TensorAccum = cute::Tensor<EngineAccum, LayoutAccum>;
  using ElementAccumulator = typename EngineAccum::value_type;
  static_assert(is_static<LayoutAccum>::value, "Accumulator Layout should be static");
  static_assert(is_rmem<TensorAccum>::value , "Accumulator tensor must be rmem resident.");
 private:
  TensorAccum& accum_;                        // main accumulator, owned by the caller (held by reference).
  TensorAccum accum_temp_;                    // temporary accumulator handed out by operator().
  uint32_t accum_promotion_interval_;         // defines the max num of executed MMAs after which accum should be promoted.
  uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop
  uint32_t mma_count_;                        // current executed MMAs
  uint32_t reset_accum_flag_;                 // non-zero when the last *_if_needed() call folded accum_temp_ into accum_.
  // promote or `add` the partial accumulators to main accumulator (FADD).
  // `warpgroup_wait<0>()` is called first; presumably this waits for all outstanding
  // warpgroup MMAs so that accum_temp_ holds final values before it is read.
  CUTLASS_DEVICE
  void promote_core() {
    warpgroup_wait<0>();
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < size(accum_); ++i) {
      accum_(i) += accum_temp_(i);
    }
  }
  // `multiply` scale the partial accumulators and `add` to main accumulator (FFMA).
  // `scale` must be a static-layout, register-resident tensor with exactly the same
  // shape as the accumulator (all three enforced by the static_asserts below); it is
  // applied element by element.
  template <
    class EngineScale,
    class LayoutScale>
  CUTLASS_DEVICE
  void scale_core(const cute::Tensor<EngineScale, LayoutScale> &scale) {
    using TensorScale = cute::Tensor<EngineScale, LayoutScale>;
    static_assert(is_static<LayoutScale>::value, "Scale Layout should be static");
    static_assert(is_rmem<TensorScale>::value , "Scale tensor must be rmem resident.");
    static_assert(LayoutAccum{}.shape() == LayoutScale{}.shape(), "Accumulator and scale must have same shape.");
    warpgroup_wait<0>();
    CUTLASS_PRAGMA_UNROLL
    for (int i = 0; i < size(accum_); ++i) {
      accum_(i) += accum_temp_(i) * scale(i);
    }
  }
 public:
  /// Binds the helper to the caller's main accumulator.
  /// @param accum                            main accumulator; must outlive this object (stored by reference).
  /// @param accum_promotion_interval         MMA count at which the temporary is folded into `accum`.
  /// @param mma_count_per_mainloop_iteration amount added to the MMA counter on every *_if_needed() call.
  CUTLASS_DEVICE
  GmmaFP8AccumulationWithScale(
      TensorAccum &accum,
      uint32_t accum_promotion_interval,
      uint32_t mma_count_per_mainloop_iteration)
      : accum_(accum), 
        accum_promotion_interval_(accum_promotion_interval),
        mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration),
        mma_count_(0), 
        reset_accum_flag_(0) 
  {
    // The temporary gets the same layout/element type as the main accumulator.
    accum_temp_ = cute::make_fragment_like(accum);
  }
  //
  // Methods (Common)
  //
  /// Returns the temporary accumulator (not the main one).
  CUTLASS_DEVICE 
  TensorAccum& operator()() {
    return accum_temp_;
  }
  /// Reports whether the most recent promote_if_needed()/scale_if_needed() call folded the
  /// temporary into the main accumulator. This function itself modifies nothing.
  CUTLASS_DEVICE
  bool prepare_if_needed() { 
    return reset_accum_flag_;
  }
  //
  // Methods (for FADD version)
  //
  /// promote (add) the results from the MMA accumulators to main accumulator if needed.
  /// The decision is taken from lane 0 of the warp via `__shfl_sync`, so all 32 lanes
  /// take the same branch.
  /// NOTE(review): the trigger is an equality test; if the interval is not a multiple of
  /// the per-iteration MMA count the counter steps over it and never promotes here.
  CUTLASS_DEVICE
  void promote_if_needed() {
    mma_count_ += mma_count_per_mainloop_iteration_;
    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
    if (reset_accum_flag_) {
      promote_core();
      mma_count_ = 0;
    }
  }
  /// promote (add) the residue results from the MMA accumulators to main accumulator if needed.
  /// Runs when lane 0 still has a non-zero MMA count. Neither the counter nor the reset
  /// flag is updated here.
  CUTLASS_DEVICE
  void promote_residue_if_needed() {
    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
      promote_core();
    }
  }
  //
  // Methods (for FFMA version)
  //
  /// scale (multiply_add) the results from the MMA accumulators to main accumulator if needed.
  /// Same counting and lane-0 broadcast logic as promote_if_needed(), but folds with
  /// scale_core(scale) instead of a plain add.
  template <
    class EngineScale,
    class LayoutScale>
  CUTLASS_DEVICE
  void scale_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
    mma_count_ += mma_count_per_mainloop_iteration_;
    reset_accum_flag_ = __shfl_sync(0xffffffff, mma_count_ == accum_promotion_interval_, 0);
    if (reset_accum_flag_) {
      scale_core(scale);
      mma_count_ = 0;
    }
  }
  /// scale (multiply_add) the residue results from the MMA accumulators to main accumulator if needed.
  /// Runs when lane 0 still has a non-zero MMA count. Neither the counter nor the reset
  /// flag is updated here.
  template <
    class EngineScale,
    class LayoutScale>
  CUTLASS_DEVICE
  void scale_residue_if_needed(const cute::Tensor<EngineScale, LayoutScale> &scale) {
    if (__shfl_sync(0xffffffff, mma_count_ > 0, 0)) {
      scale_core(scale);
    }
  }
 };
 } // namespace cutlass::gemm::collective
--- a/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ b/csrc/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@ -0,0 +1,729 @@
 // clang-format off
 // Adapted (Heavily) from: https://github.com/soundOfDestiny/cutlass/blob/9d997ce0dea4c5fa1a617db6b7ff29aa9235822c/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
 /***************************************************************************************************
 * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************************************/
 #pragma once
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/dispatch_policy.hpp"
 #include "cutlass/trace.h"
 #include "cutlass/numeric_types.h"
 #include "cute/arch/cluster_sm90.hpp"
 #include "cute/arch/copy_sm80.hpp"
 #include "cute/arch/copy_sm90.hpp"
 #include "cute/algorithm/functional.hpp"
 #include "cute/atom/mma_atom.hpp"
 #include "cute/algorithm/gemm.hpp"
 #include "cute/numeric/arithmetic_tuple.hpp"
 #include "cutlass_extensions/gemm/dispatch_policy.hpp"
 #include "cutlass_extensions/gemm/collective/fp8_accumulation.hpp"
 /////////////////////////////////////////////////////////////////////////////////////////////////
 namespace cutlass::gemm::collective {
 using namespace cute;
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // WarpSpecialized Mainloop
 template <
  int Stages,
  class ClusterShape,
  class KernelSchedule,
  int ScaleGranularityM_,
  class TileShape_,
  class ElementA_,
  class StrideA_,
  class ElementB_,
  class StrideB_,
  class TiledMma_,
  class GmemTiledCopyA_,
  class SmemLayoutAtomA_,
  class SmemCopyAtomA_,
  class TransformA_,
  class GmemTiledCopyB_,
  class SmemLayoutAtomB_,
  class SmemCopyAtomB_,
  class TransformB_>
 struct CollectiveMma<
    MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>,
    TileShape_,
    ElementA_,
    StrideA_,
    ElementB_,
    StrideB_,
    TiledMma_,
    GmemTiledCopyA_,
    SmemLayoutAtomA_,
    SmemCopyAtomA_,
    TransformA_,
    GmemTiledCopyB_,
    SmemLayoutAtomB_,
    SmemCopyAtomB_,
    TransformB_>
 {
  //
  // Type Aliases
  //
  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8<Stages, ClusterShape, KernelSchedule, ScaleGranularityM_>;
  using TileShape = TileShape_;
  using ElementA = ElementA_;
  using StrideA = StrideA_;
  using ElementB = ElementB_;
  using StrideB = StrideB_;
  using TiledMma = TiledMma_;
  using ElementAccumulator = typename TiledMma::ValTypeC;
  using ElementBlockScale = ElementAccumulator;
  using GmemTiledCopyA = GmemTiledCopyA_;
  using GmemTiledCopyB = GmemTiledCopyB_;
  using SmemLayoutAtomA = SmemLayoutAtomA_;
  using SmemLayoutAtomB = SmemLayoutAtomB_;
  using SmemCopyAtomA = SmemCopyAtomA_;
  using SmemCopyAtomB = SmemCopyAtomB_;
  using TransformA = TransformA_;
  using TransformB = TransformB_;
  using ArchTag = typename DispatchPolicy::ArchTag;
  using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
  using PipelineParams = typename MainloopPipeline::Params;
  // 33 producer arrival events per CTA (1 for the operand tiles and 32 for the scales)
  static constexpr int NumProducerThreadEvents = 33; 
  static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_;
  static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
  static_assert((size<0>(TileShape{}) % ScaleGranularityM) == 0, "FP8 scaling granularity must evenly divide tile shape along M.");
  // Tile along modes in a way that maximizes the TMA box size.
  using SmemLayoutA = decltype(tile_to_shape(
      SmemLayoutAtomA{},
      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideA>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
  using SmemLayoutB = decltype(tile_to_shape(
      SmemLayoutAtomB{},
      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
      cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
  // Block scaling gmem-to-smem copy atom 
  using SmemBlockScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
  using SmemBlockScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
  // Block scaling smem layout
  using SmemLayoutScaleA = Layout<Shape<Int<ScaleMsPerTile>, Int<DispatchPolicy::Stages>>>;
  using SmemLayoutScaleB = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>; // `ScaleNsPerTile` is always 1.
  // This specialization needs at least two pipeline stages; the diagnostic states the same bound as the condition.
  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
  static_assert(cute::is_same_v<ElementAccumulator, ElementBlockScale>,
             "ElementAccumulator and ElementBlockScale should be same datatype");
  // Shared-memory footprint of this collective: the staged operand/scale buffers
  // (`tensors`) followed by the storage required by the mainloop pipeline (`pipeline`).
  struct SharedStorage
  {
    // Each array is sized by the cosize of the corresponding smem layout; those layouts
    // carry DispatchPolicy::Stages as their last mode, so every buffer holds all stages.
    struct TensorStorage : cute::aligned_struct<128> {
      cute::array_aligned<typename TiledMma::ValTypeA, cute::cosize_v<SmemLayoutA>> smem_A;  // mxk
      cute::array_aligned<typename TiledMma::ValTypeB, cute::cosize_v<SmemLayoutB>> smem_B;  // nxk
      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleA>> smem_scale_A; // ScaleMsPerTile x k
      cute::array_aligned<ElementBlockScale, cute::cosize_v<SmemLayoutScaleB>> smem_scale_B; // 1xk
    } tensors;
    // Storage type dictated by the pipeline implementation (MainloopPipeline).
    using PipelineStorage = typename MainloopPipeline::SharedStorage;
    PipelineStorage pipeline;
  };
  using TensorStorage = typename SharedStorage::TensorStorage;
  using PipelineStorage = typename SharedStorage::PipelineStorage;
  // Host side kernel arguments.
  // Plain aggregate; to_underlying_arguments() turns it into the device-side Params.
  struct Arguments {
    ElementA const* ptr_A;                 // base pointer of operand A
    StrideA dA;                            // stride of A, combined with shape (M,K,L)
    ElementB const* ptr_B;                 // base pointer of operand B
    StrideB dB;                            // stride of B, combined with shape (N,K,L)
    ElementBlockScale const* ptr_scale_A;  // scale factors for A (forwarded unchanged to Params)
    ElementBlockScale const* ptr_scale_B;  // scale factors for B (forwarded unchanged to Params)
  };
  // Device side kernel params.
  // NOTE: to_underlying_arguments() fills this aggregate positionally with a braced
  // list, so the member order below must stay in sync with that return statement.
  struct Params {
    // Assumption: StrideA is congruent with Problem_MK
    // The TMA copy type for A is deduced from a placeholder tensor (null pointer, zero
    // extents shaped like StrideA) and a single stage of the A smem layout.
    using TMA_A = decltype(make_tma_copy_A_sm90(
        GmemTiledCopyA{},
        make_tensor(static_cast<ElementA const*>(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}),
        SmemLayoutA{}(_,_,0),
        TileShape{},
        ClusterShape{}));
    // Assumption: StrideB is congruent with Problem_NK
    // Same construction as TMA_A, for operand B.
    using TMA_B = decltype(make_tma_copy_B_sm90(
        GmemTiledCopyB{},
        make_tensor(static_cast<ElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
        SmemLayoutB{}(_,_,0),
        TileShape{},
        ClusterShape{}));
    TMA_A tma_load_a;
    TMA_B tma_load_b;
    // Byte counts for one pipeline stage: A + B combined, A only, B only.
    uint32_t tma_transaction_bytes = TmaTransactionBytes;
    uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
    uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
    // Block scaling factors for A and B
    ElementBlockScale const* ptr_scale_A; 
    ElementBlockScale const* ptr_scale_B;
  };
  //
  // Methods
  //
  /// Builds the device-side Params from the host-side Arguments.
  ///
  /// @param problem_shape  (M,N,K) or (M,N,K,L) problem extents.
  /// @param args           operand pointers/strides and the two scale pointers.
  /// @param workspace      ignored by this collective.
  /// @return Params holding the TMA copy objects for A and B, the per-stage transaction
  ///         byte counts and the scale pointers copied from `args`.
  template <class ProblemShape>
  static constexpr Params
  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
    static_cast<void>(workspace);

    // A rank-3 (MNK) shape is padded with a trailing 1 so it can always be read as MNKL.
    auto [extent_m, extent_n, extent_k, extent_l] = append<4>(problem_shape, 1);

    // Global-memory views of the operands: A is (M,K,L), B is (N,K,L).
    Tensor gmem_a = make_tensor(
        reinterpret_cast<ElementA const*>(args.ptr_A),
        make_layout(make_shape(extent_m, extent_k, extent_l), args.dA));
    Tensor gmem_b = make_tensor(
        reinterpret_cast<ElementB const*>(args.ptr_B),
        make_layout(make_shape(extent_n, extent_k, extent_l), args.dB));

    // TMA copy objects, each described against stage 0 of the matching smem layout.
    typename Params::TMA_A copy_a = make_tma_copy_A_sm90(
        GmemTiledCopyA{}, gmem_a, SmemLayoutA{}(_,_,cute::Int<0>{}), TileShape{}, ClusterShape{});
    typename Params::TMA_B copy_b = make_tma_copy_B_sm90(
        GmemTiledCopyB{}, gmem_b, SmemLayoutB{}(_,_,cute::Int<0>{}), TileShape{}, ClusterShape{});

    // Bytes moved per stage for A, for B, and for both together.
    uint32_t bytes_a = TmaTransactionBytesMK;
    uint32_t bytes_b = TmaTransactionBytesNK;
    uint32_t bytes_total = bytes_a + bytes_b;

    // Positional initialization: order must match the member order of Params.
    return {
      copy_a,
      copy_b,
      bytes_total,
      bytes_a,
      bytes_b,
      args.ptr_scale_A,
      args.ptr_scale_B
    };
  }
  /// Host-side feasibility check: verifies that A (M,K,L) and B (N,K,L) satisfy the
  /// 128-bit alignment used for TMA, expressed in elements of the respective type.
  /// Emits a trace message and returns false when either operand fails the check.
  /// `args` is not inspected.
  template<class ProblemShape>
  static bool
  can_implement(
      ProblemShape const& problem_shape,
      [[maybe_unused]] Arguments const& args) {
    constexpr int tma_alignment_bits = 128;
    constexpr int min_elems_a = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
    constexpr int min_elems_b = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;

    // Treat a rank-3 problem as MNKL with L == 1.
    auto [M,N,K,L] = append<4>(problem_shape, 1);

    // B is only checked when A already passed (same short-circuit as a chained &&).
    bool const a_aligned =
        cutlass::detail::check_alignment<min_elems_a>(cute::make_shape(M,K,L), StrideA{});
    bool const all_aligned = a_aligned &&
        cutlass::detail::check_alignment<min_elems_b>(cute::make_shape(N,K,L), StrideB{});

    if (all_aligned) {
      return true;
    }
    CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
    return false;
  }
  // Pipeline depth equals the number of stages of the dispatch policy.
  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
  static constexpr int K_PIPE_MMAS = 1;
  // Bytes of one pipeline stage of A: the first two modes of SmemLayoutA times the
  // element width in bits, converted to bytes.
  static constexpr uint32_t TmaTransactionBytesMK =
        cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast<uint32_t>(sizeof_bits<ElementA>::value));
  // Same computation for one pipeline stage of B.
  static constexpr uint32_t TmaTransactionBytesNK =
        cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast<uint32_t>(sizeof_bits<ElementB>::value));
  // Combined A + B bytes per stage.
  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
  /// Prefetches the TMA descriptors of the A load and then the B load.
  /// Ideally issued from a single thread for best performance.
  CUTLASS_DEVICE
  static void prefetch_tma_descriptors(Params const& mainloop_params)
  {
    auto const* descriptor_a = mainloop_params.tma_load_a.get_tma_descriptor();
    cute::prefetch_tma_descriptor(descriptor_a);

    auto const* descriptor_b = mainloop_params.tma_load_b.get_tma_descriptor();
    cute::prefetch_tma_descriptor(descriptor_b);
  }
  /// Set up the data needed by this collective for load and mma.
  /// Returns a tuple of tensors. The contract between the collective and the kernel layer is
  /// that the returned tuple contains at least two elements, the first two being:
  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
  /// This collective appends two more elements for block scaling:
  /// mScaleA_mkl - gmem view of the A scales, shape (M / ScaleGranularityM, k_tiles, L)
  /// mScaleB_nkl - gmem view of the B scales, shape (n_tiles, k_tiles, L)
  template <class ProblemShape_MNKL>
  CUTLASS_DEVICE auto
  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
    using X = Underscore;
    // Separate out problem shape for convenience
    auto [M,N,K,L] = problem_shape_MNKL;
    // TMA requires special handling of strides to deal with coord codomain mapping
    // Represent the full tensors -- get these from TMA
    Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(make_shape(M,K,L));                            // (m,k,l)
    Tensor mB_nkl = mainloop_params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                            // (n,k,l)
    // Make tiled views, defer the slice
    Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{});        // (BLK_M,BLK_K,m,k,l)
    Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{});        // (BLK_N,BLK_K,n,k,l)
    // Tile counts along N and K, taken from the tiled operand views.
    auto tN = get<2>(gB_nkl.shape());
    auto tK = get<3>(gA_mkl.shape());
    // Make the views of the scale tensors.
    // A has one scale per ScaleGranularityM rows and per k tile; scale_m is the fastest mode.
    // NOTE(review): M / ScaleGranularityM is an integer division, so trailing rows of an M
    // that is not a multiple of ScaleGranularityM get no scale entry -- confirm callers pad M.
    auto scaleA_shape = make_shape(M / ScaleGranularityM, tK, L); // (scale_m,k,l)
    auto scaleA_layout = make_ordered_layout(scaleA_shape,  Step<_0, _1, _2>{});
    // B has one scale per (n tile, k tile); k is the fastest mode.
    auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l)
    auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{});
    // The scale tensors are already laid out per block by the host, so the gmem views
    // mScaleA_mkl and mScaleB_nkl are returned as-is (no further tiling is applied here).
    Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l)
    Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l)
    return cute::make_tuple(gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl);
  }
  /// Perform a collective-scoped matrix multiply-accumulate
  /// Producer Perspective
  ///
  /// For each of `k_tile_count` k tiles: acquires a pipeline stage, issues the TMA copies
  /// of the A and B tiles (from the elected lane only) plus the tiled copies of the A and
  /// B scales into that stage, commits the stage and advances to the next one.
  ///
  /// `load_inputs` is the 4-tuple produced by load_init(): (gA_mkl, gB_nkl, mScaleA_mkl, mScaleB_nkl).
  /// NOTE(review): the `thread_idx` parameter is not used; the scale copies are sliced
  /// with `threadIdx.x` directly.
  template <
    class TensorA, class TensorB,
    class TensorScaleA, class TensorScaleB,
    class KTileIterator, class BlockCoord
  >
  CUTLASS_DEVICE void
  load(
      Params const& mainloop_params,
      MainloopPipeline pipeline,
      PipelineState smem_pipe_write,
      cute::tuple<TensorA, TensorB, TensorScaleA, TensorScaleB> const& load_inputs,
      BlockCoord const& blk_coord,
      KTileIterator k_tile_iter, int k_tile_count,
      int thread_idx,
      uint32_t block_rank_in_cluster,
      TensorStorage& shared_tensors) {
    // Result of elect_one_sync(); used below to gate the two TMA copies.
    int lane_predicate = cute::elect_one_sync();
    // Blockscaling: Tma loads for load_input and CpAsync for load_scale
    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
    Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (ScaleMsPerTile,k)
    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
    //
    // Prepare the TMA loads for A and B
    //
    // Decompose the linear block rank into (x, y) using the cluster extent along mode 0.
    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
    uint2 cluster_local_block_id = {block_rank_in_cluster % cluster_shape_x, block_rank_in_cluster / cluster_shape_x};
    Tensor gA_mkl = get<0>(load_inputs);
    Tensor gB_nkl = get<1>(load_inputs);
    // A is sliced by the block's y coordinate, B by its x coordinate.
    auto block_tma_a = mainloop_params.tma_load_a.get_slice(cluster_local_block_id.y);
    auto block_tma_b = mainloop_params.tma_load_b.get_slice(cluster_local_block_id.x);
    // Partition the inputs based on the current block coordinates.
    auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;
    Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
    Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
    // Block scaling: load_scale has scaling tensors in global memory which are not tiled
    Tensor mScaleA_mkl = get<2>(load_inputs);
    Tensor mScaleB_nkl = get<3>(load_inputs);
    // Total number of A scale rows; used as the bound of the predicate further down.
    auto scales_m = get<0>(mScaleA_mkl.shape());
    // Identity (coordinate) tensor with the shape of the A scales; tiled exactly like
    // gScaleA so each copied element's global scale_m index can be bounds-checked.
    Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape());
    Tensor gScaleA = local_tile( 
      mScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
      make_coord(m_coord,_,l_coord));                   // (ScaleMsPerTile,k,1)
    Tensor cScaleA = local_tile( 
      cScaleA_mkl, make_tile(Int<ScaleMsPerTile>{}), 
      make_coord(m_coord,_,l_coord));
    Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord);                                           // (1,k,1)
    // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128
    // The A-scale copy is tiled over 32 threads with one value each; the B-scale copy
    // uses a single thread with a single value.
    TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, 
      Layout<Shape<_32>>{}, Layout<Shape<_1>>{}); // (1,1,1)
    TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, 
      Layout<Shape<_1>>{}, Layout<Shape<_1>>{}); // (1,1,1)
    ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x);
    ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x);
    Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA);
    Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA);
    Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA);
    Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB);
    Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB);
    // Applies the mapping from block_tma_a
    Tensor tAgA = block_tma_a.partition_S(gA);                                              // (TMA,TMA_M,TMA_K,k)
    Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
    Tensor tBgB = block_tma_b.partition_S(gB);                                              // (TMA,TMA_N,TMA_K,k)
    Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
    // Multicast masks stay 0 unless the corresponding copy atom is the MULTICAST variant.
    uint16_t mcast_mask_a = 0;
    uint16_t mcast_mask_b = 0;
    // Issue TmaLoads for GEMM operands A/B and CpAsync for scale tensors
    // Maps the tile -> block, value
    if constexpr (cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>) {
      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
      // A: set one bit per block sharing this block's x coordinate (all n).
      for (int n = 0; n < size<1>(block_layout); ++n) {
        mcast_mask_a |= (uint16_t(1) << block_layout(cluster_local_block_id.x,n,Int<0>{}));
      }
    }
    if constexpr (cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>) {
      auto block_layout = Layout<typename DispatchPolicy::ClusterShape>{};                       // (m,n) -> block_id
      // B: set one bit per block sharing this block's y coordinate (all m).
      for (int m = 0; m < size<0>(block_layout); ++m) {
        mcast_mask_b |= (uint16_t(1) << block_layout(m,cluster_local_block_id.y,Int<0>{}));
      }
    }
    // Allocate predicate tensors for a_scales (since we can't guarantee that
    // all scales are valid, since we could have partial tiles along M)
    Tensor tApA_ScaleA = make_tensor<bool>(shape(tAsA_ScaleA(_,_,0)));
    #pragma unroll
    for (int i = 0; i < size(tApA_ScaleA); ++i) {
      // An element is copied only if its global scale_m coordinate is in range.
      tApA_ScaleA(i) = get<0>(tAcA_ScaleA(i)) < scales_m;
    }
    // Mainloop
    CUTLASS_PRAGMA_NO_UNROLL
    for ( ; k_tile_count > 0; --k_tile_count) {
      // LOCK smem_pipe_write for _writing_
      pipeline.producer_acquire(smem_pipe_write);
      //
      // Copy gmem to smem for *k_tile_iter
      //
      int write_stage = smem_pipe_write.index();
      using BarrierType = typename MainloopPipeline::ProducerBarrierType;
      BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);
      // Copy operands A and B from global memory to shared memory
      if (lane_predicate) copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
      if (lane_predicate) copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
      // Copy scale tensors from global memory to shared memory
      // (A is predicated by tApA_ScaleA; B is copied unconditionally.)
      copy_if(scale_copy_a, tApA_ScaleA, tAgA_ScaleA(_,_,*k_tile_iter), tAsA_ScaleA(_,_,write_stage));
      copy(scale_copy_b, tBgB_ScaleB(_,*k_tile_iter), tBsB_ScaleB(_,write_stage));
      // Commit the stage, passing the cp.async "arrive, no increment" barrier functor.
      pipeline.producer_commit(smem_pipe_write, cutlass::arch::cpasync_barrier_arrive_noinc);
      ++k_tile_iter;
      // Advance smem_pipe_write
      ++smem_pipe_write;
    }
  }
  /// Producer epilogue that keeps a block from exiting early while other blocks of the
  /// cluster may still depend on it.
  ///
  /// Only the lane selected by elect_one_sync() issues the tail wait. The wait covers
  /// every stage: a used stage must have been released by all consumers, while a stage
  /// that was never used is acquired immediately because its phase is still inverted
  /// from make_producer_start_state.
  CUTLASS_DEVICE void
  load_tail(
      MainloopPipeline pipeline,
      PipelineState smem_pipe_write) {
    int const is_elected = cute::elect_one_sync();
    if (!is_elected) {
      return;
    }
    pipeline.producer_tail(smem_pipe_write);
  }
  /// Perform a collective-scoped matrix multiply-accumulate
  /// Consumer Perspective
  ///
  /// For every k-tile this waits on the corresponding pipeline stage, loads that
  /// stage's block scales from shared memory (a tensor of A scales and a single
  /// B scale per stage), issues the GMMAs for all K blocks of the stage and then
  /// passes the combined `scale_A * scale_B` register tensor to the
  /// `GmmaFP8AccumulationWithScale` helper.
  ///
  /// The first `min(K_PIPE_MMAS, k_tile_count)` k-tiles are handled by a
  /// prologue that never calls `consumer_release`. The remaining k-tiles release
  /// one stage per iteration through `smem_pipe_release`, which starts at the
  /// initial read state and therefore lags `smem_pipe_read` by the prologue
  /// count. Stages still held when this function returns are not released here
  /// (see `mma_tail`).
  ///
  /// \param pipeline        Mainloop pipeline used for consumer wait/release.
  /// \param smem_pipe_read  Pipeline state of the first stage to consume (by value).
  /// \param accum           Accumulator fragment; must be register resident.
  /// \param k_tile_count    Number of k-tiles to consume.
  /// \param thread_idx      Thread index used to pick the warp group and the C slice.
  /// \param shared_tensors  Shared storage providing smem_A, smem_B, smem_scale_A
  ///                        and smem_scale_B.
  /// \param mainloop_params Not referenced in this function body.
  template <
    class FrgTensorC
  >
  CUTLASS_DEVICE void
  mma(MainloopPipeline pipeline,
      PipelineState smem_pipe_read,
      FrgTensorC& accum,
      int k_tile_count,
      int thread_idx,
      TensorStorage& shared_tensors,
      Params const& mainloop_params) {
    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
    static_assert(cute::is_void_v<SmemCopyAtomA>,
      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
    static_assert(cute::is_void_v<SmemCopyAtomB>,
      "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
    // Block scaling
    // The zero strides below make every A scale repeat over its ScaleGranularityM
    // rows and over the whole N mode, so the scale storage can be viewed (and
    // partitioned) with the shape of the accumulator tile C.
    Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()),
      Layout<
        Shape<Shape<Int<ScaleGranularityM>, Int<ScaleMsPerTile>>, cute::tuple_element_t<1, TileShape>, Int<DispatchPolicy::Stages>>,
        Stride<Stride<_0, _1>, _0, Int<ScaleMsPerTile>>
      >{}); // ((ScaleGranularityM,ScaleMsPerTile),n,k)
    Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
    //
    // Define C accumulators and A/B partitioning
    //
    // Layout of warp group to thread mapping
    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
                  stride<0>(typename TiledMma::BLayout{}) == 0 and
                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
                                                  Int<NumThreadsPerWarpGroup>{});
    // Every lane takes lane 0's value, so the warp group index is identical
    // across the warp.
    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
    TiledMma tiled_mma;
    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
    Tensor tCsScaleAViewAsC = tiled_mma.get_slice(thread_idx).partition_C(sScaleAViewAsC);    // (MMA,MMA_M,MMA_N,PIPE), `thread_mma` above is correct when partitioning A and B, but it is not correct when partitioning C.
    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
    // Allocate "fragments/descriptors"
    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
    //
    // PIPELINED MAIN LOOP
    //
    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
        "ERROR : Incorrect number of MMAs in flight");
    // We release buffers to producer warps(dma load) with some mmas in flight
    PipelineState smem_pipe_release = smem_pipe_read;
    // Per block scale values for operand A and B
    using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout.
    using RegLayoutScaleAEssential = decltype(filter_zeros(RegLayoutScaleAViewAsC{}.stride(), RegLayoutScaleAViewAsC{}.shape())); // an interface to traverse the underlying storage for the compact layout mentioned above
    Tensor tCrScaleAViewAsC = make_tensor<ElementBlockScale>(RegLayoutScaleAViewAsC{});              // (MMA,MMA_M,MMA_N)
    ElementBlockScale scale_b;
    // Prologue GMMAs
    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
    // NOTE(review): the second and third constructor arguments look like the
    // number of MMA K-atoms per tile and the number of k-blocks per stage; their
    // exact meaning is defined by GmmaFP8AccumulationWithScale - confirm there.
    GmmaFP8AccumulationWithScale accumulation(accum, size<2>(TileShape{}) / size<2>(typename TiledMma::AtomShape_MNK{}), size<2>(tCrA));
    warpgroup_fence_operand(accumulation());
    // The prologue only waits, computes and advances `smem_pipe_read`; it does
    // not wait on the GMMAs it issues and does not release any stage.
    CUTLASS_PRAGMA_UNROLL
    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
    {
      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
      pipeline.consumer_wait(smem_pipe_read, barrier_token);
      // When the helper reports that preparation happened, the next GMMA must
      // overwrite rather than accumulate into the fragment it hands out.
      if (accumulation.prepare_if_needed()) {
        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
      }
      int read_stage = smem_pipe_read.index();
      // Load per block scale values from shared memory to registers.
      scale_b = sScaleB[read_stage];
      // Copy only the distinct A scale values of this stage into registers.
      CUTLASS_PRAGMA_UNROLL
      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
      }
      // Fold the B scale into the A scales so a single register tensor carries
      // the combined per-block scale.
      if constexpr (ScaleMsPerTile == 1) {
        static_assert(size(RegLayoutScaleAEssential{}) == 1);
        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
      } else {
        CUTLASS_PRAGMA_UNROLL
        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
        }
      }
      warpgroup_arrive();
      // Unroll the K mode manually to set scale D to 1
      CUTLASS_PRAGMA_UNROLL
      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
        // (V,M,K) x (V,N,K) => (V,M,N)
        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
      }
      warpgroup_commit_batch();
      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
      accumulation.scale_if_needed(tCrScaleAViewAsC);
      ++smem_pipe_read;
    }
    warpgroup_fence_operand(accumulation());
    // Mainloop GMMAs
    k_tile_count -= prologue_mma_count;
    CUTLASS_PRAGMA_NO_UNROLL
    for ( ; k_tile_count > 0; --k_tile_count)
    {
      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
      pipeline.consumer_wait(smem_pipe_read, barrier_token);
      //
      // Compute on k_tile
      //
      int read_stage = smem_pipe_read.index();
      // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) 
      scale_b = sScaleB[read_stage];
      CUTLASS_PRAGMA_UNROLL
      for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
        tCrScaleAViewAsC.data()[i] = tCsScaleAViewAsC(_, _, _, read_stage)(idx2crd(i, RegLayoutScaleAEssential{}));
      }
      // Same scale combination as in the prologue.
      if constexpr (ScaleMsPerTile == 1) {
        static_assert(size(RegLayoutScaleAEssential{}) == 1);
        tCrScaleAViewAsC.data()[0] = __shfl_sync(0xffffffff, tCrScaleAViewAsC.data()[0] * scale_b, 0); // `tCrScaleAViewAsC.data()[0]` are all same in a warp group when `ScaleMsPerTile == 1`.
      } else {
        CUTLASS_PRAGMA_UNROLL
        for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
          tCrScaleAViewAsC.data()[i] = tCrScaleAViewAsC.data()[i] * scale_b;
        }
      }
      if (accumulation.prepare_if_needed()) {
        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
      }
      warpgroup_fence_operand(accumulation());
      warpgroup_arrive();
      // Unroll the K mode manually to set scale D to 1
      CUTLASS_PRAGMA_UNROLL
      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
        // (V,M,K) x (V,N,K) => (V,M,N)
        cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation());
        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
      }
      warpgroup_commit_batch();
      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed
      warpgroup_wait<K_PIPE_MMAS>();
      warpgroup_fence_operand(accumulation());
      // Block scale the accumulators with reg tensor `tCrScaleAViewAsC`
      accumulation.scale_if_needed(tCrScaleAViewAsC);
      // `smem_pipe_release` trails `smem_pipe_read` by the prologue count, so this
      // frees the oldest stage that has not been released yet.
      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
      // Advance smem_pipe_read and smem_pipe_release
      ++smem_pipe_read;
      ++smem_pipe_release;
    }
    // NOTE(review): presumably applies the scale to whatever the helper has
    // accumulated but not yet scaled; the behavior is defined by
    // GmmaFP8AccumulationWithScale - confirm there.
    accumulation.scale_residue_if_needed(tCrScaleAViewAsC);
    warpgroup_fence_operand(accumulation());
  }
  /// Consumer epilogue: releases the pipeline stages that are still held.
  ///
  /// `min(K_PIPE_MMAS, k_tile_count)` stages are released here. The release
  /// state is first advanced past the other `k_tile_count` entries, and all
  /// outstanding GMMAs are waited on before any stage is handed back.
  CUTLASS_DEVICE void
  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
    // Stages that still have to be released by this function
    int const pending_release_count = min(K_PIPE_MMAS, k_tile_count);

    // Skip over the stages that are not released here
    smem_pipe_release.advance(k_tile_count - pending_release_count);

    // Every GMMA must have completed before its operands' buffers are unlocked
    warpgroup_wait<0>();

    for (int remaining = pending_release_count; remaining > 0; --remaining) {
      // UNLOCK smem_pipe_release, done _computing_ on it
      pipeline.consumer_release(smem_pipe_release);
      ++smem_pipe_release;
    }
  }
 };
 /////////////////////////////////////////////////////////////////////////////////////////////////
 } // namespace cutlass::gemm::collective
 /////////////////////////////////////////////////////////////////////////////////////////////////
--- a/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
+++ b/csrc/cutlass_extensions/gemm/dispatch_policy.hpp
@ -0,0 +1,39 @@
 #pragma once
 #include "cutlass/gemm/dispatch_policy.hpp"
 namespace cutlass::gemm {
 //////////////////////////////////////////////////////////////////////////////
 // FP8 related policies (including Blocked Scaled Accumulation)
 //  `ScaleGranularityM` specifies scaling granularity along M, while zero-value
 //  `ScaleGranularityM` indicates that scaling granularity is
 //  `size<0>(TileShape_MNK{})` along M.
 //
 // Kernel schedule tag: an empty type derived from
 // `KernelTmaWarpSpecializedCooperative` that carries `ScaleGranularityM` as a
 // template parameter. It adds no members of its own.
 template <int ScaleGranularityM = 0>
 struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum
    : KernelTmaWarpSpecializedCooperative {};
 // n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, warp
 // specialized dynamic schedule, for FP8 kernels with block scaling.
 //
 // Mainloop dispatch policy derived from `MainloopSm90TmaGmmaWarpSpecialized`
 // with the same `Stages_`, `ClusterShape_` and `KernelSchedule`.
 //
 // NOTE(review): the static_assert below accepts exactly one schedule,
 // `KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
 // ScaleGranularityM>`, so its message ("one of the warp specialized
 // policies") is broader than the check, and the default `KernelSchedule`
 // (`KernelTmaWarpSpecialized`) can never satisfy it - callers have to pass
 // the schedule explicitly.
 template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>,
          class KernelSchedule = KernelTmaWarpSpecialized,
          int ScaleGranularityM =
              0  // `ScaleGranularityM` specifies scaling granularity along M,
                 // while zero-value `ScaleGranularityM` indicates that scaling
                 // granularity is `size<0>(TileShape_MNK{})` along M.
          >
 struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_,
                                         KernelSchedule> {
  // The schedule must be the block-scaled cooperative tag instantiated with the
  // same `ScaleGranularityM` as this policy.
  static_assert(
      cute::is_same_v<
          KernelSchedule,
          KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<
              ScaleGranularityM>>,
      "KernelSchedule must be one of the warp specialized policies");
 };
 //////////////////////////////////////////////////////////////////////////////
 }  // namespace cutlass::gemm
--- a/csrc/cutlass_extensions/vllm_collective_builder.cuh
+++ b/csrc/cutlass_extensions/vllm_collective_builder.cuh
@ -1,6 +1,6 @@
 #pragma once
-#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
 namespace cutlass::gemm::collective {
 using namespace cute;
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@ -19,13 +19,6 @@
 // Wraps AT_DISPATCH_SWITCH on TYPE, using the case list produced by
 // VLLM_DISPATCH_CASE_FLOATING_TYPES for the callable passed as the variadic
 // argument. NAME is forwarded to AT_DISPATCH_SWITCH unchanged.
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
 // Case list covering the two 16-bit floating types: at::ScalarType::Half and
 // at::ScalarType::BFloat16. Intended to be used inside AT_DISPATCH_SWITCH.
 #define VLLM_DISPATCH_CASE_HALF_TYPES(...)            \
  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
 // Wraps AT_DISPATCH_SWITCH on TYPE, restricted to the cases listed by
 // VLLM_DISPATCH_CASE_HALF_TYPES (Half and BFloat16).
 #define VLLM_DISPATCH_HALF_TYPES(TYPE, NAME, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_HALF_TYPES(__VA_ARGS__))
 // ROCm devices might use either fn or fnuz, so set up dispatch table for both.
 // A host-based check at runtime will create a preferred FP8 type for ROCm
 // such that the correct kernel is dispatched.
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@ -140,211 +140,6 @@ fused_add_rms_norm_kernel(
  }
 }
 /* Packed FP16/BF16 vector with the extra operations needed by polynomial
    normalization (poly norm).
    _f16VecPN derives from _f16Vec and reuses its storage and converter; it
    only adds the per-vector sums of even powers and the in-place polynomial
    evaluation, neither of which _f16Vec provides. Elements are processed two
    at a time through the packed T2 type and computed in float. */
 template <typename scalar_t, int width>
 struct alignas(16) _f16VecPN : _f16Vec<scalar_t, width> {
  using Base = _f16Vec<scalar_t, width>;
  using Converter = typename Base::Converter;
  using T1 = typename Base::T1;
  using T2 = typename Base::T2;
  using Base::data;

  /* Returns (sum of x^2, sum of x^4, sum of x^6) over all `width` elements. */
  __device__ auto sum_pows() const {
    float sum_p2 = 0.0f;
    float sum_p4 = 0.0f;
    float sum_p6 = 0.0f;
 #pragma unroll
    for (int lane = 0; lane < width; lane += 2) {
      // Unpack one pair of 16-bit values to float.
      float2 pair = Converter::convert(T2{data[lane], data[lane + 1]});

      float lo_p2 = pair.x * pair.x;
      float lo_p4 = lo_p2 * lo_p2;
      float lo_p6 = lo_p4 * lo_p2;

      float hi_p2 = pair.y * pair.y;
      float hi_p4 = hi_p2 * hi_p2;
      float hi_p6 = hi_p4 * hi_p2;

      sum_p2 += lo_p2 + hi_p2;
      sum_p4 += lo_p4 + hi_p4;
      sum_p6 += lo_p6 + hi_p6;
    }
    return std::make_tuple(sum_p2, sum_p4, sum_p6);
  }

  /* Replaces every element x with
       w2_inv_std * x + w1_inv_std2 * x^2 + w0_inv_std3 * x^3 + bias. */
  __device__ void poly_norm_inplace(const float w2_inv_std,
                                    const float w1_inv_std2,
                                    const float w0_inv_std3, const float bias) {
 #pragma unroll
    for (int lane = 0; lane < width; lane += 2) {
      float2 pair = Converter::convert(T2{data[lane], data[lane + 1]});

      float lo_p2 = pair.x * pair.x;
      float lo_p3 = lo_p2 * pair.x;
      pair.x =
          w2_inv_std * pair.x + w1_inv_std2 * lo_p2 + w0_inv_std3 * lo_p3 + bias;

      float hi_p2 = pair.y * pair.y;
      float hi_p3 = hi_p2 * pair.y;
      pair.y =
          w2_inv_std * pair.y + w1_inv_std2 * hi_p2 + w0_inv_std3 * hi_p3 + bias;

      // Pack the results back to the 16-bit storage type.
      auto packed = Converter::convert(pair);
      data[lane] = packed.x;
      data[lane + 1] = packed.y;
    }
  }
 };
 /* Vectorized poly_norm kernel, selected when width > 0 and scalar_t has a
    _typeConvert specialization.
    Each block processes the row selected by blockIdx.x, reading and writing it
    as packed _f16VecPN<scalar_t, width> vectors. The row's sums of x^2, x^4
    and x^6 are block-reduced, thread 0 turns them into three scaled
    inverse-RMS coefficients, and every element is rewritten as
      w2 * x * rsqrt(mean(x^2) + eps) + w1 * x^2 * rsqrt(mean(x^4) + eps)
        + w0 * x^3 * rsqrt(mean(x^6) + eps) + bias
    with w0..w2 = weight[0..2] and bias = bias[0].
    Only hidden_size / width full vectors per row are processed.
    Vector indices are computed in 64 bits so that
    blockIdx.x * vec_hidden_size cannot wrap for very large inputs. */
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
 poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
                  const scalar_t* __restrict__ input,   // [..., hidden_size]
                  const scalar_t* __restrict__ weight,  // [3]
                  const scalar_t* __restrict__ bias,    // [1]
                  const float epsilon, const int hidden_size) {
  // Sanity checks on our vector struct and type-punned pointer arithmetic
  static_assert(std::is_pod_v<_f16VecPN<scalar_t, width>>);
  static_assert(sizeof(_f16VecPN<scalar_t, width>) == sizeof(scalar_t) * width);
  /* These and the argument pointers are all declared `restrict` as they are
     not aliased in practice. Argument pointers should not be dereferenced
     in this kernel as that would be undefined behavior */
  auto* __restrict__ input_v =
      reinterpret_cast<const _f16VecPN<scalar_t, width>*>(input);
  const int vec_hidden_size = hidden_size / width;
  // First vector of this block's row, in 64-bit arithmetic.
  const int64_t row_base = static_cast<int64_t>(blockIdx.x) * vec_hidden_size;
  float variance = 0.0f;
  float variance2 = 0.0f;
  float variance3 = 0.0f;
  // Pass 1: per-thread partial sums of x^2, x^4 and x^6 over the row.
  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
    const int64_t id = row_base + idx;
    _f16VecPN<scalar_t, width> temp = input_v[id];
    auto [x2, x4, x6] = temp.sum_pows();
    variance += x2;
    variance2 += x4;
    variance3 += x6;
  }
  float3 thread_variances = make_float3(variance, variance2, variance3);
  // Component-wise addition so the three sums share one block reduction.
  struct SumOp {
    __device__ float3 operator()(const float3& a, const float3& b) const {
      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
    }
  };
  using BlockReduce = cub::BlockReduce<float3, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
  // Only the first blockDim.x inputs are valid. The reduced value is consumed
  // by thread 0 only (below).
  float3 block_variances =
      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
  variance = block_variances.x;
  variance2 = block_variances.y;
  variance3 = block_variances.z;
  __shared__ float s_w2_inv_std;
  __shared__ float s_w1_inv_std2;
  __shared__ float s_w0_inv_std3;
  __shared__ float s_bias;
  if (threadIdx.x == 0) {
    float w0 = (float)weight[0];
    float w1 = (float)weight[1];
    float w2 = (float)weight[2];
    s_bias = (float)bias[0];
    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
  }
  // Publish the coefficients written by thread 0 to the whole block.
  __syncthreads();
  auto* __restrict__ out_v = reinterpret_cast<_f16VecPN<scalar_t, width>*>(out);
  // Pass 2: re-read the row, apply the polynomial and store the result.
  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
    const int64_t id = row_base + idx;
    _f16VecPN<scalar_t, width> temp = input_v[id];
    temp.poly_norm_inplace(s_w2_inv_std, s_w1_inv_std2, s_w0_inv_std3, s_bias);
    out_v[id] = temp;
  }
 }
 /* Generic poly_norm_kernel
    The width field is not used here but necessary for other specializations.
    Selected when width == 0 or scalar_t has no _typeConvert specialization.
    Each block processes the row selected by blockIdx.x one scalar at a time:
    it block-reduces the row's sums of x^2, x^4 and x^6, lets thread 0 derive
    the three scaled inverse-RMS coefficients, and then writes
      w2 * x * rsqrt(mean(x^2) + eps) + w1 * x^2 * rsqrt(mean(x^4) + eps)
        + w0 * x^3 * rsqrt(mean(x^6) + eps) + bias
    for every element, with w0..w2 = weight[0..2] and bias = bias[0].
    Element offsets are computed in 64 bits so that blockIdx.x * hidden_size
    cannot wrap for very large inputs.
 */
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
 poly_norm_kernel(scalar_t* __restrict__ out,           // [..., hidden_size]
                  const scalar_t* __restrict__ input,   // [..., hidden_size]
                  const scalar_t* __restrict__ weight,  // [3]
                  const scalar_t* __restrict__ bias,    // [1]
                  const float epsilon, const int hidden_size) {
  // First element of this block's row, in 64-bit arithmetic.
  const int64_t row_offset = static_cast<int64_t>(blockIdx.x) * hidden_size;
  float variance = 0.0f;
  float variance2 = 0.0f;
  float variance3 = 0.0f;
  // Pass 1: per-thread partial sums of x^2, x^4 and x^6 over the row.
  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
    float x = (float)input[row_offset + idx];
    float x2 = x * x;
    float x4 = x2 * x2;
    float x6 = x4 * x2;
    variance += x2;
    variance2 += x4;
    variance3 += x6;
  }
  float3 thread_variances = make_float3(variance, variance2, variance3);
  // Component-wise addition so the three sums share one block reduction.
  struct SumOp {
    __device__ float3 operator()(const float3& a, const float3& b) const {
      return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
    }
  };
  using BlockReduce = cub::BlockReduce<float3, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;
  // Only the first blockDim.x inputs are valid. The reduced value is consumed
  // by thread 0 only (below).
  float3 block_variances =
      BlockReduce(reduceStore).Reduce(thread_variances, SumOp{}, blockDim.x);
  variance = block_variances.x;
  variance2 = block_variances.y;
  variance3 = block_variances.z;
  __shared__ float s_w2_inv_std;
  __shared__ float s_w1_inv_std2;
  __shared__ float s_w0_inv_std3;
  __shared__ float s_bias;
  if (threadIdx.x == 0) {
    float w0 = (float)weight[0];
    float w1 = (float)weight[1];
    float w2 = (float)weight[2];
    s_bias = (float)bias[0];
    s_w2_inv_std = w2 * rsqrtf(variance / hidden_size + epsilon);
    s_w1_inv_std2 = w1 * rsqrtf(variance2 / hidden_size + epsilon);
    s_w0_inv_std3 = w0 * rsqrtf(variance3 / hidden_size + epsilon);
  }
  // Publish the coefficients written by thread 0 to the whole block.
  __syncthreads();
  // Pass 2: re-read the row, apply the polynomial and store the result.
  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
    float x = (float)input[row_offset + idx];
    float x2 = x * x;
    float x3 = x2 * x;
    out[row_offset + idx] =
        (scalar_t)(x * s_w2_inv_std + x2 * s_w1_inv_std2 + x3 * s_w0_inv_std3 +
                   s_bias);
  }
 }
 }  // namespace vllm
 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
@ -424,49 +219,3 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
    LAUNCH_FUSED_ADD_RMS_NORM(0);
  }
 }
 // Launches vllm::poly_norm_kernel<scalar_t, width> for the scalar type of
 // `input`. The macro is not hygienic: it expects `out`, `input`, `weight`,
 // `bias`, `epsilon`, `hidden_size`, `grid`, `block` and `stream` to be in
 // scope at the expansion site.
 #define LAUNCH_FUSED_POLY_NORM(width)                                         \
  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "poly_norm_kernel", [&] { \
    vllm::poly_norm_kernel<scalar_t, width><<<grid, block, 0, stream>>>(      \
        out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),                 \
        weight.data_ptr<scalar_t>(), bias.data_ptr<scalar_t>(), epsilon,      \
        hidden_size);                                                         \
  });
 /* Host entry point for polynomial normalization.
    Launches one block per token (row of `input`) and picks the vectorized
    width-8 kernel when both data pointers are 16-byte aligned and hidden_size
    is a multiple of 8; otherwise the generic kernel is used.
    `out` and `input` must be contiguous and must not share a data pointer. */
 void poly_norm(torch::Tensor& out,     // [..., hidden_size]
                torch::Tensor& input,   // [..., hidden_size]
                torch::Tensor& weight,  // [3]
                torch::Tensor& bias,    // [1]
                double epsilon) {
  TORCH_CHECK(out.is_contiguous());
  TORCH_CHECK(input.is_contiguous());
  TORCH_CHECK(out.data_ptr() != input.data_ptr());

  // NOTE: `grid`, `block`, `stream` and `hidden_size` are picked up by name
  // inside LAUNCH_FUSED_POLY_NORM.
  int hidden_size = input.size(-1);
  int token_count = input.numel() / hidden_size;
  dim3 grid(token_count);

  /* This kernel is memory-latency bound in many scenarios.
     With many tokens, smaller blocks give better block occupancy on the CUs
     and hide global memory latency better. */
  const int block_size_cap = (token_count >= 256) ? 256 : 1024;
  dim3 block(std::min(hidden_size, block_size_cap));

  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  /* The packed + vectorized kernel works on width-8 vectors of FP16/BF16,
     i.e. 128 bits, which is the most a single global memory op can load.
     That requires each tensor's data to be aligned to 16 bytes. */
  const auto is_16_byte_aligned = [](const void* ptr) {
    return reinterpret_cast<std::uintptr_t>(ptr) % 16 == 0;
  };
  const bool can_vectorize = is_16_byte_aligned(input.data_ptr()) &&
                             is_16_byte_aligned(out.data_ptr()) &&
                             hidden_size % 8 == 0;

  if (!can_vectorize) {
    LAUNCH_FUSED_POLY_NORM(0);
  } else {
    LAUNCH_FUSED_POLY_NORM(8);
  }
 }
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@ -27,12 +27,11 @@
 template<int kNThreads_, int kNItems_, int kNRows_, bool kIsEvenLen_,
         bool kIsVariableB_, bool kIsVariableC_,
-         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_, typename state_t_>
+         bool kHasZ_, bool kVarlen_, typename input_t_, typename weight_t_>
 struct Selective_Scan_fwd_kernel_traits {
    static_assert(kNItems_ % 4 == 0);
    using input_t = input_t_;
    using weight_t = weight_t_;
    using state_t = state_t_;
    static constexpr int kNThreads = kNThreads_;
    // Setting MinBlocksPerMP to be 3 (instead of 2) for 128 threads improves occupancy.
    static constexpr int kMinBlocks = kNThreads < 128 ? 5 : 3;
@ -133,7 +132,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
    weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
    input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) + 
+    input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + 
    cache_index * params.ssm_states_batch_stride + 
    dim_id * kNRows * params.ssm_states_dim_stride;
@ -262,7 +261,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                if (threadIdx.x == 0) {
                    smem_running_prefix[state_idx] = prefix_op.running_prefix;
                    if (chunk == n_chunks - 1) {
-                        ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
+                        ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y);
                    }
                }
                #pragma unroll
@ -311,7 +310,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
    }
 }
-template<int kNThreads, int kNItems, typename input_t, typename weight_t, typename state_t>
+template<int kNThreads, int kNItems, typename input_t, typename weight_t>
 void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    // Only kNRows == 1 is tested for now, which ofc doesn't differ from previously when we had each block
    // processing 1 row.
@ -322,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    BOOL_SWITCH(params.seqlen % (kNThreads * kNItems) == 0, kIsEvenLen, [&] {
        BOOL_SWITCH(params.z_ptr != nullptr , kHasZ, [&] {
            BOOL_SWITCH(params.query_start_loc_ptr != nullptr , kVarlen, [&] {
-                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t, state_t>;
+                using Ktraits = Selective_Scan_fwd_kernel_traits<kNThreads, kNItems, kNRows, kIsEvenLen, kIsVariableB, kIsVariableC, kHasZ,  kVarlen, input_t, weight_t>;
                constexpr int kSmemSize = Ktraits::kSmemSize + kNRows * MAX_DSTATE * sizeof(typename Ktraits::scan_t);
                dim3 grid(params.batch, params.dim / kNRows);
                auto kernel = &selective_scan_fwd_kernel<Ktraits>;
@ -342,78 +341,59 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
    });
 }
-template<typename input_t, typename weight_t, typename state_t>
+template<typename input_t, typename weight_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
    #ifndef USE_ROCM
        if (params.seqlen <= 128) {           
-            selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<32, 4, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<32, 8, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<32, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<32, 16, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
        } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
        }
    #else
        if (params.seqlen <= 256) {
-            selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<64, 4, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 512) {
-            selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<64, 8, input_t, weight_t>(params, stream);
        } else if (params.seqlen <= 1024) {
-            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<64, 16, input_t, weight_t>(params, stream);
        } else {
-            selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
+            selective_scan_fwd_launch<128, 16, input_t, weight_t>(params, stream);
        }
    #endif
 }
-template void selective_scan_fwd_cuda<at::BFloat16, float, at::BFloat16>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::BFloat16, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<at::BFloat16, float, float>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<at::Half, float>(SSMParamsBase &params, cudaStream_t stream);
-template void selective_scan_fwd_cuda<at::Half, float, at::Half>(SSMParamsBase &params, cudaStream_t stream);
+template void selective_scan_fwd_cuda<float, float>(SSMParamsBase &params, cudaStream_t stream);
 template void selective_scan_fwd_cuda<at::Half, float, float>(SSMParamsBase &params, cudaStream_t stream);
 template void selective_scan_fwd_cuda<float, float, float>(SSMParamsBase &params, cudaStream_t stream);
 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
-#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, STYPE, NAME, ...)       \
+#define DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(ITYPE, NAME, ...)              \
    if (ITYPE == at::ScalarType::Half) {                                            \
        using input_t = at::Half;                                                   \
        using weight_t = float;                                                     \
-        if (STYPE == at::ScalarType::Half) {                                        \
+        __VA_ARGS__();                                                              \
            using state_t = at::Half;                                               \
            __VA_ARGS__();                                                          \
        } else if (STYPE == at::ScalarType::Float) {                                \
            using state_t = float;                                                  \
            __VA_ARGS__();                                                          \
        } else {                                                                    \
            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
        }                                                                           \
    } else if (ITYPE == at::ScalarType::BFloat16) {                                 \
        using input_t = at::BFloat16;                                               \
        using weight_t = float;                                                     \
-        if (STYPE == at::ScalarType::BFloat16) {                                    \
+        __VA_ARGS__();                                                              \
            using state_t = at::BFloat16;                                           \
            __VA_ARGS__();                                                          \
        } else if (STYPE == at::ScalarType::Float) {                                \
            using state_t = float;                                                  \
            __VA_ARGS__();                                                          \
        } else {                                                                    \
            AT_ERROR(#NAME, " not implemented for state type '", toString(STYPE), "'"); \
        }                                                                           \
    } else if (ITYPE == at::ScalarType::Float)  {                                   \
        using input_t = float;                                                      \
        using weight_t = float;                                                     \
        using state_t = float;                                                      \
        __VA_ARGS__();                                                              \
    } else {                                                                        \
        AT_ERROR(#NAME, " not implemented for input type '", toString(ITYPE), "'"); \
    }
-template<typename input_t, typename weight_t, typename state_t>
+template<typename input_t, typename weight_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream);
 void set_ssm_params_fwd(SSMParamsBase &params,
@ -668,9 +648,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
    // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout
    at::Tensor out = delta;
-    // ssm_states can now be either the same as input_type or float32
+    TORCH_CHECK(ssm_states.scalar_type() == input_type);
    auto state_type = ssm_states.scalar_type();
    TORCH_CHECK(state_type == input_type || state_type == at::ScalarType::Float);
    TORCH_CHECK(ssm_states.is_cuda());
    TORCH_CHECK(ssm_states.stride(-1) == 1);
@ -692,7 +670,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
    const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
    auto stream = at::cuda::getCurrentCUDAStream().stream();
-    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), ssm_states.scalar_type(), "selective_scan_fwd", [&] {
+    DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
-        selective_scan_fwd_cuda<input_t, weight_t, state_t>(params, stream);
+        selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
    });
 }
--- a/csrc/moe/grouped_topk_kernels.cu
+++ b/csrc/moe/grouped_topk_kernels.cu
@ -1,758 +0,0 @@
 /*
 * Adapted from
 * https://github.com/NVIDIA/TensorRT-LLM/blob/v0.21.0/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu
 * Copyright (c) 2025, The vLLM team.
 * SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
 #include <torch/all.h>
 #include <cuda_fp16.h>
 #include <cuda_bf16.h>
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 namespace cg = cooperative_groups;
 namespace vllm {
 namespace moe {
 // Negative infinity as a float; used below as the "no candidate" score.
 constexpr float kNegInfinity = INFINITY * -1;
 // Member mask handed to the *_sync warp intrinsics: all 32 lanes participate.
 constexpr unsigned FULL_WARP_MASK = 0xffffffff;
 // Threads per warp assumed by every sort/select helper and kernel below.
 constexpr int32_t WARP_SIZE = 32;
 // Threads per block used for both kernel launches in invokeNoAuxTc.
 constexpr int32_t BLOCK_SIZE = 512;
 // Warps per block; the kernels below assign one "case" to each warp.
 constexpr int32_t NUM_WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;
 namespace warp_topk {
 // Rounds `len` up to the nearest multiple of the compile-time constant `size`.
 // Zero is returned unchanged; handling it first also keeps `len - 1` from
 // wrapping around when T is unsigned.
 template <int size, typename T>
 __host__ __device__ constexpr T round_up_to_multiple_of(T len) {
  return (len == 0) ? static_cast<T>(0)
                    : static_cast<T>(((len - 1) / size + 1) * size);
 }
 // True when `v` is non-zero and has no bit in common with `v - 1`, i.e. `v`
 // has exactly one bit set.
 template <typename T>
 constexpr __host__ __device__ bool isPowerOf2(T v) {
  return v != 0 && (v & (v - 1)) == 0;
 }
 // Strict comparison whose direction is chosen at compile time:
 // `greater == true` means "val is larger", otherwise "val is smaller".
 // Equal values are never "better".
 template <bool greater, typename T>
 __forceinline__ __device__ bool is_better_than(T val, T baseline) {
  return greater ? (val > baseline) : (val < baseline);
 }
 // Index-aware variant of is_better_than: values are compared as in the
 // two-argument overload, and an exact tie is won by the smaller index
 // (for either setting of `greater`).
 template <bool greater, typename T, typename idxT>
 __forceinline__ __device__ bool is_better_than(T val, T baseline, idxT index,
                                               idxT baseline_index) {
  if (val == baseline) {
    return index < baseline_index;
  }
  return greater ? (val > baseline) : (val < baseline);
 }
 // Returns the number of dynamic shared-memory bytes to request for a block of
 // `num_of_warp` warps selecting `k` elements each. The result is the larger of
 //   * num_of_warp * k (value, index) pairs, and
 //   * n values padded to a 256-byte boundary followed by n indices, where
 //     n = max(num_of_warp / 2 * k, num_of_warp * WARP_SIZE).
 // All intermediate arithmetic is done in int64_t so that the int64 `k` is not
 // narrowed before the comparison, and std::max is used explicitly instead of
 // relying on an unqualified mixed signed/unsigned `max` overload.
 template <typename T, typename idxT>
 int calc_smem_size_for_block_wide(int num_of_warp, int64_t k) {
  int64_t const value_bytes = static_cast<int64_t>(sizeof(T));
  int64_t const index_bytes = static_cast<int64_t>(sizeof(idxT));
  int64_t const cache_topk = (value_bytes + index_bytes) * num_of_warp * k;
  int64_t const n = std::max<int64_t>(
      num_of_warp / 2 * k, static_cast<int64_t>(num_of_warp) * WARP_SIZE);
  int64_t const staging_bytes =
      round_up_to_multiple_of<256>(n * value_bytes) + n * index_bytes;
  return static_cast<int>(std::max<int64_t>(cache_topk, staging_bytes));
 }
 // Generic (size >= 2 * WARP_SIZE) step of a warp-wide bitonic merge.
 // Each thread passes its own slice of the sequence: size / WARP_SIZE values
 // in `val_arr` with their indices in `idx_arr`. This level only exchanges
 // elements within a thread's own slice, then recurses on the two halves; the
 // size == 32 specialization is where lanes exchange data with each other.
 // `reverse` is not used at this level, only forwarded to the recursion.
 template <int size, bool ascending, bool reverse, typename T, typename idxT,
          bool is_stable>
 struct BitonicMerge {
  // The input is expected to be a bitonic sequence; on return it is monotonic.
  __device__ static void merge(T* __restrict__ val_arr,
                               idxT* __restrict__ idx_arr) {
    static_assert(isPowerOf2(size));
    static_assert(size >= 2 * WARP_SIZE);
    constexpr int arr_len = size / WARP_SIZE;  // elements held by this thread
    constexpr int stride = arr_len / 2;
    // Compare element i with its partner in the upper half of the slice.
    for (int i = 0; i < stride; ++i) {
      int const other_i = i + stride;
      T& val = val_arr[i];
      T& other_val = val_arr[other_i];
      bool is_better;
      if constexpr (is_stable) {
        // Ties on value are broken by index (see is_better_than).
        is_better = is_better_than<ascending>(val, other_val, idx_arr[i],
                                              idx_arr[other_i]);
      } else {
        is_better = is_better_than<ascending>(val, other_val);
      }
      // Swap so that the "better" element ends up in the upper half.
      if (is_better) {
        T tmp = val;
        val = other_val;
        other_val = tmp;
        idxT tmp2 = idx_arr[i];
        idx_arr[i] = idx_arr[other_i];
        idx_arr[other_i] = tmp2;
      }
    }
    // Recurse on the lower and upper halves of the per-thread slice.
    BitonicMerge<size / 2, ascending, reverse, T, idxT, is_stable>::merge(
        val_arr, idx_arr);
    BitonicMerge<size / 2, ascending, reverse, T, idxT, is_stable>::merge(
        val_arr + arr_len / 2, idx_arr + arr_len / 2);
  }
 };
 // Generic (size >= 2 * WARP_SIZE) warp-wide bitonic sort. Each thread passes
 // its size / WARP_SIZE values (`val_arr`) and matching indices (`idx_arr`).
 // The two halves are sorted with opposite `ascending` flags so that together
 // they form a bitonic sequence, which BitonicMerge then orders according to
 // this level's `ascending`.
 template <int size, bool ascending, typename T, typename idxT, bool is_stable>
 struct BitonicSort {
  __device__ static void sort(T* __restrict__ val_arr,
                              idxT* __restrict__ idx_arr) {
    static_assert(isPowerOf2(size));
    static_assert(size >= 2 * WARP_SIZE);
    constexpr int arr_len = size / WARP_SIZE;  // elements held by this thread
    BitonicSort<size / 2, true, T, idxT, is_stable>::sort(val_arr, idx_arr);
    BitonicSort<size / 2, false, T, idxT, is_stable>::sort(
        val_arr + arr_len / 2, idx_arr + arr_len / 2);
    BitonicMerge<size, ascending, ascending, T, idxT, is_stable>::merge(
        val_arr, idx_arr);
  }
 };
 // Base case of the bitonic sort: 32 elements, one per lane of the warp.
 // `val_arr` / `idx_arr` point at the single (value, index) pair owned by the
 // calling thread; lanes exchange pairs with __shfl_xor_sync, so all 32 lanes
 // of the warp must call this together (FULL_WARP_MASK is used).
 template <bool ascending, typename T, typename idxT, bool is_stable>
 struct BitonicSort<32, ascending, T, idxT, is_stable> {
  __device__ static void sort(T* __restrict__ val_arr,
                              idxT* __restrict__ idx_arr) {
    int const lane = threadIdx.x % WARP_SIZE;
    // The direction of these stages does not matter: they only have to leave a
    // bitonic sequence across the warp for the final merge below.
    for (int stage = 0; stage < 4; ++stage) {
      for (int stride = (1 << stage); stride > 0; stride /= 2) {
        // `reverse` flips the compare direction for alternating lane groups of
        // this stage; `is_second` marks the upper lane of each exchanged pair.
        bool reverse = (lane >> stage) & 2;
        bool is_second = lane & stride;
        // Fetch the partner lane's pair.
        T other = __shfl_xor_sync(FULL_WARP_MASK, *val_arr, stride);
        idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, *idx_arr, stride);
        // Despite its name, `is_better` here means "this lane should adopt the
        // partner's pair".
        bool is_better;
        if constexpr (is_stable) {
          // Stable variants break value ties on the index.
          if constexpr (ascending) {
            is_better = ((*val_arr > other) ||
                         ((*val_arr == other) && (*idx_arr < other_idx))) !=
                        (reverse != is_second);
          } else {
            is_better = ((*val_arr > other) ||
                         ((*val_arr == other) && (*idx_arr > other_idx))) !=
                        (reverse != is_second);
          }
        } else {
          // Equal values are never exchanged in the unstable variant.
          is_better = (*val_arr != other &&
                       (*val_arr > other) != (reverse != is_second));
        }
        if (is_better) {
          *val_arr = other;
          *idx_arr = other_idx;
        }
      }
    }
    // Final merge turns the bitonic sequence into the requested order.
    BitonicMerge<32, ascending, ascending, T, idxT, is_stable>::merge(val_arr,
                                                                      idx_arr);
  }
 };
 // Base case of the bitonic merge: 32 elements, one per lane of the warp.
 // `val_arr` / `idx_arr` point at the calling thread's single (value, index)
 // pair. Lanes exchange pairs with __shfl_xor_sync at strides 16, 8, 4, 2, 1,
 // so all 32 lanes must call this together (FULL_WARP_MASK is used).
 template <bool ascending, bool reverse, typename T, typename idxT,
          bool is_stable>
 struct BitonicMerge<32, ascending, reverse, T, idxT, is_stable> {
  __device__ static void merge(T* __restrict__ val_arr,
                               idxT* __restrict__ idx_arr) {
    int const lane = threadIdx.x % WARP_SIZE;
    for (int stride = WARP_SIZE / 2; stride > 0; stride /= 2) {
      // `is_second` marks the upper lane of each exchanged pair.
      bool is_second = lane & stride;
      T& val = *val_arr;
      T other = __shfl_xor_sync(FULL_WARP_MASK, val, stride);
      idxT& idx = *idx_arr;
      idxT other_idx = __shfl_xor_sync(FULL_WARP_MASK, idx, stride);
      // Despite its name, `is_better` here means "this lane should adopt the
      // partner's pair".
      bool is_better;
      if constexpr (is_stable) {
        // Stable variants break value ties on the index; the direction is
        // controlled by `reverse` rather than `ascending`.
        if constexpr (ascending) {
          is_better = ((*val_arr > other) ||
                       ((*val_arr == other) && (*idx_arr < other_idx))) ==
                      (reverse != is_second);  // for min
        } else {
          is_better = ((*val_arr > other) ||
                       ((*val_arr == other) && (*idx_arr > other_idx))) ==
                      (reverse != is_second);  // for max
        }
      } else {
        // Unstable variant: equal values are never exchanged and the direction
        // is controlled by `ascending`.
        is_better =
            (val != other && ((val > other) == (ascending != is_second)));
      }
      if (is_better) {
        val = other;
        idx = other_idx;
      }
    }
  }
 };
 // Per-warp container of up to `capacity` (value, index) pairs spread over the
 // lanes: every thread holds capacity / WARP_SIZE pairs in val_arr_/idx_arr_.
 // Element i of lane l corresponds to logical position i * WARP_SIZE + l (see
 // dump()). `k` is the number of results wanted and `dummy` the filler value
 // used for unused slots. The members call warp-wide merges, so all 32 lanes
 // of a warp are expected to use the object together.
 template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
 class WarpSort {
 public:
  // Fills every slot with (dummy, 0).
  __device__ WarpSort(idxT k, T dummy)
      : lane_(threadIdx.x % WARP_SIZE), k_(k), dummy_(dummy) {
    static_assert(capacity >= WARP_SIZE && isPowerOf2(capacity));
    for (int i = 0; i < max_arr_len_; ++i) {
      val_arr_[i] = dummy_;
      idx_arr_[i] = 0;
    }
  }
  // Load k already-sorted values from in[start, start + k) (with indices from
  // in_idx) and merge them into the resident set.
  __device__ void load_sorted(T const* __restrict__ in,
                              idxT const* __restrict__ in_idx, idxT start) {
    // The input is walked in the opposite order to the resident layout: the
    // last slot of lane 31 is compared with in[start], and so on.
    idxT idx = start + WARP_SIZE - 1 - lane_;
    for (int i = max_arr_len_ - 1; i >= 0; --i, idx += WARP_SIZE) {
      if (idx < start + k_) {
        T t = in[idx];
        bool is_better;
        if constexpr (is_stable) {
          is_better =
              is_better_than<greater>(t, val_arr_[i], in_idx[idx], idx_arr_[i]);
        } else {
          is_better = is_better_than<greater>(t, val_arr_[i]);
        }
        // Keep whichever of the incoming and resident element is better.
        if (is_better) {
          val_arr_[i] = t;
          idx_arr_[i] = in_idx[idx];
        }
      }
    }
    // Re-establish a monotonic order over the whole container.
    BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
        val_arr_, idx_arr_);
  }
  // Write the first k values and indices to out / out_idx.
  __device__ void dump(T* __restrict__ out, idxT* __restrict__ out_idx) const {
    for (int i = 0; i < max_arr_len_; ++i) {
      idxT out_i = i * WARP_SIZE + lane_;
      if (out_i < k_) {
        out[out_i] = val_arr_[i];
        out_idx[out_i] = idx_arr_[i];
      }
    }
  }
  // Same as dump(), but writes only the indices.
  __device__ void dumpIdx(idxT* __restrict__ out_idx) const {
    for (int i = 0; i < max_arr_len_; ++i) {
      idxT out_i = i * WARP_SIZE + lane_;
      if (out_i < k_) {
        out_idx[out_i] = idx_arr_[i];
      }
    }
  }
 protected:
  // Number of pairs stored by each thread.
  static constexpr int max_arr_len_ = capacity / WARP_SIZE;
  T val_arr_[max_arr_len_];
  idxT idx_arr_[max_arr_len_];
  int const lane_;  // lane index of this thread within its warp
  idxT const k_;    // number of results requested
  T const dummy_;   // filler value for unused slots
 };  // end class WarpSort
 // Streaming top-k selector for one warp, built on WarpSort.
 //
 // Candidates are offered with add(). A candidate that beats the current k-th
 // best (k_th_) is staged in a WARP_SIZE-element slice of dynamic shared
 // memory owned by this warp; whenever the slice fills up it is sorted and
 // merged into the register-resident set inherited from WarpSort. done()
 // flushes a partially filled slice. add() and done() use warp collectives
 // with FULL_WARP_MASK, so all 32 lanes of the warp must call them together.
 //
 // Fix relative to the previous revision: k_th_idx_ is now initialised. With
 // is_stable == true, add() passes it to is_better_than() before any merge has
 // assigned it, which used to read an indeterminate value.
 template <int capacity, bool greater, typename T, typename idxT, bool is_stable>
 class WarpSelect : public WarpSort<capacity, greater, T, idxT, is_stable> {
 public:
  // Carves this warp's staging slices out of the block's dynamic shared
  // memory: WARP_SIZE values per warp first, then (starting at a 256-byte
  // aligned offset) WARP_SIZE indices per warp.
  __device__ WarpSelect(idxT k, T dummy)
      : WarpSort<capacity, greater, T, idxT, is_stable>(k, dummy),
        k_th_(dummy),
        k_th_lane_((k - 1) % WARP_SIZE) {
    extern __shared__ char smem_buf[];  // extern __shared__ T smem_buf[];
    int const num_of_warp = blockDim.x / WARP_SIZE;
    int const warp_id = threadIdx.x / WARP_SIZE;
    val_smem_ = reinterpret_cast<T*>(smem_buf);
    val_smem_ += warp_id * WARP_SIZE;
    idx_smem_ = reinterpret_cast<idxT*>(
        smem_buf +
        round_up_to_multiple_of<256>(num_of_warp * sizeof(T) * WARP_SIZE));
    idx_smem_ += warp_id * WARP_SIZE;
  }
  // Offer in[start, end). The range is padded with `dummy_` up to a multiple
  // of WARP_SIZE so that every lane takes part in every add() call.
  __device__ void add(T const* in, idxT start, idxT end) {
    idxT const end_for_fullwarp =
        round_up_to_multiple_of<WARP_SIZE>(end - start) + start;
    for (idxT i = start + lane_; i < end_for_fullwarp; i += WARP_SIZE) {
      T val = (i < end) ? in[i] : dummy_;
      add(val, i);
    }
  }
  // Offer one candidate per lane.
  __device__ void add(T val, idxT idx) {
    // Does this lane's candidate beat the current k-th best?
    bool do_add;
    if constexpr (is_stable) {
      do_add = is_better_than<greater>(val, k_th_, idx, k_th_idx_);
    } else {
      do_add = is_better_than<greater>(val, k_th_);
    }
    uint32_t mask = __ballot_sync(FULL_WARP_MASK, do_add);
    if (mask == 0) {
      return;  // no lane has anything to stage
    }
    // Compacted staging position: current fill level plus the number of
    // lower-numbered lanes that are also adding.
    int pos = smem_buf_len_ + __popc(mask & ((0x1u << lane_) - 1));
    if (do_add && pos < WARP_SIZE) {
      val_smem_[pos] = val;
      idx_smem_[pos] = idx;
      do_add = false;
    }
    smem_buf_len_ += __popc(mask);
    // Slice is full: every lane takes one staged pair and merges it.
    if (smem_buf_len_ >= WARP_SIZE) {
      __syncwarp();
      merge_buf_(val_smem_[lane_], idx_smem_[lane_]);
      smem_buf_len_ -= WARP_SIZE;
    }
    // Candidates that did not fit before the merge go to the front of the
    // now-empty slice.
    if (do_add) {
      pos -= WARP_SIZE;
      val_smem_[pos] = val;
      idx_smem_[pos] = idx;
    }
    __syncwarp();
  }
  // Flush a partially filled staging slice (padding with (dummy_, 0)).
  // Ends with a block-wide barrier, so it has to be reached in a way that is
  // valid for __syncthreads().
  __device__ void done() {
    if (smem_buf_len_) {
      T val = (lane_ < smem_buf_len_) ? val_smem_[lane_] : dummy_;
      idxT idx = (lane_ < smem_buf_len_) ? idx_smem_[lane_] : 0;
      merge_buf_(val, idx);
    }
    // after done(), smem is used for merging results among warps
    __syncthreads();
  }
 private:
  // Broadcast the current k-th best value (and, for stable selection, its
  // index) from the lane that holds it in the last register slot.
  __device__ void set_k_th_() {
    k_th_ = __shfl_sync(FULL_WARP_MASK, val_arr_[max_arr_len_ - 1], k_th_lane_);
    if constexpr (is_stable) {
      k_th_idx_ =
          __shfl_sync(FULL_WARP_MASK, idx_arr_[max_arr_len_ - 1], k_th_lane_);
    }
  }
  // Sort the 32 staged pairs across the warp, let each lane keep the better
  // of its sorted pair and its last register slot, re-merge the register set
  // and refresh the k-th threshold.
  __device__ void merge_buf_(T val, idxT idx) {
    BitonicSort<WARP_SIZE, greater, T, idxT, is_stable>::sort(&val, &idx);
    T& old = val_arr_[max_arr_len_ - 1];
    bool is_better;
    if constexpr (is_stable) {
      is_better =
          is_better_than<greater>(val, old, idx, idx_arr_[max_arr_len_ - 1]);
    } else {
      is_better = is_better_than<greater>(val, old);
    }
    if (is_better) {
      old = val;
      idx_arr_[max_arr_len_ - 1] = idx;
    }
    BitonicMerge<capacity, greater, !greater, T, idxT, is_stable>::merge(
        val_arr_, idx_arr_);
    set_k_th_();
  }
  using WarpSort<capacity, greater, T, idxT, is_stable>::max_arr_len_;
  using WarpSort<capacity, greater, T, idxT, is_stable>::val_arr_;
  using WarpSort<capacity, greater, T, idxT, is_stable>::idx_arr_;
  using WarpSort<capacity, greater, T, idxT, is_stable>::lane_;
  using WarpSort<capacity, greater, T, idxT, is_stable>::k_;
  using WarpSort<capacity, greater, T, idxT, is_stable>::dummy_;
  T* val_smem_;           // this warp's staging slice for values
  idxT* idx_smem_;        // this warp's staging slice for indices
  int smem_buf_len_ = 0;  // number of staged pairs
  T k_th_;                // current k-th best value (starts at dummy)
  // Index paired with k_th_. Initialised to 0 so that the first stable
  // comparison in add() reads a defined value: a candidate equal to the
  // initial threshold then needs an index below 0 to be staged.
  idxT k_th_idx_ = 0;
  int const k_th_lane_;   // lane that holds the k-th element in the last slot
 };  // end class WarpSelect
 }  // namespace warp_topk
 // Device-side conversion helper. The primary template relies on whatever
 // implicit conversion exists from T_IN to T_OUT.
 template <typename T_OUT, typename T_IN>
 __device__ inline T_OUT cuda_cast(T_IN val) {
  T_OUT const converted = val;
  return converted;
 }
 // bfloat16 -> float goes through the dedicated CUDA intrinsic.
 template <>
 __device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) {
  float const widened = __bfloat162float(val);
  return widened;
 }
 // Warp-cooperative helper: writes to *output the sum of the two largest
 // values among input[0 .. num_experts_per_group). Must be called by all 32
 // lanes of the warp (it uses warp-wide reductions and a full-mask ballot);
 // only lane 0 performs the store.
 template <typename T>
 __device__ void topk_with_k2(T* output, T const* input,
                             cg::thread_block_tile<32> const& tile,
                             int32_t const lane_id,
                             int const num_experts_per_group) {
  // Get the top2 per thread
  T largest = -INFINITY;
  T second_largest = -INFINITY;
  if (num_experts_per_group > WARP_SIZE) {
    // A lane sees several elements: track its own best two.
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
      T value = input[i];
      if (value > largest) {
        second_largest = largest;
        largest = value;
      } else if (value > second_largest) {
        second_largest = value;
      }
    }
  } else {
    // At most one element per lane; second_largest stays at -infinity and
    // lanes without an element keep largest at -infinity.
    for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) {
      largest = input[i];
    }
  }
  __syncwarp();  // Ensure all threads have valid data before reduction
  // Get the top2 warpwise
  T max1 = cg::reduce(tile, largest, cg::greater<T>());
  // If several lanes hold the maximum, the second-best equals the maximum.
  T max2 = max1;
  bool equal_to_max1 = (max1 == largest);
  int count_max1 = __popc(__ballot_sync(FULL_WARP_MASK, equal_to_max1));
  if (count_max1 == 1) {
    // Exactly one lane holds the maximum: that lane offers its runner-up
    // instead and the warp reduces again to find the second-best.
    largest = (largest == max1) ? second_largest : largest;
    max2 = cg::reduce(tile, largest, cg::greater<T>());
  }
  if (lane_id == 0) {
    *output = max1 + max2;
  }
 }
 // First routing stage: one warp per "case". For case c the warp reads
 // input[c * num_experts_per_group ...] and stores the sum of its two largest
 // entries to output[c] (see topk_with_k2). invokeNoAuxTc launches it with
 // num_cases = num_tokens * n_group, i.e. one case per (token, group) pair.
 // `num_tokens` and `n_group` are not used inside the kernel body.
 template <typename T>
 __global__ void topk_with_k2_kernel(T* output, T* input,
                                    int64_t const num_tokens,
                                    int64_t const num_cases,
                                    int64_t const n_group,
                                    int64_t const num_experts_per_group) {
  int32_t warp_id = threadIdx.x / WARP_SIZE;
  int32_t lane_id = threadIdx.x % WARP_SIZE;
  int32_t case_id = blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;
  // Warps past the last case do no work but still reach the trailing
  // launch_dependents instruction.
  if (case_id < num_cases) {
    input += case_id * num_experts_per_group;
    output += case_id;
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // Compiled for sm_90+ only: grid-dependency wait issued before the first
    // global-memory read (used together with the programmatic stream
    // serialization launch attribute set in invokeNoAuxTc).
    asm volatile("griddepcontrol.wait;");
 #endif
    topk_with_k2(output, input, tile, lane_id, num_experts_per_group);
  }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  // Compiled for sm_90+ only: signal that dependent grids may be launched.
  asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
 // Second routing stage: one warp per token.
 //   1. Pick the `topk_group` best groups from group_scores (ties at the
 //      threshold are admitted in group order up to a quota).
 //   2. Feed every expert of the selected groups (score taken from
 //      scores_with_bias, non-finite scores replaced by -infinity) into a
 //      warp-level top-k selector and keep the best `topk` expert indices.
 //   3. Read the un-biased `scores` of the selected experts, optionally
 //      renormalize them by their sum, scale by routed_scaling_factor and
 //      write topk_values / topk_indices.
 // If no finite group score is found, the fallback result is experts
 // 0 .. topk-1 with weight 1 / topk each.
 //
 // Dynamic shared memory is used both by the selector (see WarpSelect) and
 // for the per-warp result slices s_topk_idx / s_topk_value declared below.
 //
 // NOTE(review): queue.done() ends with __syncthreads() (see
 // WarpSelect::done) but is called inside a condition that depends on the
 // warp (case_id < num_tokens && if_proceed_next_topk), so not every warp of
 // a block necessarily reaches it. Confirm this is safe for partially filled
 // blocks and for tokens that take the fallback path.
 template <typename T, typename IdxT>
 __global__ void group_idx_and_topk_idx_kernel(
    T* scores, T const* group_scores, T* topk_values, IdxT* topk_indices,
    T* scores_with_bias, int64_t const num_tokens, int64_t const n_group,
    int64_t const topk_group, int64_t const topk, int64_t const num_experts,
    int64_t const num_experts_per_group, bool renormalize,
    double routed_scaling_factor) {
  int32_t warp_id = threadIdx.x / WARP_SIZE;
  int32_t lane_id = threadIdx.x % WARP_SIZE;
  int32_t case_id =
      blockIdx.x * NUM_WARPS_PER_BLOCK + warp_id;  // one per token
  // Advance all pointers to this token's row. This happens before the bounds
  // check; the pointers are only dereferenced when case_id < num_tokens.
  scores_with_bias += case_id * num_experts;
  scores += case_id * num_experts;
  group_scores += case_id * n_group;
  topk_values += case_id * topk;
  topk_indices += case_id * topk;
  // Loop bound rounded up to a full warp so that every lane calls queue.add()
  // the same number of times (add() uses warp collectives).
  int32_t align_num_experts_per_group =
      warp_topk::round_up_to_multiple_of<WARP_SIZE>(num_experts_per_group);
  cg::thread_block block = cg::this_thread_block();
  cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);
  extern __shared__ char smem_buf[];  // NOTE: reuse the shared memory here to
                                      // store the target topk idx
  // Layout: NUM_WARPS_PER_BLOCK * topk int32 indices, followed by
  // NUM_WARPS_PER_BLOCK * topk values of type T; each warp owns `topk`
  // consecutive entries of both arrays.
  int32_t* s_topk_idx = reinterpret_cast<int32_t*>(smem_buf);
  T* s_topk_value =
      reinterpret_cast<T*>(s_topk_idx + NUM_WARPS_PER_BLOCK * topk) +
      warp_id * topk;
  s_topk_idx += warp_id * topk;
  T value = kNegInfinity;
  // Score of the last group admitted by the selection loop; stays at
  // -infinity when no finite group score exists.
  T topk_group_value = kNegInfinity;
  // Only assigned (and only read) when case_id < num_tokens.
  int32_t num_equalto_topkth_group;
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  // Compiled for sm_90+ only. Everything above is pointer arithmetic and
  // register setup, so it is placed before the grid-dependency wait.
  asm volatile("griddepcontrol.wait;");
 #endif
  if (case_id < num_tokens) {
    // calculate group_idx
    // Lanes >= n_group (and lanes with a non-finite score) hold -infinity, so
    // WARP_SIZE - n_group lanes are "knocked out" from the start; the loop
    // below stops once topk_group more lanes have been knocked out.
    int32_t target_num_min = WARP_SIZE - n_group + topk_group;
    if (lane_id < n_group &&
        (isfinite(cuda_cast<float, T>(
            group_scores[lane_id]))))  // non-finite group scores are ignored
                                       // to guard against abnormal input
    {
      value = group_scores[lane_id];
    }
    int count_equal_to_top_value = WARP_SIZE - n_group;
    int pre_count_equal_to_top_value = 0;
    // Loop to find the largest remaining group score, knocking out every lane
    // that holds it, until enough lanes are at -infinity.
    while (count_equal_to_top_value < target_num_min) {
      __syncwarp();  // Ensure all threads have valid data before reduction
      topk_group_value = cg::reduce(tile, value, cg::greater<T>());
      if (value == topk_group_value) {
        value = kNegInfinity;
      }
      pre_count_equal_to_top_value = count_equal_to_top_value;
      count_equal_to_top_value = __popc(__ballot_sync(
          FULL_WARP_MASK, (value == cuda_cast<T, float>(kNegInfinity))));
    }
    // How many groups whose score equals the threshold may still be admitted.
    num_equalto_topkth_group = target_num_min - pre_count_equal_to_top_value;
  }
  __syncthreads();
  // Per-warp top-k selector: keeps the `topk` largest candidates, breaking
  // value ties by the smaller expert index.
  warp_topk::WarpSelect</*capacity*/ WARP_SIZE, /*greater*/ true, T, int32_t,
                        /* is_stable */ true>
      queue((int32_t)topk, -INFINITY);
  int count_equalto_topkth_group = 0;
  // False when the threshold is -infinity, i.e. no usable group score.
  bool if_proceed_next_topk =
      (topk_group_value != cuda_cast<T, float>(kNegInfinity));
  if (case_id < num_tokens && if_proceed_next_topk) {
    for (int i_group = 0; i_group < n_group; i_group++) {
      // Admit groups above the threshold, plus groups equal to it while the
      // tie quota lasts.
      if ((group_scores[i_group] > topk_group_value) ||
          ((group_scores[i_group] == topk_group_value) &&
           (count_equalto_topkth_group < num_equalto_topkth_group))) {
        int32_t offset = i_group * num_experts_per_group;
        for (int32_t i = lane_id; i < align_num_experts_per_group;
             i += WARP_SIZE) {
          // Padding lanes and non-finite scores are offered as -infinity.
          T candidates =
              (i < num_experts_per_group) && isfinite(cuda_cast<float, T>(
                                                 scores_with_bias[offset + i]))
                  ? scores_with_bias[offset + i]
                  : cuda_cast<T, float>(kNegInfinity);
          queue.add(candidates, offset + i);
        }
        if (group_scores[i_group] == topk_group_value) {
          count_equalto_topkth_group++;
        }
      }
    }
    queue.done();
    __syncwarp();
    // Get the topk_idx
    queue.dumpIdx(s_topk_idx);
    __syncwarp();
  }
  // Load the valid score value
  // Calculate the summation
  // The sum starts at a tiny positive value rather than zero.
  float topk_sum = 1e-20;
  if (case_id < num_tokens && if_proceed_next_topk) {
    // Bound rounded up to a full warp so every lane joins each reduction.
    for (int i = lane_id;
         i < warp_topk::round_up_to_multiple_of<WARP_SIZE>(topk);
         i += WARP_SIZE) {
      T value =
          i < topk
              ? scores[s_topk_idx[i]]
              : cuda_cast<T, float>(0.0f);  // Load the valid value of expert
      if (i < topk) {
        s_topk_value[i] = value;
      }
      topk_sum += reduce(tile, cuda_cast<float, T>(value), cg::plus<float>());
    }
  }
  __syncthreads();
  if (case_id < num_tokens) {
    if (if_proceed_next_topk) {
      for (int i = lane_id; i < topk; i += WARP_SIZE) {
        float value;
        if (renormalize) {
          value = cuda_cast<float, T>(s_topk_value[i]) / topk_sum *
                  routed_scaling_factor;
        } else {
          value = cuda_cast<float, T>(s_topk_value[i]) * routed_scaling_factor;
        }
        topk_indices[i] = s_topk_idx[i];
        topk_values[i] = cuda_cast<T, float>(value);
      }
    } else {
      for (int i = lane_id; i < topk; i += WARP_SIZE) {
        topk_indices[i] = i;
        topk_values[i] = cuda_cast<T, float>(1.0f / topk);
      }
    }
    // Note: when if_proceed_next_topk==false, the first `topk` experts are
    // chosen as the default result, each with weight 1 / topk.
  }
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  // Compiled for sm_90+ only: signal that dependent grids may be launched.
  asm volatile("griddepcontrol.launch_dependents;");
 #endif
 }
 template <typename T, typename IdxT>
 void invokeNoAuxTc(T* scores, T* group_scores, T* topk_values,
                   IdxT* topk_indices, T* scores_with_bias,
                   int64_t const num_tokens, int64_t const num_experts,
                   int64_t const n_group, int64_t const topk_group,
                   int64_t const topk, bool const renormalize,
                   double const routed_scaling_factor, bool enable_pdl = false,
                   cudaStream_t const stream = 0) {
  int64_t num_cases = num_tokens * n_group;
  int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1;
  auto* kernel_instance1 = &topk_with_k2_kernel<T>;
  cudaLaunchConfig_t config;
  config.gridDim = topk_with_k2_num_blocks;
  config.blockDim = BLOCK_SIZE;
  config.dynamicSmemBytes = 0;
  config.stream = stream;
  cudaLaunchAttribute attrs[1];
  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
  config.numAttrs = 1;
  config.attrs = attrs;
  cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores_with_bias,
                     num_tokens, num_cases, n_group, num_experts / n_group);
  int64_t topk_with_k_group_num_blocks =
      (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1;
  size_t dynamic_smem_in_bytes =
      warp_topk::calc_smem_size_for_block_wide<T, int32_t>(NUM_WARPS_PER_BLOCK,
                                                           topk);
  auto* kernel_instance2 = &group_idx_and_topk_idx_kernel<T, IdxT>;
  config.gridDim = topk_with_k_group_num_blocks;
  config.blockDim = BLOCK_SIZE;
  config.dynamicSmemBytes = dynamic_smem_in_bytes;
  config.stream = stream;
  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
  config.numAttrs = 1;
  config.attrs = attrs;
  cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores,
                     topk_values, topk_indices, scores_with_bias, num_tokens,
                     n_group, topk_group, topk, num_experts,
                     num_experts / n_group, renormalize, routed_scaling_factor);
 }
 // Explicit instantiations of invokeNoAuxTc for the element types dispatched
 // by grouped_topk (float, half, __nv_bfloat16), all with int32_t indices.
 #define INSTANTIATE_NOAUX_TC(T, IdxT)                                       \
  template void invokeNoAuxTc<T, IdxT>(                                     \
      T * scores, T * group_scores, T * topk_values, IdxT * topk_indices,   \
      T * scores_with_bias, int64_t const num_tokens,                       \
      int64_t const num_experts, int64_t const n_group,                     \
      int64_t const topk_group, int64_t const topk, bool const renormalize, \
      double const routed_scaling_factor, bool enable_pdl,                  \
      cudaStream_t const stream);
 INSTANTIATE_NOAUX_TC(float, int32_t);
 INSTANTIATE_NOAUX_TC(half, int32_t);
 INSTANTIATE_NOAUX_TC(__nv_bfloat16, int32_t);
 }  // end namespace moe
 }  // namespace vllm
 // Grouped top-k expert routing.
 //
 // Parameters:
 //   scores            - [num_tokens, num_experts] un-biased scores; the
 //                       returned weights are taken from here.
 //   scores_with_bias  - [num_tokens, num_experts] biased scores used to rank
 //                       groups and experts. Must have the same dtype, shape
 //                       and device as `scores`.
 //   n_group           - number of expert groups (1..32, divides num_experts).
 //   topk_group        - number of groups to keep per token (<= n_group).
 //   topk              - number of experts to select per token (<= 32).
 //   renormalize       - divide the selected weights by their sum.
 //   routed_scaling_factor - multiplier applied to the selected weights.
 //
 // Returns {topk_values [num_tokens, topk] (input dtype),
 //          topk_indices [num_tokens, topk] (int32)} on the input's device.
 //
 // Throws c10::Error for invalid arguments and std::invalid_argument for an
 // unsupported dtype (only float16, float32 and bfloat16 are handled).
 std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
    double routed_scaling_factor) {
  // Validate the rank before indexing sizes(); the previous revision read
  // sizes()[0] and sizes()[1] first.
  TORCH_CHECK(scores_with_bias.dim() == 2,
              "scores_with_bias must be a 2D Tensor");
  auto data_type = scores_with_bias.scalar_type();
  auto input_size = scores_with_bias.sizes();
  int64_t num_tokens = input_size[0];
  int64_t num_experts = input_size[1];

  // The kernels index both tensors as dense row-major buffers of one dtype.
  TORCH_CHECK(scores_with_bias.is_cuda() && scores.is_cuda(),
              "scores and scores_with_bias must be CUDA tensors");
  TORCH_CHECK(scores.device() == scores_with_bias.device(),
              "scores and scores_with_bias must be on the same device");
  TORCH_CHECK(scores.scalar_type() == data_type,
              "scores and scores_with_bias must have the same dtype");
  TORCH_CHECK(scores.sizes() == input_size,
              "scores and scores_with_bias must have the same shape");
  TORCH_CHECK(scores.is_contiguous() && scores_with_bias.is_contiguous(),
              "scores and scores_with_bias must be contiguous");

  // n_group == 0 would divide by zero below and in the launcher.
  TORCH_CHECK(n_group > 0, "n_group should be positive");
  TORCH_CHECK(num_experts % n_group == 0,
              "num_experts should be divisible by n_group");
  TORCH_CHECK(n_group <= 32,
              "n_group should be smaller than or equal to 32 for now");
  // With topk_group > n_group the group-selection loop in
  // group_idx_and_topk_idx_kernel can never reach its exit condition.
  TORCH_CHECK(topk_group <= n_group,
              "topk_group should be smaller than or equal to n_group");
  TORCH_CHECK(topk <= 32, "topk should be smaller than or equal to 32 for now");

  // Run on the input's device: make it current for the launches and allocate
  // every buffer there instead of on the default CUDA device.
  const c10::cuda::CUDAGuard device_guard(scores_with_bias.device());
  auto const value_options = scores_with_bias.options();
  torch::Tensor group_scores =
      torch::empty({num_tokens, n_group}, value_options);
  torch::Tensor topk_values = torch::empty({num_tokens, topk}, value_options);
  torch::Tensor topk_indices =
      torch::empty({num_tokens, topk}, value_options.dtype(torch::kInt32));

  auto stream = c10::cuda::getCurrentCUDAStream(scores_with_bias.get_device());

  switch (data_type) {
    case torch::kFloat16:
      // Handle Float16
      vllm::moe::invokeNoAuxTc<half, int32_t>(
          static_cast<half*>(scores.mutable_data_ptr()),
          static_cast<half*>(group_scores.mutable_data_ptr()),
          static_cast<half*>(topk_values.mutable_data_ptr()),
          static_cast<int32_t*>(topk_indices.mutable_data_ptr()),
          static_cast<half*>(scores_with_bias.data_ptr()), num_tokens,
          num_experts, n_group, topk_group, topk, renormalize,
          routed_scaling_factor, false, stream);
      break;
    case torch::kFloat32:
      // Handle Float32
      vllm::moe::invokeNoAuxTc<float, int32_t>(
          static_cast<float*>(scores.mutable_data_ptr()),
          static_cast<float*>(group_scores.mutable_data_ptr()),
          static_cast<float*>(topk_values.mutable_data_ptr()),
          static_cast<int32_t*>(topk_indices.mutable_data_ptr()),
          static_cast<float*>(scores_with_bias.data_ptr()), num_tokens,
          num_experts, n_group, topk_group, topk, renormalize,
          routed_scaling_factor, false, stream);
      break;
    case torch::kBFloat16:
      // Handle BFloat16
      vllm::moe::invokeNoAuxTc<__nv_bfloat16, int32_t>(
          static_cast<__nv_bfloat16*>(scores.mutable_data_ptr()),
          static_cast<__nv_bfloat16*>(group_scores.mutable_data_ptr()),
          static_cast<__nv_bfloat16*>(topk_values.mutable_data_ptr()),
          static_cast<int32_t*>(topk_indices.mutable_data_ptr()),
          static_cast<__nv_bfloat16*>(scores_with_bias.data_ptr()),
          num_tokens, num_experts, n_group, topk_group, topk, renormalize,
          routed_scaling_factor, false, stream);
      break;
    default:
      // Handle other data types
      throw std::invalid_argument(
          "Invalid dtype, only supports float16, float32, and bfloat16");
      break;
  }
  return {topk_values, topk_indices};
 }
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@ -22,11 +22,6 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor num_tokens_post_pad, int64_t top_k,
                             int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
                             int64_t BLOCK_SIZE_K, int64_t bit);
 std::tuple<torch::Tensor, torch::Tensor> grouped_topk(
    torch::Tensor const& scores, torch::Tensor const& scores_with_bias,
    int64_t n_group, int64_t topk_group, int64_t topk, bool renormalize,
    double routed_scaling_factor);
 #endif
 bool moe_permute_unpermute_supported();
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@ -45,6 +45,8 @@ void moe_permute(
  auto copy_topk_ids = topk_ids.clone();  // copy topk_ids for preprocess
  auto permuted_experts_id = torch::empty_like(topk_ids);
  auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
  auto align_expert_first_token_offset =
      torch::zeros_like(expert_first_token_offset);
  CubKeyValueSorter sorter{};
  int64_t* valid_num_ptr = nullptr;
@ -83,14 +85,12 @@ void moe_permute(
  });
  // get m_indices and update expert_first_token_offset with align block
-  // this is only required for DeepGemm and not required for CUTLASS group gemm
+  getMIndices(get_ptr<int64_t>(expert_first_token_offset),
              get_ptr<int64_t>(align_expert_first_token_offset),
              get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
              stream);
  if (align_block_size.has_value()) {
-    auto align_expert_first_token_offset =
+    // update align_expert_first_token_offset
        torch::zeros_like(expert_first_token_offset);
    getMIndices(get_ptr<int64_t>(expert_first_token_offset),
                get_ptr<int64_t>(align_expert_first_token_offset),
                get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
                stream);
    expert_first_token_offset.copy_(align_expert_first_token_offset);
  }
 }
@ -195,14 +195,19 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
                 torch::Tensor& expert_first_token_offset,
                 torch::Tensor& src_row_id2dst_row_id_map,
                 torch::Tensor& m_indices) {
-  TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0");
+  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
 }
-void moe_unpermute(
+void moe_unpermute(const torch::Tensor& input,
-    const torch::Tensor& permuted_hidden_states,
+                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-    const torch::Tensor& topk_weights, const torch::Tensor& inv_permuted_idx,
+                   const torch::Tensor& token_expert_indices,
-    const std::optional<torch::Tensor>& expert_first_token_offset, int64_t topk,
+                   const std::optional<torch::Tensor>& expert_map,
-    torch::Tensor& hidden_states) {
+                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
                   const std::optional<int64_t>& align_block_size,
                   torch::Tensor& permuted_input,
                   torch::Tensor& expert_first_token_offset,
                   torch::Tensor& src_row_id2dst_row_id_map,
                   torch::Tensor& m_indices) {
  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
 }
@ -219,4 +224,4 @@ bool moe_permute_unpermute_supported() {
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("moe_permute", &moe_permute);
  m.impl("moe_unpermute", &moe_unpermute);
-}
+}
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@ -573,7 +573,7 @@ void topk_softmax(
            stream);
    }
    else {
-        TORCH_CHECK(topk_indices.scalar_type() == at::ScalarType::Long);
+        assert(topk_indices.scalar_type() == at::ScalarType::Int64);
        vllm::moe::topkGatingSoftmaxKernelLauncher(
            gating_output.data_ptr<float>(),
            topk_weights.data_ptr<float>(),
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@ -78,12 +78,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "output_tensor) -> ()");
  m.impl("shuffle_rows", torch::kCUDA, &shuffle_rows);
  // Apply grouped topk routing to select experts.
  m.def(
      "grouped_topk(Tensor scores, Tensor scores_with_bias, int n_group, int "
      "topk_group, int topk, bool renormalize, float "
      "routed_scaling_factor) -> (Tensor, Tensor)");
  m.impl("grouped_topk", torch::kCUDA, &grouped_topk);
 #endif
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -92,9 +92,6 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);
 void poly_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
               torch::Tensor& bias, double epsilon);
 void apply_repetition_penalties_(torch::Tensor& logits,
                                 const torch::Tensor& prompt_mask,
                                 const torch::Tensor& output_mask,
@ -133,13 +130,6 @@ void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
 void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input,
                        torch::Tensor& scale);
 #ifndef USE_ROCM
 void silu_and_mul_nvfp4_quant(torch::Tensor& out,
                              torch::Tensor& output_block_scale,
                              torch::Tensor& input,
                              torch::Tensor& input_global_scale);
 #endif
 void mul_and_silu(torch::Tensor& out, torch::Tensor& input);
 void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);
@ -239,11 +229,6 @@ void get_cutlass_moe_mm_data(
    const int64_t num_experts, const int64_t n, const int64_t k,
    const std::optional<torch::Tensor>& blockscale_offsets);
 void get_cutlass_moe_mm_problem_sizes(
    const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1,
    torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n,
    const int64_t k, const std::optional<torch::Tensor>& blockscale_offsets);
 void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets,
                                  torch::Tensor& problem_sizes1,
                                  torch::Tensor& problem_sizes2,
@ -356,4 +341,4 @@ void qr_open_handles(fptr_t _fa, const std::vector<torch::Tensor>& handles);
 void qr_all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                   int64_t quant_level, bool cast_bf2half = false);
 int64_t qr_max_size();
-#endif
+#endif
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
@ -1,424 +0,0 @@
 //
 // Based off of:
 //   https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu
 //
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <torch/all.h>
 #include "cutlass_extensions/torch_utils.hpp"
 #include "core/registration.h"
 #include "cutlass/cutlass.h"
 #include <limits>
 #include "cute/tensor.hpp"
 #include "cutlass/gemm/collective/collective_builder.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"
 #include "cutlass/gemm/device/gemm_universal_adapter.h"
 #include "cutlass/gemm/kernel/gemm_universal.hpp"
 #include "cutlass/util/packed_stride.hpp"
 #include "cutlass/util/mixed_dtype_utils.hpp"
 #include "cutlass_extensions/common.hpp"
 #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
 namespace vllm::cutlass_w4a8 {
 using namespace cute;
 // -------------------------------------------------------------------------------------
 // Static configuration shared across all instantiations
 // -------------------------------------------------------------------------------------
 // Element types and packing constants used by every kernel instantiation below.
 using MmaType = cutlass::float_e4m3_t;  // A/scale element type
 using QuantType = cutlass::int4b_t;     // B element type (packed int4)
 // K extent appended to every tile shape (see W4A8GemmKernel::TileShape): the
 // number of MmaType elements in 128 bytes, i.e. 128 * 8 bits divided by the
 // element width in bits.
 static int constexpr TileShapeK = 128 * 8 / sizeof_bits<MmaType>::value;
 static int constexpr ScalePackSize = 8;  // pack 8 scale elements together
 static int constexpr PackFactor = 8;     // 8 4-bit packed into int32
 // A matrix configuration
 using ElementA = MmaType;                   // Element type for A matrix operand
 using LayoutA = cutlass::layout::RowMajor;  // Layout type for A matrix operand
 // Transposed counterpart of LayoutA. This, not LayoutA, is the layout handed
 // to the mainloop builder in W4A8GemmKernel, which runs with A and B swapped.
 using LayoutA_Transpose =
    typename cutlass::layout::LayoutTranspose<LayoutA>::type;
 constexpr int AlignmentA =
    128 / cutlass::sizeof_bits<
              ElementA>::value;  // Memory access granularity/alignment of A
                                 // matrix in units of elements (up to 16 bytes)
 // Stride type for A; W4A8GemmKernel::mm fills it from the shape (m, k, 1).
 using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
 // B matrix configuration
 using ElementB = QuantType;  // Element type for B matrix operand
 using LayoutB =
    cutlass::layout::ColumnMajor;  // Layout type for B matrix operand
 // Transposed counterpart of LayoutB. NOTE(review): not referenced anywhere
 // else in this file.
 using LayoutB_Transpose =
    typename cutlass::layout::LayoutTranspose<LayoutB>::type;
 constexpr int AlignmentB =
    128 / cutlass::sizeof_bits<
              ElementB>::value;  // Memory access granularity/alignment of B
                                 // matrix in units of elements (up to 16 bytes)
 using StrideB = cutlass::detail::TagToStrideB_t<LayoutB>;
 // Define the CuTe layout for reordered quantized tensor B
 // LayoutAtomQuant places values that will be read by the same thread in
 // contiguous locations in global memory. It specifies the reordering within a
 // single warp's fragment
 using LayoutAtomQuant =
    decltype(cutlass::compute_memory_reordering_atom<MmaType>());
 // Type of the atom tiled over a dynamic (int, int, int) shape. Concrete
 // instances are built from the runtime shape (n, k, 1) in
 // W4A8GemmKernel::mm and encode_and_reorder_int4b.
 using LayoutB_Reordered = decltype(cute::tile_to_shape(
    LayoutAtomQuant{}, Layout<Shape<int, int, int>, StrideB>{}));
 // Group-wise scales
 using ElementScale = MmaType;
 // NOTE(review): LayoutScale is not referenced anywhere else in this file.
 using LayoutScale = cutlass::layout::RowMajor;
 // Per-tok, per-chan scales
 using ElementSChannel = float;
 // C/D matrix configuration
 using ElementC =
    cutlass::bfloat16_t;  // Element type for C and D matrix operands
 using LayoutC =
    cutlass::layout::RowMajor;  // Layout type for C and D matrix operands
 constexpr int AlignmentC =
    128 / cutlass::sizeof_bits<
              ElementC>::value;  // Memory access granularity/alignment of C
                                 // matrix in units of elements (up to 16 bytes)
 // D reuses C's element type and layout, so its alignment comes out the same.
 using ElementD = ElementC;
 using LayoutD = LayoutC;
 constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
 // Core kernel configurations
 using ElementAccumulator = float;     // Element type for internal accumulation
 // NOTE(review): ElementCompute is not referenced anywhere else in this file.
 using ElementCompute = float;         // Element type for epilogue computation
 using ArchTag = cutlass::arch::Sm90;  // Tag indicating the minimum SM that
                                      // supports the intended feature
 using OperatorClass = cutlass::arch::OpClassTensorOp;  // Operator class tag
 using KernelSchedule =
    cutlass::gemm::KernelTmaWarpSpecializedCooperative;  // Kernel to launch
                                                         // based on the default
                                                         // setting in the
                                                         // Collective Builder
 // Epilogue schedule and tile type handed to the epilogue CollectiveBuilder in
 // W4A8GemmKernel.
 using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecializedCooperative;
 using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto;
 // ----------------------------------------------------------------------------
 // Kernel template — Tile/Cluster shapes
 // ----------------------------------------------------------------------------
 // GEMM wrapper for one tile / cluster configuration. A is read as MmaType and
 // B as reordered, packed QuantType with packed group scales. The kernel is
 // built and launched with the A and B operands swapped: B is passed as the
 // first mainloop operand and the problem shape is {n, m, k, 1}.
 //
 // Template parameters:
 //   TileShape_MN     - first two tile extents; TileShapeK is appended as the
 //                      third.
 //   ClusterShape_MNK - cluster shape forwarded to both collective builders.
 template <class TileShape_MN, class ClusterShape_MNK>
 struct W4A8GemmKernel {
  // Full tile shape: TileShape_MN with the fixed K extent appended.
  using TileShape =
      decltype(cute::append(TileShape_MN{}, cute::Int<TileShapeK>{}));
  using ClusterShape = ClusterShape_MNK;
  // Epilogue per-tok, per-chan scales
  using ChTokScalesEpilogue =
      typename vllm::c3x::ScaledEpilogue<ElementAccumulator, ElementD,
                                         TileShape>;
  using EVTCompute = typename ChTokScalesEpilogue::EVTCompute;
  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType,
          ElementAccumulator, ElementSChannel,
          // Transpose layout of D here since we use explicit swap + transpose
          // the void type for C tells the builder to allocate 0 smem for the C
          // matrix. We can enable this if beta == 0 by changing ElementC to
          // void below.
          ElementC, typename cutlass::layout::LayoutTranspose<LayoutC>::type,
          AlignmentC, ElementD,
          typename cutlass::layout::LayoutTranspose<LayoutD>::type, AlignmentD,
          EpilogueSchedule,  // This is the only epi supporting the required
                             // swap + transpose.
          EVTCompute>::CollectiveOp;
  // The Scale information must get paired with the operand that will be scaled.
  // In this example, B is scaled so we make a tuple of B's information and the
  // scale information.
  // The stage count is chosen automatically after carving out the epilogue's
  // shared storage size.
  using CollectiveMainloopShuffled =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag, OperatorClass,
          cute::tuple<ElementB, cutlass::Array<ElementScale, ScalePackSize>>,
          LayoutB_Reordered, AlignmentB, ElementA, LayoutA_Transpose,
          AlignmentA, ElementAccumulator, TileShape, ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          KernelSchedule>::CollectiveOp;
  using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal<
      Shape<int, int, int, int>,  // Indicates ProblemShape
      CollectiveMainloopShuffled, CollectiveEpilogue>;
  using GemmShuffled =
      cutlass::gemm::device::GemmUniversalAdapter<GemmKernelShuffled>;
  // Stride types taken from the assembled kernel / mainloop.
  using StrideC = typename GemmKernelShuffled::StrideC;
  using StrideD = typename GemmKernelShuffled::StrideD;
  using StrideS = typename CollectiveMainloopShuffled::StrideScale;
  // Runs the GEMM and returns a freshly allocated {m, n} tensor on A's device
  // whose dtype is the torch equivalent of ElementD.
  //
  //   A              - sizes are read as (m, k); data is read as MmaType.
  //   B              - already encoded/reordered; size(1) is read as n and the
  //                    data is read as QuantType in the reordered layout.
  //   group_scales   - already packed; read as arrays of ScalePackSize
  //                    ElementScale values.
  //   group_size     - must be positive and fit in an int; k is divided by it
  //                    (rounding up) to get the scale extent along k.
  //   channel_scales,
  //   token_scales   - forwarded to ChTokScalesEpilogue::prepare_args.
  //   maybe_out_type - NOTE(review): never read in this function; the output
  //                    dtype is fixed by ElementD.
  //
  // Fails through TORCH_CHECK on a bad group_size and through CUTLASS_CHECK
  // when CUTLASS rejects, fails to initialize, or fails to run the problem.
  static torch::Tensor mm(torch::Tensor const& A,
                          torch::Tensor const& B,             // already packed
                          torch::Tensor const& group_scales,  // already packed
                          int64_t group_size,
                          torch::Tensor const& channel_scales,
                          torch::Tensor const& token_scales,
                          std::optional<at::ScalarType> const& maybe_out_type) {
    // TODO: param validation
    // NOTE(review): dtypes, ranks and devices of the inputs are not checked,
    // and the sizes below are narrowed to int without a range check.
    int m = A.size(0);
    int k = A.size(1);
    int n = B.size(1);
    // safely cast group_size to int
    TORCH_CHECK(group_size > 0 && group_size <= std::numeric_limits<int>::max(),
                "group_size out of supported range for int: ", group_size);
    int const group_size_int = static_cast<int>(group_size);
    // Allocate output
    // The guard makes A's device current for the rest of this function.
    const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
    auto device = A.device();
    auto stream = at::cuda::getCurrentCUDAStream(device.index());
    torch::Tensor D =
        torch::empty({m, n}, torch::TensorOptions()
                                 .dtype(equivalent_scalar_type_v<ElementD>)
                                 .device(device));
    // prepare arg pointers
    auto A_ptr = static_cast<MmaType const*>(A.const_data_ptr());
    auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr());
    auto D_ptr = static_cast<ElementD*>(D.data_ptr());
    // Group scales are consumed in packs of ScalePackSize elements.
    auto S_ptr =
        static_cast<cutlass::Array<ElementScale, ScalePackSize> const*>(
            group_scales.const_data_ptr());
    // runtime layout for B
    auto shape_B = cute::make_shape(n, k, 1);
    LayoutB_Reordered layout_B_reordered =
        cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
    // strides
    // Number of scale groups along k, rounding up.
    int const scale_k = cutlass::ceil_div(k, group_size_int);
    StrideA stride_A =
        cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
    // Reverse stride here due to swap and transpose
    StrideD stride_D =
        cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(n, m, 1));
    StrideS stride_S = cutlass::make_cute_packed_stride(
        StrideS{}, cute::make_shape(n, scale_k, 1));
    // Populate the Gemm::Arguments structure.
    // A and B are swapped here, and so are the problem extents: B and its
    // scales go first in the mainloop arguments and the shape is {n, m, k, 1}.
    using Args = typename GemmShuffled::Arguments;
    using MainloopArguments = typename GemmKernelShuffled::MainloopArguments;
    using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments;
    MainloopArguments mainloop_arguments{
        B_ptr, layout_B_reordered, A_ptr,         stride_A,
        S_ptr, stride_S,           group_size_int};
    EpilogueArguments epilogue_arguments{
        ChTokScalesEpilogue::prepare_args(channel_scales, token_scales),
        nullptr,
        {},  // no C
        D_ptr,
        stride_D};
    Args arguments{cutlass::gemm::GemmUniversalMode::kGemm,
                   {n, m, k, 1},  // shape
                   mainloop_arguments,
                   epilogue_arguments};
    // Workspace
    // Byte buffer of the size CUTLASS requests for these arguments.
    size_t workspace_size = GemmShuffled::get_workspace_size(arguments);
    torch::Tensor workspace =
        torch::empty(workspace_size,
                     torch::TensorOptions().dtype(torch::kU8).device(device));
    // Run GEMM
    GemmShuffled gemm;
    CUTLASS_CHECK(gemm.can_implement(arguments));
    CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream));
    CUTLASS_CHECK(gemm.run(stream));
    return D;
  }
 };
 // ----------------------------------------------------------------------------
 // Kernel instantiations and dispatch logic
 // ----------------------------------------------------------------------------
 // Naming: Kernel_<T0>x<T1>_<C0>x<C1>x<C2>, where <T0>x<T1> is the
 // TileShape_MN argument and <C0>x<C1>x<C2> the ClusterShape_MNK argument of
 // W4A8GemmKernel. The part after "Kernel_" is the schedule string that
 // mm_dispatch maps to the alias.
 // NOTE(review): W4A8GemmKernel::mm passes the problem shape as {n, m, k, 1},
 // so the first tile extent presumably tiles n rather than m; confirm.
 using Kernel_256x128_1x1x1 =
    W4A8GemmKernel<Shape<_256, _128>, Shape<_1, _1, _1>>;
 using Kernel_256x64_1x1x1 = W4A8GemmKernel<Shape<_256, _64>, Shape<_1, _1, _1>>;
 using Kernel_256x32_1x1x1 = W4A8GemmKernel<Shape<_256, _32>, Shape<_1, _1, _1>>;
 using Kernel_256x16_1x1x1 = W4A8GemmKernel<Shape<_256, _16>, Shape<_1, _1, _1>>;
 using Kernel_128x256_2x1x1 =
    W4A8GemmKernel<Shape<_128, _256>, Shape<_2, _1, _1>>;
 using Kernel_128x256_1x1x1 =
    W4A8GemmKernel<Shape<_128, _256>, Shape<_1, _1, _1>>;
 using Kernel_128x128_1x1x1 =
    W4A8GemmKernel<Shape<_128, _128>, Shape<_1, _1, _1>>;
 using Kernel_128x64_1x1x1 = W4A8GemmKernel<Shape<_128, _64>, Shape<_1, _1, _1>>;
 using Kernel_128x32_1x1x1 = W4A8GemmKernel<Shape<_128, _32>, Shape<_1, _1, _1>>;
 using Kernel_128x16_1x1x1 = W4A8GemmKernel<Shape<_128, _16>, Shape<_1, _1, _1>>;
 // Runs the GEMM with the kernel instantiation selected by `schedule`.
 //
 // `schedule` must be one of the strings in the table below (the alias name
 // without its "Kernel_" prefix); any other value fails through TORCH_CHECK
 // with "Unknown W4A8 schedule: <schedule>". All other arguments are forwarded
 // unchanged to the selected W4A8GemmKernel<...>::mm, whose result is returned.
 torch::Tensor mm_dispatch(torch::Tensor const& A,
                           torch::Tensor const& B,             // already packed
                           torch::Tensor const& group_scales,  // already packed
                           int64_t group_size,
                           torch::Tensor const& channel_scales,
                           torch::Tensor const& token_scales,
                           std::optional<at::ScalarType> const& maybe_out_type,
                           const std::string& schedule) {
  // Common signature of every W4A8GemmKernel<...>::mm instantiation.
  using KernelEntryPoint = torch::Tensor (*)(
      torch::Tensor const&, torch::Tensor const&, torch::Tensor const&, int64_t,
      torch::Tensor const&, torch::Tensor const&,
      std::optional<at::ScalarType> const&);
  struct ScheduleEntry {
    char const* label;     // schedule string accepted from the caller
    KernelEntryPoint run;  // kernel instantiation it selects
  };
  // Listed in the order the schedules are tested.
  static constexpr ScheduleEntry kSchedules[] = {
      {"256x128_1x1x1", &Kernel_256x128_1x1x1::mm},
      {"256x64_1x1x1", &Kernel_256x64_1x1x1::mm},
      {"256x32_1x1x1", &Kernel_256x32_1x1x1::mm},
      {"256x16_1x1x1", &Kernel_256x16_1x1x1::mm},
      {"128x256_2x1x1", &Kernel_128x256_2x1x1::mm},
      {"128x256_1x1x1", &Kernel_128x256_1x1x1::mm},
      {"128x128_1x1x1", &Kernel_128x128_1x1x1::mm},
      {"128x64_1x1x1", &Kernel_128x64_1x1x1::mm},
      {"128x32_1x1x1", &Kernel_128x32_1x1x1::mm},
      {"128x16_1x1x1", &Kernel_128x16_1x1x1::mm},
  };
  for (ScheduleEntry const& entry : kSchedules) {
    if (schedule == entry.label) {
      return entry.run(A, B, group_scales, group_size, channel_scales,
                       token_scales, maybe_out_type);
    }
  }
  TORCH_CHECK(false, "Unknown W4A8 schedule: ", schedule);
  return {};
 }
 // Entry point for the W4A8 GEMM op.
 //
 // When `maybe_schedule` holds a value, that schedule is used as given.
 // Otherwise a schedule is picked from the problem extents (M and K from A's
 // first two sizes, N from B's second size) by the table below. Everything is
 // then forwarded to mm_dispatch, whose result is returned.
 torch::Tensor mm(torch::Tensor const& A,
                  torch::Tensor const& B,             // already packed
                  torch::Tensor const& group_scales,  // already packed
                  int64_t group_size, torch::Tensor const& channel_scales,
                  torch::Tensor const& token_scales,
                  std::optional<at::ScalarType> const& maybe_out_type,
                  std::optional<std::string> maybe_schedule) {
  // Caller pinned a schedule: no heuristic involved.
  if (maybe_schedule.has_value()) {
    return mm_dispatch(A, B, group_scales, group_size, channel_scales,
                       token_scales, maybe_out_type, *maybe_schedule);
  }
  int const rows = A.size(0);
  int const depth = A.size(1);
  int const cols = B.size(1);
  // One specific (K, N) pair gets the 256-wide tiles for small row counts.
  bool const is_special_kn = (depth == 16384 && cols == 18432);
  // Heuristic: buckets are tested from the smallest row count upwards.
  auto const choose_schedule = [&]() -> char const* {
    if (rows <= 16) {
      return is_special_kn ? "256x16_1x1x1" : "128x16_1x1x1";
    }
    if (rows <= 32) {
      return is_special_kn ? "256x32_1x1x1" : "128x32_1x1x1";
    }
    if (rows <= 64) {
      if (is_special_kn) {
        return "256x64_1x1x1";
      }
      return (cols <= 8192 && depth <= 8192) ? "128x32_1x1x1" : "128x64_1x1x1";
    }
    if (rows <= 128) {
      if (is_special_kn) {
        return "256x128_1x1x1";
      }
      return (cols <= 8192) ? "128x64_1x1x1" : "128x128_1x1x1";
    }
    if (rows <= 256) {
      if (cols <= 4096) {
        return "128x64_1x1x1";
      }
      return (cols <= 8192) ? "128x128_1x1x1" : "128x256_1x1x1";
    }
    if (rows <= 512 && cols <= 4096) {
      return "128x128_1x1x1";
    }
    if (rows <= 1024) {
      return "128x256_1x1x1";
    }
    return "128x256_2x1x1";
  };
  std::string const schedule = choose_schedule();
  return mm_dispatch(A, B, group_scales, group_size, channel_scales,
                     token_scales, maybe_out_type, schedule);
 }
 // ----------------------------------------------------------------------------
 // Pre-processing utils
 // ----------------------------------------------------------------------------
 // Packs fp8 (e4m3) scales with cutlass::pack_scale_fp8 into the
 // cutlass::Array<ElementScale, ScalePackSize> form that W4A8GemmKernel::mm
 // reads its group scales as.
 //
 //   scales  - contiguous CUDA tensor of dtype torch::kFloat8_e4m3fn.
 //   returns - new 1-D tensor with scales.numel() * ScalePackSize elements,
 //             same dtype and device as `scales`.
 //
 // Fails through TORCH_CHECK when `scales` has another dtype, is not
 // contiguous, or is not a CUDA tensor.
 torch::Tensor pack_scale_fp8(torch::Tensor const& scales) {
  TORCH_CHECK(scales.dtype() == torch::kFloat8_e4m3fn);
  TORCH_CHECK(scales.is_contiguous());
  TORCH_CHECK(scales.is_cuda());
  // Make the tensor's device current, as W4A8GemmKernel::mm does. The CUTLASS
  // helper below takes no device or stream argument, so any CUDA work it
  // issues targets whichever device is current.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(scales));
  auto packed_scales = torch::empty(
      {scales.numel() * ScalePackSize},
      torch::TensorOptions().dtype(scales.dtype()).device(scales.device()));
  auto scales_ptr = static_cast<MmaType const*>(scales.const_data_ptr());
  auto packed_scales_ptr =
      static_cast<cutlass::Array<ElementScale, ScalePackSize>*>(
          packed_scales.data_ptr());
  cutlass::pack_scale_fp8(scales_ptr, packed_scales_ptr, scales.numel());
  return packed_scales;
 }
 // Encodes packed int4 values with cutlass::unified_encode_int4b and then
 // reorders them in place into LayoutB_Reordered, the layout
 // W4A8GemmKernel::mm expects for B.
 //
 //   B       - 2-D int32 tensor; each int32 holds PackFactor 4-bit values, so
 //             the logical extents are k = B.size(0) * PackFactor and
 //             n = B.size(1).
 //   returns - new tensor from torch::empty_like(B) holding the encoded,
 //             reordered data.
 //
 // NOTE(review): the raw buffer is read as an (n, k) row-major int4 array;
 // B's strides are not checked. Confirm callers pass B in that memory order.
 //
 // Fails through TORCH_CHECK when B is not int32 or not 2-D.
 torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) {
  TORCH_CHECK(B.dtype() == torch::kInt32);
  TORCH_CHECK(B.dim() == 2);
  // Make the tensor's device current, as W4A8GemmKernel::mm does. The CUTLASS
  // helpers below take no device or stream argument, so any CUDA work they
  // issue targets whichever device is current.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(B));
  torch::Tensor B_packed = torch::empty_like(B);
  int k = B.size(0) * PackFactor;  // logical k
  int n = B.size(1);
  // Total int4 element count, multiplied in size_t so a large n * k cannot
  // overflow int before it is passed on as a size.
  size_t const num_int4 = static_cast<size_t>(n) * static_cast<size_t>(k);
  auto B_ptr = static_cast<QuantType const*>(B.const_data_ptr());
  auto B_packed_ptr = static_cast<QuantType*>(B_packed.data_ptr());
  auto shape_B = cute::make_shape(n, k, 1);
  auto layout_B = make_layout(shape_B, LayoutRight{});  // row major
  LayoutB_Reordered layout_B_reordered =
      cute::tile_to_shape(LayoutAtomQuant{}, shape_B);
  cutlass::unified_encode_int4b(B_ptr, B_packed_ptr, num_int4);
  cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered);
  return B_packed;
 }
 // Registers this file's functions as the CUDA implementations of the three
 // W4A8 ops. Only implementations are bound here; the op schemas are defined
 // outside this file.
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  // GEMM entry point; picks a schedule heuristically when none is given.
  m.impl("cutlass_w4a8_mm", &mm);
  // Pre-processing helpers that produce the packed scales and reordered B.
  m.impl("cutlass_pack_scale_fp8", &pack_scale_fp8);
  m.impl("cutlass_encode_and_reorder_int4b", &encode_and_reorder_int4b);
 }
 }  // namespace vllm::cutlass_w4a8
--- a/Show More
+++ b/Show More
`@ -1,2 +1 @@`
	`collect_env.py`	`collect_env.py`
	`vllm/model_executor/layers/fla/ops/*.py`